[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: kernel/137: vm hangs doing (while true; do grep xyzzy /dev/zero & done)



Could you please review these patches carefully? They solve PR# 137 on my
machine (i386, OpenBSD 2.8). How I tested them is explained below.
I can't test them on other architectures.

Mathias Schmocker <smat_(_at_)_acm_(_dot_)_org>

--- output of cvs diff -c *.h and *.c in /sys/uvm
Index: uvm.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm.h,v
retrieving revision 1.5
diff -c -r1.5 uvm.h
*** uvm.h	2000/05/27 21:06:08	1.5
--- uvm.h	2000/12/30 18:37:39
***************
*** 110,115 ****
--- 110,123 ----
  
  	/* kernel object: to support anonymous pageable kernel memory */
  	struct uvm_object *kernel_object;
+ 
+ #ifdef UVM_LOG_CHOKE
+ 	/* Data sent to log(9) after surviving a memory and swap exhaustion. */
+ 	pid_t	mem_hog_pid;			/* Process identifier. */
+ 	uid_t	mem_hog_ruid;			/* Real user id. */
+ 	gid_t	mem_hog_rgid;			/* Real group id. */
+ 	char	mem_hog_name[MAXCOMLEN+1];	/* Copy of p->p_comm */
+ #endif
  };
  
  extern struct uvm uvm;
Index: uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.5
diff -c -r1.5 uvm_fault.c
*** uvm_fault.c	2000/03/16 22:11:04	1.5
--- uvm_fault.c	2000/12/30 18:37:55
***************
*** 1171,1181 ****
  				uvm_anfree(anon);
  			uvmfault_unlockall(&ufi, amap, uobj, oanon);
  #ifdef DIAGNOSTIC
! 			if (uvmexp.swpgonly > uvmexp.swpages) {
! 				panic("uvmexp.swpgonly botch");
  			}
  #endif
! 			if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) {
  				UVMHIST_LOG(maphist,
  				    "<- failed.  out of VM",0,0,0,0);
  				uvmexp.fltnoanon++;
--- 1171,1184 ----
  				uvm_anfree(anon);
  			uvmfault_unlockall(&ufi, amap, uobj, oanon);
  #ifdef DIAGNOSTIC
! 			if (uvmexp.swpginuse > uvmexp.swpages) {
! 				panic("uvmexp.swpginuse botch");
  			}
  #endif
! 			/* XXX handle n swap devices better */
! 			if (anon == NULL ||
! 			    uvmexp.nswapdev * atop(round_page(USPACE)) >=
! 			    uvmexp.swpages - uvmexp.swpginuse) {
  				UVMHIST_LOG(maphist,
  				    "<- failed.  out of VM",0,0,0,0);
  				uvmexp.fltnoanon++;
***************
*** 1576,1586 ****
  			/* unlock and fail ... */
  			uvmfault_unlockall(&ufi, amap, uobj, NULL);
  #ifdef DIAGNOSTIC
! 			if (uvmexp.swpgonly > uvmexp.swpages) {
! 				panic("uvmexp.swpgonly botch");
  			}
  #endif
! 			if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) {
  				UVMHIST_LOG(maphist, "  promote: out of VM",
  				    0,0,0,0);
  				uvmexp.fltnoanon++;
--- 1579,1592 ----
  			/* unlock and fail ... */
  			uvmfault_unlockall(&ufi, amap, uobj, NULL);
  #ifdef DIAGNOSTIC
! 			if (uvmexp.swpginuse > uvmexp.swpages) {
! 				panic("uvmexp.swpginuse botch");
  			}
  #endif
! 			/* XXX handle n swap devices better */
! 			if (anon == NULL ||
! 			    uvmexp.nswapdev * atop(round_page(USPACE)) >=
! 			    uvmexp.swpages - uvmexp.swpginuse) {
  				UVMHIST_LOG(maphist, "  promote: out of VM",
  				    0,0,0,0);
  				uvmexp.fltnoanon++;
Index: uvm_glue.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_glue.c,v
retrieving revision 1.8
diff -c -r1.8 uvm_glue.c
*** uvm_glue.c	2000/09/07 20:15:28	1.8
--- uvm_glue.c	2000/12/30 18:37:58
***************
*** 76,81 ****
--- 76,82 ----
  #include <sys/resourcevar.h>
  #include <sys/buf.h>
  #include <sys/user.h>
+ #include <sys/signalvar.h>
  #ifdef SYSVSHM
  #include <sys/shm.h>
  #endif
***************
*** 502,508 ****
  {
  	register struct proc *p;
  	struct proc *outp, *outp2;
! 	int outpri, outpri2;
  	int didswap = 0;
  	extern int maxslp; 
  	/* XXXCDC: should move off to uvmexp. or uvm., also in uvm_meter */
--- 503,510 ----
  {
  	register struct proc *p;
  	struct proc *outp, *outp2;
! 	long outpri, outpri2;
! 	int s;
  	int didswap = 0;
  	extern int maxslp; 
  	/* XXXCDC: should move off to uvmexp. or uvm., also in uvm_meter */
***************
*** 515,525 ****
  	/*
  	 * outp/outpri  : stop/sleep process with largest sleeptime < maxslp
  	 * outp2/outpri2: the longest resident process (its swap time)
  	 */
  	outp = outp2 = NULL;
  	outpri = outpri2 = 0;
  	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
! 		if (!swappable(p))
  			continue;
  		switch (p->p_stat) {
  		case SRUN:
--- 517,528 ----
  	/*
  	 * outp/outpri  : stop/sleep process with largest sleeptime < maxslp
  	 * outp2/outpri2: the longest resident process (its swap time)
+ 	 * Init(8) and siblings are not considered swappable. 
  	 */
  	outp = outp2 = NULL;
  	outpri = outpri2 = 0;
  	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
! 		if (!swappable(p) || p->p_pptr == &proc0)
  			continue;
  		switch (p->p_stat) {
  		case SRUN:
***************
*** 531,537 ****
  			
  		case SSLEEP:
  		case SSTOP:
! 			if (p->p_slptime >= maxslp) {
  				uvm_swapout(p);			/* zap! */
  				didswap++;
  			} else if (p->p_slptime > outpri) {
--- 534,543 ----
  			
  		case SSLEEP:
  		case SSTOP:
! 			/* XXX handle n swap devices better */
! 			if (p->p_slptime >= maxslp && 
! 			    uvmexp.swpages - uvmexp.swpginuse >
! 			    uvmexp.nswapdev * atop(round_page(USPACE))) {
  				uvm_swapout(p);			/* zap! */
  				didswap++;
  			} else if (p->p_slptime > outpri) {
***************
*** 547,562 ****
  	 * likely sleeping/stopped or running candidate.  We only do this
  	 * if we are real low on memory since we don't gain much by doing
  	 * it (USPACE bytes).
  	 */
! 	if (didswap == 0 && uvmexp.free <= atop(round_page(USPACE))) {
  		if ((p = outp) == NULL)
  			p = outp2;
  #ifdef DEBUG
  		if (swapdebug & SDB_SWAPOUT)
  			printf("swapout_threads: no duds, try procp %p\n", p);
  #endif
! 		if (p)
  			uvm_swapout(p);
  	}
  }
  
--- 553,669 ----
  	 * likely sleeping/stopped or running candidate.  We only do this
  	 * if we are real low on memory since we don't gain much by doing
  	 * it (USPACE bytes).
+ 	 * The last part of the test after || will send us in the "kill
+ 	 * memory hog" else block below.
  	 */
! 	if (didswap == 0 && (uvmexp.free <= atop(round_page(USPACE)) ||
! 	    uvmexp.nswapdev * atop(round_page(USPACE)) >=
! 	    uvmexp.swpages - uvmexp.swpginuse)) {
  		if ((p = outp) == NULL)
  			p = outp2;
  #ifdef DEBUG
  		if (swapdebug & SDB_SWAPOUT)
  			printf("swapout_threads: no duds, try procp %p\n", p);
  #endif
! 		/* XXX handle n swap devices better */
! 		if (p &&
! 		    uvmexp.swpages - uvmexp.swpginuse > 
! 		    uvmexp.nswapdev * atop(round_page(USPACE)))
  			uvm_swapout(p);
+ 		else {
+ 
+ 			/*
+ 			 * We couldn't swap any process and we are real low
+ 			 * on memory.  We will make room by killing the
+ 			 * biggest memory hog and his little family.
+ 			 * We assume that the biggest memory hog is the
+ 			 * process with the max. amount of page faults.
+ 			 * Init(8) and siblings are not considered here,
+ 			 * neither are processes with rgid == 0.   8]
+ 			 * OpenBSD PR number 137 fixed by:
+ 			 *	Mathias Schmocker smat_(_at_)_acm_(_dot_)_org
+ 			 */
+ 
+ 			/*
+ 			 * Recycle outp, outp2, outpri, outpri2 for finding
+ 			 * the "best" process to kill.
+ 			 */
+ 			outp = outp2 = NULL;
+ 			outpri = outpri2 = 0;
+ 
+ 			for (p = allproc.lh_first; p != 0;
+ 			    p = p->p_list.le_next) {
+ 
+ 				/*
+ 				 * XXX: How to be sure not to kill named ?
+ 				 */
+ 				if (!swappable(p) || p->p_pptr == &proc0 ||
+ 				    p->p_cred->p_rgid == 0)
+ 					continue;
+ 				switch (p->p_stat) {
+ 				case SRUN:
+ 					if (p->p_addr->u_stats.p_ru.ru_majflt >
+ 					    outpri2) {
+ 						outp2 = p;
+ 						outpri2 =
+ 						p->p_addr->u_stats.p_ru.ru_majflt;
+ 					}
+ 					continue;
+ 				case SSLEEP:
+ 				case SSTOP:
+ 					if (p->p_addr->u_stats.p_ru.ru_majflt >
+ 				   	 outpri) {
+ 						outp = p;
+ 						outpri =
+ 						p->p_addr->u_stats.p_ru.ru_majflt;
+ 					}
+ 					continue;
+ 				}
+ 			}
+ 
+ 			if ((p = outp2) == NULL) /* Favor running processes. */
+ 				p = outp;
+ 
+ 			if (p) {
+ 				s = splhigh();
+ #ifdef UVM_LOG_CHOKE
+ 				/*
+ 				 * We record the interesting data
+ 				 * of the biggest memory hog. Don't
+ 				 * overwrite if it is not yet logged.
+ 				 */
+ 
+ 				if (uvm.mem_hog_pid == 0) {
+ 					uvm.mem_hog_pid = p->p_pid;
+ 					uvm.mem_hog_ruid = p->p_cred->p_ruid;
+ 					uvm.mem_hog_rgid = p->p_cred->p_rgid;
+ 
+ 					strncpy(uvm.mem_hog_name,
+ 						p->p_comm, MAXCOMLEN);
+ 				}
+ #endif
+ 
+ 				/*
+ 				 * Kill the memory hog and all his
+ 				 * family, up to init's child.
+ 				 * (Had to read compiler output for getting
+ 				 * this right...)
+ 				 * We tweak the user priority for getting
+ 				 * out of this real fast (may help when swap
+ 				 * is encrypted).
+ 				 */
+ 
+ 				while (p->p_pptr != &proc0) {
+ 					p->p_usrpri = PUSER; /* XXX */
+ 					psignal(p, SIGKILL);
+ 					if ((p->p_pptr)->p_pptr != &proc0)
+ 						p = p->p_pptr;
+ 					else
+ 						break;
+ 				}
+ 				splx(s);
+ 			}
+ 		}
  	}
  }
  
Index: uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.6
diff -c -r1.6 uvm_pdaemon.c
*** uvm_pdaemon.c	2000/11/10 15:33:11	1.6
--- uvm_pdaemon.c	2000/12/30 18:38:10
***************
*** 75,80 ****
--- 75,83 ----
  #include <sys/systm.h>
  #include <sys/kernel.h>
  #include <sys/pool.h>
+ #ifdef UVM_LOG_CHOKE
+ #include <sys/syslog.h>
+ #endif
  
  #include <vm/vm.h>
  #include <vm/vm_page.h>
***************
*** 150,155 ****
--- 153,175 ----
  	    timo);
  
  	splx(s);
+ 
+ #ifdef UVM_LOG_CHOKE
+ 	/*
+ 	 * We wait until normal conditions for logging the fact.
+ 	 */
+ 
+ 	if (uvm.mem_hog_pid && uvmexp.free > uvmexp.freetarg) {
+ 		log(LOG_ALERT,
+ 		"%s[%d] killed, ruid=%d rgid=%d: memory and swap exhausted\n",
+ 			uvm.mem_hog_name, uvm.mem_hog_pid,
+ 			uvm.mem_hog_ruid, uvm.mem_hog_rgid);
+ 		uvm.mem_hog_pid = 0;
+ 		uvm.mem_hog_ruid = 0;
+ 		uvm.mem_hog_rgid = 0;
+ 		strncpy (uvm.mem_hog_name, "", MAXCOMLEN);
+ 	}
+ #endif
  }
  
  
***************
*** 546,552 ****
  			}
  #endif
  			if ((p->pqflags & PQ_SWAPBACKED) &&
! 			    uvmexp.swpgonly == uvmexp.swpages) {
  				dirtyreacts++;
  				uvm_pageactivate(p);
  				if (anon) {
--- 566,572 ----
  			}
  #endif
  			if ((p->pqflags & PQ_SWAPBACKED) &&
! 			    uvmexp.swpgonly == uvmexp.swpages) { /* XXX ?? */
  				dirtyreacts++;
  				uvm_pageactivate(p);
  				if (anon) {
***************
*** 933,939 ****
   * => called with pageq's locked
   */
  
! void
  uvmpd_scan()
  {
  	int s, free, inactive_shortage, swap_shortage, pages_freed;
--- 953,959 ----
   * => called with pageq's locked
   */
  
! static void
  uvmpd_scan()
  {
  	int s, free, inactive_shortage, swap_shortage, pages_freed;
Index: uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.20
diff -c -r1.20 uvm_swap.c
*** uvm_swap.c	2000/09/07 20:15:28	1.20
--- uvm_swap.c	2000/12/30 18:38:20
***************
*** 990,995 ****
--- 990,997 ----
  #endif /* defined(NFSCLIENT) */
  	dev_t dev;
  	char *name;
+ 	int s, n;
+ 
  	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
  
  	/*
***************
*** 1032,1037 ****
--- 1034,1040 ----
  			error = ENXIO;
  			goto bad;
  		}
+ 		sdp->swd_maxactive = 8; /* XXX */
  		break;
  
  #ifdef SWAP_TO_FILES
***************
*** 1171,1177 ****
  	 */
  	uvm_anon_add(size);
  
! #if 0
  	/*
  	 * At this point we could arrange to reserve memory for the
  	 * swap buffer pools.
--- 1174,1180 ----
  	 */
  	uvm_anon_add(size);
  
! #if 1
  	/*
  	 * At this point we could arrange to reserve memory for the
  	 * swap buffer pools.
***************
*** 1182,1189 ****
  	 * we always have some resources to continue operation.
  	 */
  
! 	int s = splbio();
! 	int n = 8 * sdp->swd_maxactive;
  
  	(void)pool_prime(swapbuf_pool, n, 0);
  
--- 1185,1192 ----
  	 * we always have some resources to continue operation.
  	 */
  
! 	s = splbio();
! 	n = 8 * sdp->swd_maxactive;
  
  	(void)pool_prime(swapbuf_pool, n, 0);
  
--- end of output of cvs diff -c *.h and *.c in /sys/uvm

--- suggestion for arch-independent GENERIC kernel compile option
in /sys/conf/GENERIC, below UVM_SWAP_ENCRYPT
  option		UVM_SWAP_ENCRYPT# support encryption of pages going to swap
+ #option		UVM_LOG_CHOKE	# use log(9) for reporting memory exhaustion

Index: options.4
===================================================================
RCS file: /cvs/src/share/man/man4/options.4,v
retrieving revision 1.56
diff -c -r1.56 options.4
*** options.4	2000/11/20 08:00:01	1.56
--- options.4	2001/01/01 16:46:44
***************
*** 593,598 ****
--- 593,601 ----
  and
  .Xr sysctl 3
  for details.
+ .It Cd option UVM_LOG_CHOKE
+ Allows logging of memory and swap exhaustion after conditions return back
+ to normal and memory hungry processes are killed.
  .El
  .Ss Networking Options
  .Bl -ohang


These patches were tested in the 8 following combinations:
	UVM_LOG_CHOKE	swap encrypted	swap size (512-blocks)
	  no		  yes		  366849
	  no		  yes		  733698
	  no		  no		  366849
	  no		  no		  733698
	  yes		  yes		  366849
	  yes		  yes		  733698
	  yes		  no		  366849
	  yes		  no		  733698

Swap space was made up by 1 or 2 identical b partitions.
The following script was used for testing from a non-root
user account; ulimits were not changed and malloc(3) options
were not used.
#!/bin/sh
(while true; do grep 1 /dev/zero & done)

One VT, single shot with the script, then letting the system
recover.

5 VTs running the script: sustained test for 15 minutes,
restarting the script upon "cannot fork - try again" or after
logging in again if the shell was killed, then letting the system
recover by itself after 15 minutes (the recovery could
take up to 8 minutes).

In the large swap size, non-encrypted combinations, I got
the "pagedaemon: deadlock detected!" message during the
15-minute sustained test and the recovery. But switching
VTs saved the day! The kernel said "pcvt: scrollback memory
malloc failed" and/or dumped core for grep/sh/getty and
happily continued to thrash its way out of this...

For completeness, dmesg output:
(FYI, the pcvt patches I posted before were included in the kernel sources)

OpenBSD 2.8-current (GENERIC.GATEWAY.NETATALK.XFS.DMA) #194: Sat Dec 30 18:27:34 CET 2000
    smat_(_at_)_polaris:/usr/src/sys/arch/i386/compile/GENERIC.GATEWAY.NETATALK.XFS.DMA
cpu0: AMD K6-2 ("AuthenticAMD" 586-class) 267 MHz
cpu0: FPU,V86,DE,PSE,TSC,MSR,MCE,CX8,PGE,MMX
real mem  = 133787648 (130652K)
avail mem = 118947840 (116160K)
using 1658 buffers containing 6791168 bytes (6632K) of memory
mainbus0 (root)
bios0 at mainbus0: AT/286+(90) BIOS, date 10/14/98, BIOS32 rev. 0 @ 0xf0530
apm0 at bios0: Power Management spec V1.2
apm0: AC on, battery charge unknown
pcibios0 at bios0: rev. 2.1 found at 0xf0000[0xbe2]
pcibios0: PCI IRQ Routing Table rev. 1.0 found at 0xf0b40, size 160 bytes (8 entries)
pcibios0: PCI Interrupt Router at 000:07:0 ("Acer Labs M1543 PCI-ISA" rev 0x00)
pci_intr_fixup: no compatible PCI ICU found: ICU vendor 0x10b9 product 0x1533
pcibios0: Warning, unable to fix up PCI interrupt routing
pcibios0: PCI bus #1 is the last bus
pci0 at mainbus0 bus 0: configuration mode 1 (no bios)
pchb0 at pci0 dev 0 function 0 "Acer Labs M1543 Host-PCI" rev 0x04
ppb0 at pci0 dev 1 function 0 "Acer Labs M5243 AGP/PCI-PCI" rev 0x04
pci1 at ppb0 bus 1
"Acer Labs M7101 Power Management" rev 0x00 at pci0 dev 3 function 0 not configured
pcib0 at pci0 dev 7 function 0 "Acer Labs M1543 PCI-ISA" rev 0xc3
xl0 at pci0 dev 9 function 0 "3Com 3c900B 10Mbps-Combo" rev 0x04: irq 6 address 00:50:04:00:f1:62
xl0: selecting 10baseT transceiver, half duplex
xl1 at pci0 dev 10 function 0 "3Com 3c900 10Mbps-Combo" rev 0x00: irq 5 address 00:60:08:a0:71:1b
xl1: selecting BNC port, half duplex
"S3 ViRGE DX/GX" rev 0x01 at pci0 dev 11 function 0 not configured
xl2 at pci0 dev 12 function 0 "3Com 3c900B 10Mbps-Combo" rev 0x04: irq 11 address 00:50:04:00:f1:ae
xl2: selecting 10baseT transceiver, half duplex
xl3 at pci0 dev 13 function 0 "3Com 3c900 10Mbps-Combo" rev 0x00: irq 6 address 00:10:4b:9d:59:a0
xl3: selecting 10baseT transceiver, half duplex
pciide0 at pci0 dev 15 function 0 "Acer Labs M5229 UDMA IDE" rev 0xc1: DMA (unsupported), channel 0 configured to compatibility, channel 1 configured to compatibility
pciide0: channel 0 interrupting at irq 14
wd0 at pciide0 channel 0 drive 0: <QUANTUM FIREBALLP LM15>
wd0: can use 16-bit, PIO mode 4
wd0: 16-sector PIO, LBA, 14324MB, 16383 cyl, 16 head, 63 sec, 29336832 sectors
pciide0: channel 1 interrupting at irq 15
atapiscsi0 at pciide0 channel 1
scsibus0 at atapiscsi0: 2 targets
cd0 at scsibus0 targ 1 lun 0: <LITEON, CD-ROM LTN485S, JL1F> SCSI0 5/cdrom removable
cd0: can use 16-bit, PIO mode 4
wd1 at pciide0 channel 1 drive 0: <QUANTUM FIREBALLP LM15>
wd1: can use 16-bit, PIO mode 4
wd1: 16-sector PIO, LBA, 14324MB, 16383 cyl, 16 head, 63 sec, 29336832 sectors
isa0 at pcib0
isadma0 at isa0
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
midi0 at pcppi0: <PC speaker>
sysbeep0 at pcppi0
lpt0 at isa0 port 0x378/4 irq 7
npx0 at isa0 port 0xf0/16: using exception 16
pccom0 at isa0 port 0x3f8/8 irq 4: ns16550a, 16 byte fifo
pccom1 at isa0 port 0x2f8/8 irq 3: ns16550a, 16 byte fifo
vt0 at isa0 port 0x60/16 irq 1: vga 80 col, color, 8 scr, mf2-kbd
pms0 at vt0 irq 12
biomask c000 netmask c860 ttymask d8e2
pctr: user-level cycle counter enabled
mtrr: K6-family MTRR support (2 registers)
dkcsum: wd0 matched BIOS disk 80
dkcsum: wd1 matched BIOS disk 81
root on wd0a
rootdev=0x0 rrootdev=0x300 rawdev=0x302



Visit your host, monkey.org