[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: kernel/137: vm hangs doing (while true; do grep xyzzy /dev/zero & done)
- To: bugs_(_at_)_cvs_(_dot_)_openbsd_(_dot_)_org, gnats_(_at_)_cvs_(_dot_)_openbsd_(_dot_)_org
- Subject: Re: kernel/137: vm hangs doing (while true; do grep xyzzy /dev/zero & done)
- From: Mathias Schmocker <smat_(_at_)_acm_(_dot_)_org>
- Date: Mon, 1 Jan 2001 19:29:57 +0100 (CET)
Could you please review these patches carefully? They solve PR# 137 on my
machine (i386, OpenBSD 2.8). How I tested them is explained below.
I can't test them on other architectures.
Mathias Schmocker <smat_(_at_)_acm_(_dot_)_org>
--- output of cvs diff -c *.h and *.c in /sys/uvm
Index: uvm.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm.h,v
retrieving revision 1.5
diff -c -r1.5 uvm.h
*** uvm.h 2000/05/27 21:06:08 1.5
--- uvm.h 2000/12/30 18:37:39
***************
*** 110,115 ****
--- 110,123 ----
/* kernel object: to support anonymous pageable kernel memory */
struct uvm_object *kernel_object;
+
+ #ifdef UVM_LOG_CHOKE
+ /* Data sent to log(9) after surviving a memory and swap exhaustion. */
+ pid_t mem_hog_pid; /* Process identifier. */
+ uid_t mem_hog_ruid; /* Real user id. */
+ gid_t mem_hog_rgid; /* Real group id. */
+ char mem_hog_name[MAXCOMLEN+1]; /* Copy of p->p_comm */
+ #endif
};
extern struct uvm uvm;
Index: uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.5
diff -c -r1.5 uvm_fault.c
*** uvm_fault.c 2000/03/16 22:11:04 1.5
--- uvm_fault.c 2000/12/30 18:37:55
***************
*** 1171,1181 ****
uvm_anfree(anon);
uvmfault_unlockall(&ufi, amap, uobj, oanon);
#ifdef DIAGNOSTIC
! if (uvmexp.swpgonly > uvmexp.swpages) {
! panic("uvmexp.swpgonly botch");
}
#endif
! if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
uvmexp.fltnoanon++;
--- 1171,1184 ----
uvm_anfree(anon);
uvmfault_unlockall(&ufi, amap, uobj, oanon);
#ifdef DIAGNOSTIC
! if (uvmexp.swpginuse > uvmexp.swpages) {
! panic("uvmexp.swpginuse botch");
}
#endif
! /* XXX handle n swap devices better */
! if (anon == NULL ||
! uvmexp.nswapdev * atop(round_page(USPACE)) >=
! uvmexp.swpages - uvmexp.swpginuse) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
uvmexp.fltnoanon++;
***************
*** 1576,1586 ****
/* unlock and fail ... */
uvmfault_unlockall(&ufi, amap, uobj, NULL);
#ifdef DIAGNOSTIC
! if (uvmexp.swpgonly > uvmexp.swpages) {
! panic("uvmexp.swpgonly botch");
}
#endif
! if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) {
UVMHIST_LOG(maphist, " promote: out of VM",
0,0,0,0);
uvmexp.fltnoanon++;
--- 1579,1592 ----
/* unlock and fail ... */
uvmfault_unlockall(&ufi, amap, uobj, NULL);
#ifdef DIAGNOSTIC
! if (uvmexp.swpginuse > uvmexp.swpages) {
! panic("uvmexp.swpginuse botch");
}
#endif
! /* XXX handle n swap devices better */
! if (anon == NULL ||
! uvmexp.nswapdev * atop(round_page(USPACE)) >=
! uvmexp.swpages - uvmexp.swpginuse) {
UVMHIST_LOG(maphist, " promote: out of VM",
0,0,0,0);
uvmexp.fltnoanon++;
Index: uvm_glue.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_glue.c,v
retrieving revision 1.8
diff -c -r1.8 uvm_glue.c
*** uvm_glue.c 2000/09/07 20:15:28 1.8
--- uvm_glue.c 2000/12/30 18:37:58
***************
*** 76,81 ****
--- 76,82 ----
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/user.h>
+ #include <sys/signalvar.h>
#ifdef SYSVSHM
#include <sys/shm.h>
#endif
***************
*** 502,508 ****
{
register struct proc *p;
struct proc *outp, *outp2;
! int outpri, outpri2;
int didswap = 0;
extern int maxslp;
/* XXXCDC: should move off to uvmexp. or uvm., also in uvm_meter */
--- 503,510 ----
{
register struct proc *p;
struct proc *outp, *outp2;
! long outpri, outpri2;
! int s;
int didswap = 0;
extern int maxslp;
/* XXXCDC: should move off to uvmexp. or uvm., also in uvm_meter */
***************
*** 515,525 ****
/*
* outp/outpri : stop/sleep process with largest sleeptime < maxslp
* outp2/outpri2: the longest resident process (its swap time)
*/
outp = outp2 = NULL;
outpri = outpri2 = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
! if (!swappable(p))
continue;
switch (p->p_stat) {
case SRUN:
--- 517,528 ----
/*
* outp/outpri : stop/sleep process with largest sleeptime < maxslp
* outp2/outpri2: the longest resident process (its swap time)
+ * Init(8) and siblings are not considered swappable.
*/
outp = outp2 = NULL;
outpri = outpri2 = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
! if (!swappable(p) || p->p_pptr == &proc0)
continue;
switch (p->p_stat) {
case SRUN:
***************
*** 531,537 ****
case SSLEEP:
case SSTOP:
! if (p->p_slptime >= maxslp) {
uvm_swapout(p); /* zap! */
didswap++;
} else if (p->p_slptime > outpri) {
--- 534,543 ----
case SSLEEP:
case SSTOP:
! /* XXX handle n swap devices better */
! if (p->p_slptime >= maxslp &&
! uvmexp.swpages - uvmexp.swpginuse >
! uvmexp.nswapdev * atop(round_page(USPACE))) {
uvm_swapout(p); /* zap! */
didswap++;
} else if (p->p_slptime > outpri) {
***************
*** 547,562 ****
* likely sleeping/stopped or running candidate. We only do this
* if we are real low on memory since we don't gain much by doing
* it (USPACE bytes).
*/
! if (didswap == 0 && uvmexp.free <= atop(round_page(USPACE))) {
if ((p = outp) == NULL)
p = outp2;
#ifdef DEBUG
if (swapdebug & SDB_SWAPOUT)
printf("swapout_threads: no duds, try procp %p\n", p);
#endif
! if (p)
uvm_swapout(p);
}
}
--- 553,669 ----
* likely sleeping/stopped or running candidate. We only do this
* if we are real low on memory since we don't gain much by doing
* it (USPACE bytes).
+ * The last part of the test after || will send us in the "kill
+ * memory hog" else block below.
*/
! if (didswap == 0 && (uvmexp.free <= atop(round_page(USPACE)) ||
! uvmexp.nswapdev * atop(round_page(USPACE)) >=
! uvmexp.swpages - uvmexp.swpginuse)) {
if ((p = outp) == NULL)
p = outp2;
#ifdef DEBUG
if (swapdebug & SDB_SWAPOUT)
printf("swapout_threads: no duds, try procp %p\n", p);
#endif
! /* XXX handle n swap devices better */
! if (p &&
! uvmexp.swpages - uvmexp.swpginuse >
! uvmexp.nswapdev * atop(round_page(USPACE)))
uvm_swapout(p);
+ else {
+
+ /*
+ * We couldn't swap any process and we are real low
+ * on memory. We will make room by killing the
+ * biggest memory hog and his little family.
+ * We assume that the biggest memory hog is the
+ * process with the max. amount of page faults.
+ * Init(8) and siblings are not considered here,
+ * neither are processes with rgid == 0. 8]
+ * OpenBSD PR number 137 fixed by:
+ * Mathias Schmocker smat_(_at_)_acm_(_dot_)_org
+ */
+
+ /*
+ * Recycle outp, outp2, outpri, outpri2 for finding
+ * the "best" process to kill.
+ */
+ outp = outp2 = NULL;
+ outpri = outpri2 = 0;
+
+ for (p = allproc.lh_first; p != 0;
+ p = p->p_list.le_next) {
+
+ /*
+ * XXX: How to be sure not to kill named ?
+ */
+ if (!swappable(p) || p->p_pptr == &proc0 ||
+ p->p_cred->p_rgid == 0)
+ continue;
+ switch (p->p_stat) {
+ case SRUN:
+ if (p->p_addr->u_stats.p_ru.ru_majflt >
+ outpri2) {
+ outp2 = p;
+ outpri2 =
+ p->p_addr->u_stats.p_ru.ru_majflt;
+ }
+ continue;
+ case SSLEEP:
+ case SSTOP:
+ if (p->p_addr->u_stats.p_ru.ru_majflt >
+ outpri) {
+ outp = p;
+ outpri =
+ p->p_addr->u_stats.p_ru.ru_majflt;
+ }
+ continue;
+ }
+ }
+
+ if ((p = outp2) == NULL) /* Favor running processes. */
+ p = outp;
+
+ if (p) {
+ s = splhigh();
+ #ifdef UVM_LOG_CHOKE
+ /*
+ * We record the interesting data
+ * of the biggest memory hog. Don't
+ * overwrite if it is not yet logged.
+ */
+
+ if (uvm.mem_hog_pid == 0) {
+ uvm.mem_hog_pid = p->p_pid;
+ uvm.mem_hog_ruid = p->p_cred->p_ruid;
+ uvm.mem_hog_rgid = p->p_cred->p_rgid;
+
+ strncpy(uvm.mem_hog_name,
+ p->p_comm, MAXCOMLEN);
+ }
+ #endif
+
+ /*
+ * Kill the memory hog and all his
+ * family, up to init's child.
+ * (Had to read compiler output for getting
+ * this right...)
+ * We tweak the user priority for getting
+ * out of this real fast (may help when swap
+ * is encrypted).
+ */
+
+ while (p->p_pptr != &proc0) {
+ p->p_usrpri = PUSER; /* XXX */
+ psignal(p, SIGKILL);
+ if ((p->p_pptr)->p_pptr != &proc0)
+ p = p->p_pptr;
+ else
+ break;
+ }
+ splx(s);
+ }
+ }
}
}
Index: uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.6
diff -c -r1.6 uvm_pdaemon.c
*** uvm_pdaemon.c 2000/11/10 15:33:11 1.6
--- uvm_pdaemon.c 2000/12/30 18:38:10
***************
*** 75,80 ****
--- 75,83 ----
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/pool.h>
+ #ifdef UVM_LOG_CHOKE
+ #include <sys/syslog.h>
+ #endif
#include <vm/vm.h>
#include <vm/vm_page.h>
***************
*** 150,155 ****
--- 153,175 ----
timo);
splx(s);
+
+ #ifdef UVM_LOG_CHOKE
+ /*
+ * We wait until normal conditions for logging the fact.
+ */
+
+ if (uvm.mem_hog_pid && uvmexp.free > uvmexp.freetarg) {
+ log(LOG_ALERT,
+ "%s[%d] killed, ruid=%d rgid=%d: memory and swap exhausted\n",
+ uvm.mem_hog_name, uvm.mem_hog_pid,
+ uvm.mem_hog_ruid, uvm.mem_hog_rgid);
+ uvm.mem_hog_pid = 0;
+ uvm.mem_hog_ruid = 0;
+ uvm.mem_hog_rgid = 0;
+ strncpy (uvm.mem_hog_name, "", MAXCOMLEN);
+ }
+ #endif
}
***************
*** 546,552 ****
}
#endif
if ((p->pqflags & PQ_SWAPBACKED) &&
! uvmexp.swpgonly == uvmexp.swpages) {
dirtyreacts++;
uvm_pageactivate(p);
if (anon) {
--- 566,572 ----
}
#endif
if ((p->pqflags & PQ_SWAPBACKED) &&
! uvmexp.swpgonly == uvmexp.swpages) { /* XXX ?? */
dirtyreacts++;
uvm_pageactivate(p);
if (anon) {
***************
*** 933,939 ****
* => called with pageq's locked
*/
! void
uvmpd_scan()
{
int s, free, inactive_shortage, swap_shortage, pages_freed;
--- 953,959 ----
* => called with pageq's locked
*/
! static void
uvmpd_scan()
{
int s, free, inactive_shortage, swap_shortage, pages_freed;
Index: uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.20
diff -c -r1.20 uvm_swap.c
*** uvm_swap.c 2000/09/07 20:15:28 1.20
--- uvm_swap.c 2000/12/30 18:38:20
***************
*** 990,995 ****
--- 990,997 ----
#endif /* defined(NFSCLIENT) */
dev_t dev;
char *name;
+ int s, n;
+
UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
/*
***************
*** 1032,1037 ****
--- 1034,1040 ----
error = ENXIO;
goto bad;
}
+ sdp->swd_maxactive = 8; /* XXX */
break;
#ifdef SWAP_TO_FILES
***************
*** 1171,1177 ****
*/
uvm_anon_add(size);
! #if 0
/*
* At this point we could arrange to reserve memory for the
* swap buffer pools.
--- 1174,1180 ----
*/
uvm_anon_add(size);
! #if 1
/*
* At this point we could arrange to reserve memory for the
* swap buffer pools.
***************
*** 1182,1189 ****
* we always have some resources to continue operation.
*/
! int s = splbio();
! int n = 8 * sdp->swd_maxactive;
(void)pool_prime(swapbuf_pool, n, 0);
--- 1185,1192 ----
* we always have some resources to continue operation.
*/
! s = splbio();
! n = 8 * sdp->swd_maxactive;
(void)pool_prime(swapbuf_pool, n, 0);
--- end of output of cvs diff -c *.h and *.c in /sys/uvm
--- suggestion for arch-independent GENERIC kernel compile option
in /sys/conf/GENERIC, below UVM_SWAP_ENCRYPT
option UVM_SWAP_ENCRYPT# support encryption of pages going to swap
+ #option UVM_LOG_CHOKE # use log(9) for reporting memory exhaustion
Index: options.4
===================================================================
RCS file: /cvs/src/share/man/man4/options.4,v
retrieving revision 1.56
diff -c -r1.56 options.4
*** options.4 2000/11/20 08:00:01 1.56
--- options.4 2001/01/01 16:46:44
***************
*** 593,598 ****
--- 593,601 ----
and
.Xr sysctl 3
for details.
+ .It Cd option UVM_LOG_CHOKE
+ Allows logging of memory and swap exhaustion after conditions return back
+ to normal and memory hungry processes are killed.
.El
.Ss Networking Options
.Bl -ohang
These patches were tested in the 8 following combinations:
UVM_LOG_CHOKE swap encrypted swap size (512-blocks)
no yes 366849
no yes 733698
no no 366849
no no 733698
yes yes 366849
yes yes 733698
yes no 366849
yes no 733698
Swap space was made up by 1 or 2 identical b partitions.
The following script was used for testing from a non-root
user account; ulimits were not changed and malloc(3) options
were not used.
#!/bin/sh
(while true; do grep 1 /dev/zero & done)
One VT, single shot with the script, then letting the system
recover.
5 VTs running the script, sustained test for 15 minutes,
restarting the script upon "cannot fork - try again" or after
logging in again if the shell was killed. Then letting the system
recover by itself after the 15 minutes (the recovery could
take up to 8 minutes).
In the large swap size, non-encrypted combinations, I got
the "pagedaemon: deadlock detected!" message during the 15-minute
sustained test and during the recovery. But switching
VTs saved the day! The kernel said "pcvt: scrollback memory
malloc failed" and/or dumped core for grep/sh/getty and
happily continued to thrash its way out of this...
For completeness, dmesg output:
(FYI, the pcvt patches I posted before were included in the kernel sources)
OpenBSD 2.8-current (GENERIC.GATEWAY.NETATALK.XFS.DMA) #194: Sat Dec 30 18:27:34 CET 2000
smat_(_at_)_polaris:/usr/src/sys/arch/i386/compile/GENERIC.GATEWAY.NETATALK.XFS.DMA
cpu0: AMD K6-2 ("AuthenticAMD" 586-class) 267 MHz
cpu0: FPU,V86,DE,PSE,TSC,MSR,MCE,CX8,PGE,MMX
real mem = 133787648 (130652K)
avail mem = 118947840 (116160K)
using 1658 buffers containing 6791168 bytes (6632K) of memory
mainbus0 (root)
bios0 at mainbus0: AT/286+(90) BIOS, date 10/14/98, BIOS32 rev. 0 @ 0xf0530
apm0 at bios0: Power Management spec V1.2
apm0: AC on, battery charge unknown
pcibios0 at bios0: rev. 2.1 found at 0xf0000[0xbe2]
pcibios0: PCI IRQ Routing Table rev. 1.0 found at 0xf0b40, size 160 bytes (8 entries)
pcibios0: PCI Interrupt Router at 000:07:0 ("Acer Labs M1543 PCI-ISA" rev 0x00)
pci_intr_fixup: no compatible PCI ICU found: ICU vendor 0x10b9 product 0x1533
pcibios0: Warning, unable to fix up PCI interrupt routing
pcibios0: PCI bus #1 is the last bus
pci0 at mainbus0 bus 0: configuration mode 1 (no bios)
pchb0 at pci0 dev 0 function 0 "Acer Labs M1543 Host-PCI" rev 0x04
ppb0 at pci0 dev 1 function 0 "Acer Labs M5243 AGP/PCI-PCI" rev 0x04
pci1 at ppb0 bus 1
"Acer Labs M7101 Power Management" rev 0x00 at pci0 dev 3 function 0 not configured
pcib0 at pci0 dev 7 function 0 "Acer Labs M1543 PCI-ISA" rev 0xc3
xl0 at pci0 dev 9 function 0 "3Com 3c900B 10Mbps-Combo" rev 0x04: irq 6 address 00:50:04:00:f1:62
xl0: selecting 10baseT transceiver, half duplex
xl1 at pci0 dev 10 function 0 "3Com 3c900 10Mbps-Combo" rev 0x00: irq 5 address 00:60:08:a0:71:1b
xl1: selecting BNC port, half duplex
"S3 ViRGE DX/GX" rev 0x01 at pci0 dev 11 function 0 not configured
xl2 at pci0 dev 12 function 0 "3Com 3c900B 10Mbps-Combo" rev 0x04: irq 11 address 00:50:04:00:f1:ae
xl2: selecting 10baseT transceiver, half duplex
xl3 at pci0 dev 13 function 0 "3Com 3c900 10Mbps-Combo" rev 0x00: irq 6 address 00:10:4b:9d:59:a0
xl3: selecting 10baseT transceiver, half duplex
pciide0 at pci0 dev 15 function 0 "Acer Labs M5229 UDMA IDE" rev 0xc1: DMA (unsupported), channel 0 configured to compatibility, channel 1 configured to compatibility
pciide0: channel 0 interrupting at irq 14
wd0 at pciide0 channel 0 drive 0: <QUANTUM FIREBALLP LM15>
wd0: can use 16-bit, PIO mode 4
wd0: 16-sector PIO, LBA, 14324MB, 16383 cyl, 16 head, 63 sec, 29336832 sectors
pciide0: channel 1 interrupting at irq 15
atapiscsi0 at pciide0 channel 1
scsibus0 at atapiscsi0: 2 targets
cd0 at scsibus0 targ 1 lun 0: <LITEON, CD-ROM LTN485S, JL1F> SCSI0 5/cdrom removable
cd0: can use 16-bit, PIO mode 4
wd1 at pciide0 channel 1 drive 0: <QUANTUM FIREBALLP LM15>
wd1: can use 16-bit, PIO mode 4
wd1: 16-sector PIO, LBA, 14324MB, 16383 cyl, 16 head, 63 sec, 29336832 sectors
isa0 at pcib0
isadma0 at isa0
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
midi0 at pcppi0: <PC speaker>
sysbeep0 at pcppi0
lpt0 at isa0 port 0x378/4 irq 7
npx0 at isa0 port 0xf0/16: using exception 16
pccom0 at isa0 port 0x3f8/8 irq 4: ns16550a, 16 byte fifo
pccom1 at isa0 port 0x2f8/8 irq 3: ns16550a, 16 byte fifo
vt0 at isa0 port 0x60/16 irq 1: vga 80 col, color, 8 scr, mf2-kbd
pms0 at vt0 irq 12
biomask c000 netmask c860 ttymask d8e2
pctr: user-level cycle counter enabled
mtrr: K6-family MTRR support (2 registers)
dkcsum: wd0 matched BIOS disk 80
dkcsum: wd1 matched BIOS disk 81
root on wd0a
rootdev=0x0 rrootdev=0x300 rawdev=0x302
Visit your host, monkey.org