[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

du(1): use fmt_scaled(3) and speedup



Hi,

two changes fo du(1):

- Use fmt_scaled(3) for printing -h values. One change in behaviour:  
values between 10 and 100 are printed with an extra decimal:

$ du -h /boot
42K     /boot
$ obj/du -h /boot
42.0K   /boot
$

- Second change is a speedup of the function that checks if a file with
more than one hard link is already counted. -current code uses a linear
search, resulting in O(n^2) behavior.  A test on a fs with lots of files
with link count > 1:

$ time obj/du -sh /scratch3
4.4M    /scratch3
    6.98s real     0.62s user     3.22s system
$ time du -sh /scratch3
4.4M    /scratch3
   65.44s real    59.31s user     3.37s system

roughly 100x speedup in user time, and a 10x speedup in elapsed time.

This is from kientzle@freebsd. His commit messgage:

'Speed up hardlink detection by using a self-sizing hash
table rather than the old linear list search.

On my "hardlink detection torture test", this reduced
user time from 4700 seconds down to 4.2 seconds
and wallclock time from 1:24:48 down to 1:08.
(Yes, that's over one THOUSAND times reduction in user time. ;-)
In the worst case, the new code doubles peak memory usage,
though it could actually reduce memory usage in many cases.'

I played a bit with using tree datastructures from libc (tsearch(3)), but
that produced disappointing results, due to the fact that the tree
degraded to a linear list because of the way inodes are allocated on my
test fs (the tsearch(3) trees are not balanced).

	-Otto

Index: Makefile
===================================================================
RCS file: /cvs/src/usr.bin/du/Makefile,v
retrieving revision 1.3
diff -u -p -r1.3 Makefile
--- Makefile	21 Sep 1997 11:48:54 -0000	1.3
+++ Makefile	12 Jun 2004 11:16:22 -0000
@@ -1,5 +1,7 @@
 #	$OpenBSD: Makefile,v 1.3 1997/09/21 11:48:54 deraadt Exp $
 
 PROG=	du
+DPADD= ${LIBUTIL}
+LDADD= -lutil
 
 .include <bsd.prog.mk>
Index: du.c
===================================================================
RCS file: /cvs/src/usr.bin/du/du.c,v
retrieving revision 1.15
diff -u -p -r1.15 du.c
--- du.c	2 Jun 2004 14:58:46 -0000	1.15
+++ du.c	12 Jun 2004 11:10:05 -0000
@@ -54,18 +54,16 @@ static char rcsid[] = "$OpenBSD: du.c,v 
 #include <err.h>
 #include <errno.h>
 #include <fts.h>
-#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <util.h>
 
-typedef enum { NONE = 0, KILO, MEGA, GIGA, TERA, PETA /* , EXA */ } unit_t;
 
 int	 linkchk(FTSENT *);
 void	 prtout(quad_t, char *, int);
 void	 usage(void);
-unit_t	 unit_adjust(double *);
 
 int
 main(int argc, char *argv[])
@@ -219,87 +217,152 @@ main(int argc, char *argv[])
 	exit(rval);
 }
 
-typedef struct _ID {
-	dev_t	dev;
-	ino_t	inode;
-} ID;
-
 int
 linkchk(FTSENT *p)
 {
-	static ID *files;
-	static int maxfiles, nfiles;
-	ID *fp, *start;
-	ino_t ino;
-	dev_t dev;
-
-	ino = p->fts_statp->st_ino;
-	dev = p->fts_statp->st_dev;
-	if ((start = files) != NULL)
-		for (fp = start + nfiles - 1; fp >= start; --fp)
-			if (ino == fp->inode && dev == fp->dev)
-				return (1);
-
-	if (nfiles == maxfiles && (files = realloc((char *)files,
-	    (u_int)(sizeof(ID) * (maxfiles += 128)))) == NULL)
-		err(1, NULL);
-	files[nfiles].inode = ino;
-	files[nfiles].dev = dev;
-	++nfiles;
-	return (0);
-}
+	struct links_entry {
+		struct links_entry *next;
+		struct links_entry *previous;
+		int	 links;
+		dev_t	 dev;
+		ino_t	 ino;
+	};
+	static const size_t links_hash_initial_size = 8192;
+	static struct links_entry **buckets;
+	static struct links_entry *free_list;
+	static size_t number_buckets;
+	static unsigned long number_entries;
+	static char stop_allocating;
+	struct links_entry *le, **new_buckets;
+	struct stat *st;
+	size_t i, new_size;
+	int count, hash;
+
+	st = p->fts_statp;
+
+	/* If necessary, initialize the hash table. */
+	if (buckets == NULL) {
+		number_buckets = links_hash_initial_size;
+		buckets = malloc(number_buckets * sizeof(buckets[0]));
+		if (buckets == NULL)
+			errx(1, "No memory for hardlink detection");
+		for (i = 0; i < number_buckets; i++)
+			buckets[i] = NULL;
+	}
 
-/*
- * "human-readable" output: use 3 digits max.--put unit suffixes at
- * the end.  Makes output compact and easy-to-read. 
- */
+	/* If the hash table is getting too full, enlarge it. */
+	if (number_entries > number_buckets * 10 && !stop_allocating) {
+		new_size = number_buckets * 2;
+		new_buckets = malloc(new_size * sizeof(struct links_entry *));
+		count = 0;
+
+		/* Try releasing the free list to see if that helps. */
+		if (new_buckets == NULL && free_list != NULL) {
+			while (free_list != NULL) {
+				le = free_list;
+				free_list = le->next;
+				free(le);
+			}
+			new_buckets = malloc(new_size * sizeof(new_buckets[0]));
+		}
 
-unit_t
-unit_adjust(double *val)
-{
-	double abval;
-	unit_t unit;
+		if (new_buckets == NULL) {
+			stop_allocating = 1;
+			warnx("No more memory for tracking hard links");
+		} else {
+			memset(new_buckets, 0,
+			    new_size * sizeof(struct links_entry *));
+			for (i = 0; i < number_buckets; i++) {
+				while (buckets[i] != NULL) {
+					/* Remove entry from old bucket. */
+					le = buckets[i];
+					buckets[i] = le->next;
+
+					/* Add entry to new bucket. */
+					hash = (le->dev ^ le->ino) % new_size;
+
+					if (new_buckets[hash] != NULL)
+						new_buckets[hash]->previous =
+						    le;
+					le->next = new_buckets[hash];
+					le->previous = NULL;
+					new_buckets[hash] = le;
+				}
+			}
+			free(buckets);
+			buckets = new_buckets;
+			number_buckets = new_size;
+		}
+	}
 
-	abval = fabs(*val);
-	if (abval < 1024)
-		unit = NONE;
-	else if (abval < 1048576ULL) {
-		unit = KILO;
-		*val /= 1024;
-	} else if (abval < 1073741824ULL) {
-		unit = MEGA;
-		*val /= 1048576;
-	} else if (abval < 1099511627776ULL) {
-		unit = GIGA;
-		*val /= 1073741824ULL;
-	} else if (abval < 1125899906842624ULL) {
-		unit = TERA;
-		*val /= 1099511627776ULL;
-	} else /* if (abval < 1152921504606846976ULL) */ {
-		unit = PETA;
-		*val /= 1125899906842624ULL;
+	/* Try to locate this entry in the hash table. */
+	hash = ( st->st_dev ^ st->st_ino ) % number_buckets;
+	for (le = buckets[hash]; le != NULL; le = le->next) {
+		if (le->dev == st->st_dev && le->ino == st->st_ino) {
+			/*
+			 * Save memory by releasing an entry when we've seen
+			 * all of it's links.
+			 */
+			if (--le->links <= 0) {
+				if (le->previous != NULL)
+					le->previous->next = le->next;
+				if (le->next != NULL)
+					le->next->previous = le->previous;
+				if (buckets[hash] == le)
+					buckets[hash] = le->next;
+				number_entries--;
+				/* Recycle this node through the free list */
+				if (stop_allocating) {
+					free(le);
+				} else {
+					le->next = free_list;
+					free_list = le;
+				}
+			}
+			return (1);
+		}
+	}
+
+	if (stop_allocating)
+		return (0);
+
+	/* Add this entry to the links cache. */
+	if (free_list != NULL) {
+		/* Pull a node from the free list if we can. */
+		le = free_list;
+		free_list = le->next;
+	} else
+		/* Malloc one if we have to. */
+		le = malloc(sizeof(struct links_entry));
+	if (le == NULL) {
+		stop_allocating = 1;
+		warnx("No more memory for tracking hard links");
+		return (0);
 	}
-	return (unit);
+	le->dev = st->st_dev;
+	le->ino = st->st_ino;
+	le->links = st->st_nlink - 1;
+	number_entries++;
+	le->next = buckets[hash];
+	le->previous = NULL;
+	if (buckets[hash] != NULL)
+		buckets[hash]->previous = le;
+	buckets[hash] = le;
+	return (0);
 }
 
 void
 prtout(quad_t size, char *path, int hflag)
 {
-	unit_t unit;
-	double bytes;
-
 	if (!hflag)
 		(void)printf("%lld\t%s\n", (long long)size, path);
 	else {
-		bytes = (double)size * 512.0;
-		unit = unit_adjust(&bytes);
+		char buf[FMT_SCALED_STRSIZE];
 
-		if (bytes == 0)
-			(void)printf("0B\t%s\n", path);
-		else if (bytes > 10)
-			(void)printf("%.0f%c\t%s\n", bytes, "BKMGTPE"[unit], path);
+		if (fmt_scaled(size * 512, buf) == 0)
+			(void)printf("%s\t%s\n", buf, path);
 		else
-			(void)printf("%.1f%c\t%s\n", bytes, "BKMGTPE"[unit], path);
+			(void)printf("%lld\t%s\n", (long long)size, path);
 	}
 }