[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: gdiff Re: CVS: cvs.openbsd.org: src



On Mon, 7 Jun 2004 05:35:48 -0400 (EDT)
Ted Unangst <tedu@zeitbombe.org> wrote:

> > is there any reason why we still depend on gdiff?
> > what is wrong with diff(1)?
> 
> still lacks --ignore-matching-lines=regex functionality.  and the 
> different algo means every patch will change, but that's one time pain.

Perhaps a new port Makefile variable can be added, something that
denotes that the port can rely on the diff in the tree instead of
gdiff?

Or is diff -I (ignore matching lines) support imminent? Here's a patch
against diff to add basic support.  The method for doing so is not
very efficient, but it mostly only affects diff when using -I arguments.
The alternative approaches I tried I couldn't get to work (such as
reserving a bit in the hash value or adding another member to the line
structure, which wouldn't be very different from the approach I ended up
choosing anyway).

With this patch, a lot of examples I threw at diff with -I patterns
match the output of gdiff, but I didn't try it on my own ports tree
yet.  That would be the ultimate test, right? :)

There are other areas for improvement (such as concatenating all the -I
patterns together with `\|' spliced in between, so only one regcomp()
needs to be called), but since none I've discovered affects the
bottleneck of the program, I chose to ignore them for now.

Suggestions welcome.

Index: diff.1
===================================================================
RCS file: /cvs/src/usr.bin/diff/diff.1,v
retrieving revision 1.25
diff -u -r1.25 diff.1
--- diff.1	16 Mar 2004 00:40:34 -0000	1.25
+++ diff.1	12 Jun 2004 13:00:17 -0000
@@ -38,31 +38,36 @@
 .Sh SYNOPSIS
 .Nm diff
 .Op Fl abdilpqtTw
+.Op Fl I Ar pattern
 .Oo
-.Fl c | Fl e | Fl f |
-.Fl n | Fl u
+.Fl c | e | f |
+.Fl n | u
 .Oc
 .Op Fl L Ar label
 .Ar file1 file2
 .Nm diff
 .Op Fl abdilpqtTw
+.Op Fl I Ar pattern
 .Op Fl L Ar label
 .Fl C Ar number
 .Ar file1 file2
 .Nm diff
 .Op Fl abdilqtw
+.Op Fl I Ar pattern
 .Fl D Ar string
 .Ar file1 file2
 .Nm diff
 .Op Fl abdilpqtTw
+.Op Fl I Ar pattern
 .Op Fl L Ar label
 .Fl U Ar number
 .Ar file1 file2
 .Nm diff
 .Op Fl abdilNPpqtTw
+.Op Fl I Ar pattern
 .Oo
-.Fl c | Fl e | Fl f |
-.Fl n | Fl u
+.Fl c | e | f |
+.Fl n | u
 .Oc
 .Bk -words
 .Op Fl L Ar label
@@ -97,7 +102,7 @@
 The lines removed from
 .Ar file1
 are marked with
-.Sq \-\ \& ;
+.Sq \&-\ \& ;
 those added to
 .Ar file2
 are marked
@@ -179,7 +184,8 @@
 Comparison options:
 .Bl -tag -width Ds
 .It Fl a
-Treat all files as ASCII.
+Treat all files as
+.Tn ASCII .
 .It Fl b
 Causes trailing blanks (spaces and tabs) to be ignored, and other
 strings of blanks to compare equal.
@@ -187,6 +193,16 @@
 Try very hard to produce a diff as small as possible.
 This may consume a lot of processing power and memory when processing
 large files with many changes.
+.It Fl I Ar pattern
+Ignores changes whose lines match
+.Ar pattern .
+All lines in the change must match for the change to be ignored.
+Multiple
+.Fl I
+patterns may be specified.
+See
+.Xr re_format 7
+for more information on the syntax of regular expression patterns.
 .It Fl i
 Ignores the case of letters.
 E.g.,
@@ -285,8 +301,9 @@
 common subdirectories, and files which appear in only one directory
 are described as such.
 In directory mode only regular files and directories are compared.
-If a non-regular file such as a device special file or FIFO is
-encountered, a diagnostic message is printed.
+If a non-regular file such as a device special file or
+.Tn FIFO
+is encountered, a diagnostic message is printed.
 .Pp
 If only one of
 .Ar file1
@@ -372,11 +389,11 @@
 .Va ZZ .
 .It Li XX,YY Ns Ic c Ns Li ZZ,QQ
 Replace the range
-.Va XX , Ns YY
+.Va XX , Ns Va YY
 from
 .Ar file1
 with the range
-.Va ZZ , Ns QQ
+.Va ZZ , Ns Va QQ
 from
 .Ar file2 .
 .El
@@ -418,7 +435,7 @@
 .El
 .Sh FILES
 .Bl -tag -width /tmp/diff.XXXXXXXX -compact
-.It Pa /tmp/diff.XXXXXXXX
+.It Pa /tmp/diff. Ns Ar XXXXXXXX
 Temporary file used when comparing a device or the standard input.
 Note that the temporary file is unlinked as soon as it is created
 so it will not show up in a directory listing.
@@ -442,7 +459,8 @@
 .Xr diff3 1 ,
 .Xr ed 1 ,
 .Xr pr 1 ,
-.Xr fnmatch 3
+.Xr fnmatch 3 ,
+.Xr re_format 7
 .Sh STANDARDS
 The
 .Nm
Index: diff.c
===================================================================
RCS file: /cvs/src/usr.bin/diff/diff.c,v
retrieving revision 1.45
diff -u -r1.45 diff.c
--- diff.c	16 Mar 2004 00:40:34 -0000	1.45
+++ diff.c	12 Jun 2004 13:00:18 -0000
@@ -46,8 +46,9 @@
 char	*start, *ifdefname, *diffargs, *label;
 struct stat stb1, stb2;
 struct excludes *excludes_list;
+struct ignore_res *ignore_res_list;
 
-#define	OPTIONS	"0123456789abC:cdD:efhiL:lnNPpqrS:sTtU:uwX:x:"
+#define	OPTIONS	"0123456789abC:cdD:efhI:iL:lnNPpqrS:sTtU:uwX:x:"
 static struct option longopts[] = {
 	{ "text",			no_argument,		0,	'a' },
 	{ "ignore-space-change",	no_argument,		0,	'b' },
@@ -56,6 +57,7 @@
 	{ "minimal",			no_argument,		0,	'd' },
 	{ "ed",				no_argument,		0,	'e' },
 	{ "forward-ed",			no_argument,		0,	'f' },
+	{ "ignore-matching-lines",	required_argument,	0,	'I' },
 	{ "ignore-case",		no_argument,		0,	'i' },
 	{ "paginate",			no_argument,		0,	'l' },
 	{ "label",			required_argument,	0,	'L' },
@@ -78,6 +80,7 @@
 
 __dead void usage(void);
 void push_excludes(char *);
+void push_ignore_res(char *);
 void read_excludes_file(char *file);
 void set_argstr(char **, char **);
 
@@ -139,6 +142,9 @@
 		case 'h':
 			/* silently ignore for backwards compatibility */
 			break;
+		case 'I':
+			push_ignore_res(optarg);
+			break;
 		case 'i':
 			iflag = 1;
 			break;
@@ -340,6 +346,24 @@
 	entry->pattern = pattern;
 	entry->next = excludes_list;
 	excludes_list = entry;
+}
+
+void
+push_ignore_res(char *pattern)
+{
+	struct ignore_res *entry;
+	regex_t re;
+	char buf[BUFSIZ];
+	int error;
+
+	if ((error = regcomp(&re, pattern, 0)) != 0) {
+		regerror(error, &re, buf, sizeof(buf));
+		err(2, "%s: %s", pattern, buf);
+	}
+	entry = emalloc(sizeof(*entry));
+	entry->re = re;
+	entry->next = ignore_res_list;
+	ignore_res_list = entry;
 }
 
 void
Index: diff.h
===================================================================
RCS file: /cvs/src/usr.bin/diff/diff.h,v
retrieving revision 1.27
diff -u -r1.27 diff.h
--- diff.h	16 Mar 2004 00:40:34 -0000	1.27
+++ diff.h	12 Jun 2004 13:00:18 -0000
@@ -31,6 +31,9 @@
  *	@(#)diff.h	8.1 (Berkeley) 6/6/93
  */
 
+#include <sys/types.h>
+#include <regex.h>
+
 /*
  * Output format options
  */
@@ -70,12 +73,18 @@
 	struct excludes *next;
 };
 
+struct ignore_res {
+	regex_t re;
+	struct ignore_res *next;
+};
+
 extern int	aflag, bflag, dflag, iflag, lflag, Nflag, Pflag, pflag, rflag,
 		sflag, tflag, Tflag, wflag;
 extern int	format, context, status;
 extern char	*start, *ifdefname, *diffargs, *label;
 extern struct	stat stb1, stb2;
 extern struct	excludes *excludes_list;
+extern struct	ignore_res *ignore_res_list;
 
 char	*splice(char *, char *);
 int	diffreg(char *, char *, int);
Index: diffreg.c
===================================================================
RCS file: /cvs/src/usr.bin/diff/diffreg.c,v
retrieving revision 1.55
diff -u -r1.55 diffreg.c
--- diffreg.c	7 Jan 2004 17:18:32 -0000	1.55
+++ diffreg.c	12 Jun 2004 13:00:19 -0000
@@ -163,6 +163,11 @@
 	int value;
 } *file[2];
 
+struct lbuf {
+	char *line;
+	int pos, max;
+};
+
 /*
  * The following struct is used to record change information when
  * doing a "context" or "unified" diff.  (see routine "change" to
@@ -185,6 +190,7 @@
 static int   pref, suff;	/* length of prefix and suffix */
 static int   slen[2];
 static int   anychange;
+static char *ignore[2];
 static long *ixnew;		/* will be overlaid on file[1] */
 static long *ixold;		/* will be overlaid on klist */
 static struct cand *clist;	/* merely a free storage pot for candidates */
@@ -221,11 +227,13 @@
 static int  skipline(FILE *);
 static int  isqrt(int);
 static int  stone(int *, int, int *, int *);
-static int  readhash(FILE *);
+static int  readhash(FILE *, int *);
 static int  files_differ(FILE *, FILE *, int);
 static __inline int min(int, int);
 static __inline int max(int, int);
 static char *match_function(const long *, int, FILE *);
+static void pushlbuf(struct lbuf *, int, int);
+static __inline int ignoreline(char *);
 
 
 /*
@@ -547,7 +555,8 @@
 prepare(int i, FILE *fd, off_t filesize)
 {
 	struct line *p;
-	int j, h;
+	char *ign = NULL;
+	int ignline, j, h;
 	size_t sz;
 
 	rewind(fd);
@@ -557,15 +566,22 @@
 		sz = 100;
 
 	p = emalloc((sz + 3) * sizeof(struct line));
-	for (j = 0; (h = readhash(fd));) {
+	if (ignore_res_list)
+		ign = emalloc((sz + 3) * sizeof(*ign));
+	for (j = 0; (h = readhash(fd, &ignline));) {
 		if (j == sz) {
 			sz = sz * 3 / 2;
 			p = erealloc(p, (sz + 3) * sizeof(struct line));
+			if (ignore_res_list)
+				ign = erealloc(ign, (sz + 3) * sizeof(*ign));
 		}
 		p[++j].value = h;
+		if (ignore_res_list)
+			ign[j] = ignline;
 	}
 	len[i] = j;
 	file[i] = p;
+	ignore[i] = ign;
 }
 
 static void
@@ -981,6 +997,54 @@
 restart:
 	if (format != D_IFDEF && a > b && c > d)
 		return;
+	if (ignore_res_list) {
+		int proceed;
+
+ 		if (b - a == d - c) {
+			/*
+			 * Change.  Try to follow GNU diff's
+			 * odd behavior.
+			 */
+			 proceed = 0;
+			 for (i = 0; i <= b - a; i++)
+			 	if (ignore[0][a + i] == 0 ||
+				    ignore[1][c + i] == 0) {
+					/*
+					 * Rats, a line wasn't matched.
+					 * That means we can't ignore it.
+					 */
+					proceed = 1;
+					break;
+				}
+			if (proceed == 0)
+				return;	
+		} else if (a == b) {
+			/*
+			 * Delete.  Just like change; all lines must
+			 * contain the pattern to ignore the delete.
+			 */
+			proceed = 0;
+			for (i = d; i <= c; i++)
+				if (ignore[1][i] == 0) {
+					proceed = 1;
+					break;
+				}
+			if (proceed == 0)
+				return;
+		} else if (c == d) {
+			/*
+			 * Insert.  Same as delete.
+			 */
+			proceed = 0;
+			for (i = b; i <= a; i++)
+				if (ignore[0][i] == 0) {
+					proceed = 1;
+					break;
+				}
+			if (proceed == 0)
+				return;
+		}
+	}
 	if (format == D_CONTEXT || format == D_UNIFIED) {
 		/*
 		 * Allocate change records as needed.
@@ -1164,11 +1228,17 @@
  * Hash function taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578.
  */
 static int
-readhash(FILE *f)
+readhash(FILE *f, int *ignore)
 {
 	int i, t, space;
 	int sum;
+	struct lbuf lbuf;
 
+	if (ignore_res_list) {
+		lbuf.line = NULL;
+		lbuf.max = lbuf.pos = -1;
+	}
+	
 	sum = 1;
 	space = 0;
 	if (!bflag && !wflag) {
@@ -1179,6 +1249,8 @@
 						return (0);
 					break;
 				}
+				if (ignore_res_list)
+					pushlbuf(&lbuf, 80, t);
 				sum = sum * 127 + chrtran[t];
 			}
 		else
@@ -1188,6 +1260,8 @@
 						return (0);
 					break;
 				}
+				if (ignore_res_list)
+					pushlbuf(&lbuf, 80, t);
 				sum = sum * 127 + t;
 			}
 	} else {
@@ -1202,6 +1276,8 @@
 					i++;
 					space = 0;
 				}
+				if (ignore_res_list)
+					pushlbuf(&lbuf, 80, t);
 				sum = sum * 127 + chrtran[t];
 				i++;
 				continue;
@@ -1215,11 +1291,37 @@
 			break;
 		}
 	}
+	if (ignore_res_list) {
+		pushlbuf(&lbuf, 1, '\0');
+		*ignore = ignoreline(lbuf.line);
+		free(lbuf.line);
+	}
 	/*
 	 * There is a remote possibility that we end up with a zero sum.
 	 * Zero is used as an EOF marker, so return 1 instead.
 	 */
 	return (sum == 0 ? 1 : sum);
+}
+
+static void
+pushlbuf(struct lbuf *lb, int inc, int c)
+{
+	if (++lb->pos >= lb->max) {
+		lb->max = max(lb->max, lb->pos) + inc;
+		lb->line = erealloc(lb->line, lb->max * sizeof(char));
+	}
+	lb->line[lb->pos] = c;
+}
+
+static int __inline
+ignoreline(char *line)
+{
+	struct ignore_res *rp;
+
+	for (rp = ignore_res_list; rp != NULL; rp = rp->next)
+		if (regexec(&rp->re, line, 0, NULL, 0) == 0)
+			return 1;
+	return 0;
 }
 
 static int