Subject: Re: bug in NetBSD and OpenBSD awk
To: None <mason@primenet.com.au>
From: Aleksey Cheusov <cheusov@tut.by>
List: tech-userlevel
Date: 06/22/2006 22:38:31
--=-=-=

> Aleksey Cheusov <cheusov@tut.by> typed:
> : NetBSD 3-stable and OpenBSD-3.8 /usr/bin/awk work incorrectly.

> Seems to have problems near the end of the pattern.  The input was 145 chars.
> Taking off the EOL match $ :

> This is worth a PR.
I've sent PR more than month ago, see PR bin/33392.
It seems that BSD awk has limitation on a number of states in DFA.

An attachment file contains fix for this.
There is one important side-effect of this patch:
awk regexp works much faster (more than 50(!!!) times in some of my cases).

A few notes about patch:
- tables and arrays are resized automatically as needed
- "reset" member with appropriate code is removed because
  the only place where it is set to 1 is removed too together with NSTATES
  define.

I assume this bug is very serious and really needs to be fixed ASAP.


--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment; filename=awk-big-regexp.patch
Content-Description: patch for awk

Index: b.c
===================================================================
RCS file: /cvsroot/src/dist/nawk/b.c,v
retrieving revision 1.5
diff -u -u -r1.5 b.c
--- b.c	26 Oct 2003 13:39:38 -0000	1.5
+++ b.c	22 Jun 2006 19:11:28 -0000
@@ -30,6 +30,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <assert.h>
+
 #include "awk.h"
 #include "awkgram.h"
 
@@ -54,6 +56,23 @@
 	parent contains pointer to parent
 */
 
+#define REALLOC(ptr, items) \
+   ptr = realloc (ptr, items * sizeof (*ptr));
+
+#define ACCESS_STATE(fa, state)                                   \
+   if (state >= fa->state_count) {                                \
+      int i;                                                      \
+      int new_count = state * 5 / 4 + 10; /* needs to be tuned */ \
+      REALLOC (fa->gototab, new_count);                           \
+      REALLOC (fa->out, new_count);                               \
+      REALLOC (fa->posns, new_count);                             \
+      for (i = fa->state_count; i < new_count; ++i){              \
+		 fa->gototab [i] = calloc (1, NCHARS * sizeof (*fa->gototab)); \
+		 fa->out [i]   = 0;                                       \
+		 fa->posns [i] = NULL;                                    \
+      }                                                           \
+      fa->state_count = new_count;                                \
+   }
 
 int	*setvec;
 int	*tmpset;
@@ -136,6 +155,9 @@
 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
 	cfoll(f, p1);	/* set up follow sets */
 	freetr(p1);
+
+	ACCESS_STATE (f, 1);
+
 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
 			overflo("out of space in makedfa");
 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
@@ -151,9 +173,10 @@
 {
 	int i, k;
 
+	ACCESS_STATE (f, 2);
+
 	f->curstat = 2;
 	f->out[2] = 0;
-	f->reset = 0;
 	k = *(f->re[0].lfollow);
 	xfree(f->posns[2]);			
 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
@@ -173,8 +196,10 @@
 		}
 
 		f->out[0] = f->out[2];
-		if (f->curstat != 2)
+		if (f->curstat != 2){
+			ACCESS_STATE (f, f->curstat);
 			--(*f->posns[f->curstat]);
+		}
 	}
 	return f->curstat;
 }
@@ -461,7 +486,10 @@
 	int s, ns;
 	uschar *p = (uschar *) p0;
 
-	s = f->reset ? makeinit(f,0) : f->initstat;
+	s = f->initstat;
+
+	assert (s < f->state_count);
+
 	if (f->out[s])
 		return(1);
 	do {
@@ -469,6 +497,9 @@
 			s = ns;
 		else
 			s = cgoto(f, s, *p);
+
+		assert (s < f->state_count);
+
 		if (f->out[s])
 			return(1);
 	} while (*p++ != 0);
@@ -480,9 +511,11 @@
 	int s, ns;
 	uschar *p = (uschar *) p0;
 	uschar *q;
-	int i, k;
 
-	s = f->reset ? makeinit(f,1) : f->initstat;
+	s = f->initstat;
+
+	assert(s < f->state_count);
+
 	patbeg = p;
 	patlen = -1;
 	do {
@@ -494,6 +527,9 @@
 				s = ns;
 			else
 				s = cgoto(f, s, *q);
+
+			assert(s < f->state_count);
+
 			if (s == 1) {	/* no transition */
 				if (patlen >= 0) {
 					patbeg = (char *) p;
@@ -511,19 +547,6 @@
 		}
 	nextin:
 		s = 2;
-		if (f->reset) {
-			for (i = 2; i <= f->curstat; i++)
-				xfree(f->posns[i]);
-			k = *f->posns[0];			
-			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
-				overflo("out of space in pmatch");
-			for (i = 0; i <= k; i++)
-				(f->posns[2])[i] = (f->posns[0])[i];
-			f->initstat = f->curstat = 2;
-			f->out[2] = f->out[0];
-			for (i = 0; i < NCHARS; i++)
-				f->gototab[2][i] = 0;
-		}
 	} while (*p++ != 0);
 	return (0);
 }
@@ -533,9 +556,11 @@
 	int s, ns;
 	uschar *p = (uschar *) p0;
 	uschar *q;
-	int i, k;
 
-	s = f->reset ? makeinit(f,1) : f->initstat;
+	s = f->initstat;
+
+	assert(s < f->state_count); 
+
 	patlen = -1;
 	while (*p) {
 		q = p;
@@ -546,6 +571,9 @@
 				s = ns;
 			else
 				s = cgoto(f, s, *q);
+
+			assert(s < f->state_count);
+
 			if (s == 1) {	/* no transition */
 				if (patlen > 0) {
 					patbeg = (char *) p;
@@ -562,19 +590,6 @@
 		}
 	nnextin:
 		s = 2;
-		if (f->reset) {
-			for (i = 2; i <= f->curstat; i++)
-				xfree(f->posns[i]);
-			k = *f->posns[0];			
-			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
-				overflo("out of state space");
-			for (i = 0; i <= k; i++)
-				(f->posns[2])[i] = (f->posns[0])[i];
-			f->initstat = f->curstat = 2;
-			f->out[2] = f->out[0];
-			for (i = 0; i < NCHARS; i++)
-				f->gototab[2][i] = 0;
-		}
 		p++;
 	}
 	return (0);
@@ -835,6 +850,7 @@
 		setvec[i] = 0;
 	setcnt = 0;
 	/* compute positions of gototab[s,c] into setvec */
+	ACCESS_STATE (f, s);
 	p = f->posns[s];
 	for (i = 1; i <= *p; i++) {
 		if ((k = f->re[p[i]].ltype) != FINAL) {
@@ -867,6 +883,10 @@
 		if (setvec[i]) {
 			tmpset[j++] = i;
 		}
+
+	ACCESS_STATE (f, f->curstat);
+	ACCESS_STATE (f, s);
+
 	/* tmpset == previous state? */
 	for (i = 1; i <= f->curstat; i++) {
 		p = f->posns[i];
@@ -882,13 +902,9 @@
 	}
 
 	/* add tmpset to current set of states */
-	if (f->curstat >= NSTATES-1) {
-		f->curstat = 2;
-		f->reset = 1;
-		for (i = 2; i < NSTATES; i++)
-			xfree(f->posns[i]);
-	} else
-		++(f->curstat);
+	++(f->curstat);
+	ACCESS_STATE (f, f->curstat);
+
 	for (i = 0; i < NCHARS; i++)
 		f->gototab[f->curstat][i] = 0;
 	xfree(f->posns[f->curstat]);
@@ -913,13 +929,18 @@
 
 	if (f == NULL)
 		return;
-	for (i = 0; i <= f->curstat; i++)
+	for (i = 0; i <= f->state_count; i++){
+		xfree(f->gototab[i]);
 		xfree(f->posns[i]);
+	}
 	for (i = 0; i <= f->accept; i++) {
 		xfree(f->re[i].lfollow);
 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
 			xfree((f->re[i].lval.np));
 	}
 	xfree(f->restr);
+	xfree(f->out);
+	xfree(f->posns);
+	xfree(f->gototab);
 	xfree(f);
 }
Index: awk.h
===================================================================
RCS file: /cvsroot/src/dist/nawk/awk.h,v
retrieving revision 1.4
diff -u -u -r1.4 awk.h
--- awk.h	26 Oct 2003 11:34:23 -0000	1.4
+++ awk.h	22 Jun 2006 19:11:28 -0000
@@ -205,7 +205,6 @@
 
 #define NCHARS	(256+1)		/* 256 handles 8-bit chars; 128 does 7-bit */
 				/* watch out in match(), etc. */
-#define NSTATES	32
 
 typedef struct rrow {
 	long	ltype;	/* long avoids pointer warnings on 64-bit */
@@ -218,16 +217,16 @@
 } rrow;
 
 typedef struct fa {
-	uschar	gototab[NSTATES][NCHARS];
-	uschar	out[NSTATES];
-	uschar	*restr;
-	int	*posns[NSTATES];
+	int **gototab;
+	uschar *out;
+	uschar *restr;
+	int	**posns;
+	int state_count;
 	int	anchor;
 	int	use;
 	int	initstat;
 	int	curstat;
 	int	accept;
-	int	reset;
 	struct	rrow re[1];	/* variable: actual size set by calling malloc */
 } fa;
 

--=-=-=


-- 
Best regards, Aleksey Cheusov.

--=-=-=--