Subject: Re: bug in NetBSD and OpenBSD awk
To: None <mason@primenet.com.au>
From: Aleksey Cheusov <cheusov@tut.by>
List: tech-userlevel
Date: 06/22/2006 22:38:31
--=-=-=
> Aleksey Cheusov <cheusov@tut.by> typed:
> : NetBSD 3-stable and OpenBSD-3.8 /usr/bin/awk work incorrectly.
> Seems to have problems near the end of the pattern. The input was 145 chars.
> Taking off the EOL match $ :
> This is worth a PR.
I've sent PR more than month ago, see PR bin/33392.
It seems that BSD awk has limitation on a number of states in DFA.
An attachment file contains fix for this.
There is one important side-effect of this patch:
awk regexp works much faster (more than 50(!!!) times in some of my cases).
A few notes about patch:
- tables and arrays are resized automatically as needed
- "reset" member with appropriate code is removed because
the only place where it is set to 1 is removed too together with NSTATES
define.
I assume this bug is very serious and really needs to be fixed ASAP.
--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment; filename=awk-big-regexp.patch
Content-Description: patch for awk
Index: b.c
===================================================================
RCS file: /cvsroot/src/dist/nawk/b.c,v
retrieving revision 1.5
diff -u -u -r1.5 b.c
--- b.c 26 Oct 2003 13:39:38 -0000 1.5
+++ b.c 22 Jun 2006 19:11:28 -0000
@@ -30,6 +30,8 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
+#include <assert.h>
+
#include "awk.h"
#include "awkgram.h"
@@ -54,6 +56,23 @@
parent contains pointer to parent
*/
+#define REALLOC(ptr, items) \
+ ptr = realloc (ptr, items * sizeof (*ptr));
+
+#define ACCESS_STATE(fa, state) \
+ if (state >= fa->state_count) { \
+ int i; \
+ int new_count = state * 5 / 4 + 10; /* needs to be tuned */ \
+ REALLOC (fa->gototab, new_count); \
+ REALLOC (fa->out, new_count); \
+ REALLOC (fa->posns, new_count); \
+ for (i = fa->state_count; i < new_count; ++i){ \
+ fa->gototab [i] = calloc (1, NCHARS * sizeof (*fa->gototab)); \
+ fa->out [i] = 0; \
+ fa->posns [i] = NULL; \
+ } \
+ fa->state_count = new_count; \
+ }
int *setvec;
int *tmpset;
@@ -136,6 +155,9 @@
f->accept = poscnt-1; /* penter has computed number of positions in re */
cfoll(f, p1); /* set up follow sets */
freetr(p1);
+
+ ACCESS_STATE (f, 1);
+
if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
overflo("out of space in makedfa");
if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
@@ -151,9 +173,10 @@
{
int i, k;
+ ACCESS_STATE (f, 2);
+
f->curstat = 2;
f->out[2] = 0;
- f->reset = 0;
k = *(f->re[0].lfollow);
xfree(f->posns[2]);
if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
@@ -173,8 +196,10 @@
}
f->out[0] = f->out[2];
- if (f->curstat != 2)
+ if (f->curstat != 2){
+ ACCESS_STATE (f, f->curstat);
--(*f->posns[f->curstat]);
+ }
}
return f->curstat;
}
@@ -461,7 +486,10 @@
int s, ns;
uschar *p = (uschar *) p0;
- s = f->reset ? makeinit(f,0) : f->initstat;
+ s = f->initstat;
+
+ assert (s < f->state_count);
+
if (f->out[s])
return(1);
do {
@@ -469,6 +497,9 @@
s = ns;
else
s = cgoto(f, s, *p);
+
+ assert (s < f->state_count);
+
if (f->out[s])
return(1);
} while (*p++ != 0);
@@ -480,9 +511,11 @@
int s, ns;
uschar *p = (uschar *) p0;
uschar *q;
- int i, k;
- s = f->reset ? makeinit(f,1) : f->initstat;
+ s = f->initstat;
+
+ assert(s < f->state_count);
+
patbeg = p;
patlen = -1;
do {
@@ -494,6 +527,9 @@
s = ns;
else
s = cgoto(f, s, *q);
+
+ assert(s < f->state_count);
+
if (s == 1) { /* no transition */
if (patlen >= 0) {
patbeg = (char *) p;
@@ -511,19 +547,6 @@
}
nextin:
s = 2;
- if (f->reset) {
- for (i = 2; i <= f->curstat; i++)
- xfree(f->posns[i]);
- k = *f->posns[0];
- if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
- overflo("out of space in pmatch");
- for (i = 0; i <= k; i++)
- (f->posns[2])[i] = (f->posns[0])[i];
- f->initstat = f->curstat = 2;
- f->out[2] = f->out[0];
- for (i = 0; i < NCHARS; i++)
- f->gototab[2][i] = 0;
- }
} while (*p++ != 0);
return (0);
}
@@ -533,9 +556,11 @@
int s, ns;
uschar *p = (uschar *) p0;
uschar *q;
- int i, k;
- s = f->reset ? makeinit(f,1) : f->initstat;
+ s = f->initstat;
+
+ assert(s < f->state_count);
+
patlen = -1;
while (*p) {
q = p;
@@ -546,6 +571,9 @@
s = ns;
else
s = cgoto(f, s, *q);
+
+ assert(s < f->state_count);
+
if (s == 1) { /* no transition */
if (patlen > 0) {
patbeg = (char *) p;
@@ -562,19 +590,6 @@
}
nnextin:
s = 2;
- if (f->reset) {
- for (i = 2; i <= f->curstat; i++)
- xfree(f->posns[i]);
- k = *f->posns[0];
- if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
- overflo("out of state space");
- for (i = 0; i <= k; i++)
- (f->posns[2])[i] = (f->posns[0])[i];
- f->initstat = f->curstat = 2;
- f->out[2] = f->out[0];
- for (i = 0; i < NCHARS; i++)
- f->gototab[2][i] = 0;
- }
p++;
}
return (0);
@@ -835,6 +850,7 @@
setvec[i] = 0;
setcnt = 0;
/* compute positions of gototab[s,c] into setvec */
+ ACCESS_STATE (f, s);
p = f->posns[s];
for (i = 1; i <= *p; i++) {
if ((k = f->re[p[i]].ltype) != FINAL) {
@@ -867,6 +883,10 @@
if (setvec[i]) {
tmpset[j++] = i;
}
+
+ ACCESS_STATE (f, f->curstat);
+ ACCESS_STATE (f, s);
+
/* tmpset == previous state? */
for (i = 1; i <= f->curstat; i++) {
p = f->posns[i];
@@ -882,13 +902,9 @@
}
/* add tmpset to current set of states */
- if (f->curstat >= NSTATES-1) {
- f->curstat = 2;
- f->reset = 1;
- for (i = 2; i < NSTATES; i++)
- xfree(f->posns[i]);
- } else
- ++(f->curstat);
+ ++(f->curstat);
+ ACCESS_STATE (f, f->curstat);
+
for (i = 0; i < NCHARS; i++)
f->gototab[f->curstat][i] = 0;
xfree(f->posns[f->curstat]);
@@ -913,13 +929,18 @@
if (f == NULL)
return;
- for (i = 0; i <= f->curstat; i++)
+ for (i = 0; i <= f->state_count; i++){
+ xfree(f->gototab[i]);
xfree(f->posns[i]);
+ }
for (i = 0; i <= f->accept; i++) {
xfree(f->re[i].lfollow);
if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
xfree((f->re[i].lval.np));
}
xfree(f->restr);
+ xfree(f->out);
+ xfree(f->posns);
+ xfree(f->gototab);
xfree(f);
}
Index: awk.h
===================================================================
RCS file: /cvsroot/src/dist/nawk/awk.h,v
retrieving revision 1.4
diff -u -u -r1.4 awk.h
--- awk.h 26 Oct 2003 11:34:23 -0000 1.4
+++ awk.h 22 Jun 2006 19:11:28 -0000
@@ -205,7 +205,6 @@
#define NCHARS (256+1) /* 256 handles 8-bit chars; 128 does 7-bit */
/* watch out in match(), etc. */
-#define NSTATES 32
typedef struct rrow {
long ltype; /* long avoids pointer warnings on 64-bit */
@@ -218,16 +217,16 @@
} rrow;
typedef struct fa {
- uschar gototab[NSTATES][NCHARS];
- uschar out[NSTATES];
- uschar *restr;
- int *posns[NSTATES];
+ int **gototab;
+ uschar *out;
+ uschar *restr;
+ int **posns;
+ int state_count;
int anchor;
int use;
int initstat;
int curstat;
int accept;
- int reset;
struct rrow re[1]; /* variable: actual size set by calling malloc */
} fa;
--=-=-=
--
Best regards, Aleksey Cheusov.
--=-=-=--