Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/external/historical/nawk/dist PR/53885: Martijn Dekker: Add ...



details:   https://anonhg.NetBSD.org/src/rev/d8880e93c085
branches:  trunk
changeset: 996073:d8880e93c085
user:      christos <christos%NetBSD.org@localhost>
date:      Sat Jan 19 00:37:41 2019 +0000

description:
PR/53885: Martijn Dekker: Add ERE support from
https://opensource.apple.com/tarballs/awk/awk-24.tar.gz

diffstat:

 external/historical/nawk/dist/b.c |  244 +++++++++++++++++++++++++++++++++++++-
 1 files changed, 242 insertions(+), 2 deletions(-)

diffs (truncated from 344 to 300 lines):

diff -r a423b2ae8582 -r d8880e93c085 external/historical/nawk/dist/b.c
--- a/external/historical/nawk/dist/b.c Sat Jan 19 00:16:43 2019 +0000
+++ b/external/historical/nawk/dist/b.c Sat Jan 19 00:37:41 2019 +0000
@@ -70,6 +70,11 @@
 static const uschar    *rlxstr;
 static const uschar    *prestr;        /* current position in current re */
 static const uschar    *lastre;        /* origin of last re */
+static const uschar    *lastatom;      /* origin of last Atom */
+static const uschar    *starttok;
+static const uschar    *basestr;       /* starts with original, replaced during
+                                          repetition processing */
+static const uschar    *firstbasestr;
 
 static int setcnt;
 static int poscnt;
@@ -176,6 +181,8 @@
        Node *p, *p1;
        fa *f;
 
+       firstbasestr = s;
+       basestr = firstbasestr;
        p = reparse(s);
        p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
                /* put ALL STAR in front of reg.  exp. */
@@ -198,6 +205,10 @@
        f->initstat = makeinit(f, anchor);
        f->anchor = anchor;
        f->restr = (uschar *) tostring(s);
+       if (firstbasestr != basestr) {
+               if (basestr)
+                       free(__UNCONST(basestr));
+       }
        return f;
 }
 
@@ -740,9 +751,11 @@
 Node *primary(void)
 {
        Node *np;
+       int savelastatom;
 
        switch (rtok) {
        case CHAR:
+               lastatom = starttok;
                np = op2(CHAR, NIL, itonp(rlxval));
                rtok = relex();
                return (unary(np));
@@ -751,16 +764,19 @@
                return (unary(op2(ALL, NIL, NIL)));
        case EMPTYRE:
                rtok = relex();
-               return (unary(op2(ALL, NIL, NIL)));
+               return (unary(op2(EMPTYRE, NIL, NIL)));
        case DOT:
+               lastatom = starttok;
                rtok = relex();
                return (unary(op2(DOT, NIL, NIL)));
        case CCL:
                np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
+               lastatom = starttok;
                rtok = relex();
                return (unary(np));
        case NCCL:
                np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
+               lastatom = starttok;
                rtok = relex();
                return (unary(np));
        case '^':
@@ -770,6 +786,8 @@
                rtok = relex();
                return (unary(op2(CHAR, NIL, NIL)));
        case '(':
+               lastatom = starttok;
+               savelastatom = starttok - basestr; /* Retain over recursion */
                rtok = relex();
                if (rtok == ')') {      /* special pleading for () */
                        rtok = relex();
@@ -777,6 +795,7 @@
                }
                np = regexp();
                if (rtok == ')') {
+                       lastatom = basestr + savelastatom; /* Restore */
                        rtok = relex();
                        return (unary(np));
                }
@@ -791,8 +810,12 @@
 Node *concat(Node *np)
 {
        switch (rtok) {
-       case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
+       case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
                return (concat(op2(CAT, np, primary())));
+       case EMPTYRE:
+               rtok = relex();
+               return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+                               primary())));
        }
        return (np);
 }
@@ -869,6 +892,115 @@
        { NULL,         0,      NULL },
 };
 
+#define REPEAT_SIMPLE          0       /* {n}, n >= 1: plain copies of the atom */
+#define REPEAT_PLUS_APPENDED   1       /* {n,}, n >= 2: copies followed by + */
+#define REPEAT_WITH_Q          2       /* {n,m}, n < m: copies, then x? reps */
+#define REPEAT_ZERO            3       /* {0}: atom replaced by () */
+
+/*
+ * Rebuild basestr with the bound at reptok (reptoklen bytes) expanded into
+ * an equivalent ERE made of copies of atom (atomlen bytes).  The atom in
+ * front of the bound is kept, except for REPEAT_ZERO where "()" replaces
+ * it.  The old basestr is freed unless it is firstbasestr; prestr is left
+ * at the replacement text.  Returns 2 for REPEAT_ZERO, otherwise 1.
+ */
+static int
+replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+              int atomlen, int firstnum, int secondnum, int special_case)
+{
+       int i, j;
+       uschar *buf;
+       int ret = 1;
+       int init_q = (firstnum == 0);           /* existing atom gets a ? */
+       int n_q_reps = secondnum - firstnum;    /* number of optional copies */
+       int prefix_length = reptok - basestr;   /* prefix includes first rep */
+       int suffix_length = strlen((const char *) reptok) - reptoklen;
+       int size = prefix_length + suffix_length;
+
+       if (firstnum > 1)       /* add room for reps 2 through firstnum */
+               size += atomlen * (firstnum - 1);
+
+       if (special_case == REPEAT_PLUS_APPENDED) {
+               size++;         /* for the final + */
+       } else if (special_case == REPEAT_WITH_Q) {
+               /* for {0,m} the atom in the prefix is one of the m copies */
+               size += init_q + (atomlen + 1) * (n_q_reps - init_q);
+       } else if (special_case == REPEAT_ZERO) {
+               size += 2;      /* just a null ERE: () */
+       }
+       if ((buf = (uschar *) malloc(size + 1)) == NULL)
+               FATAL("out of space in reg expr %.10s..", lastre);
+       memcpy(buf, basestr, prefix_length);    /* copy prefix */
+       j = prefix_length;
+       if (special_case == REPEAT_ZERO) {      /* overwrite the atom with () */
+               j -= atomlen;
+               buf[j++] = '(';
+               buf[j++] = ')';
+       }
+       for (i = 1; i < firstnum; i++) {        /* copy x reps */
+               memcpy(&buf[j], atom, atomlen);
+               j += atomlen;
+       }
+       if (special_case == REPEAT_PLUS_APPENDED) {
+               buf[j++] = '+';
+       } else if (special_case == REPEAT_WITH_Q) {
+               if (init_q) buf[j++] = '?';
+               /* start at init_q: x{0,m} is x? plus m-1 more x?, not m */
+               for (i = init_q; i < n_q_reps; i++) {   /* copy x? reps */
+                       memcpy(&buf[j], atom, atomlen);
+                       j += atomlen;
+                       buf[j++] = '?';
+               }
+       }
+       memcpy(&buf[j], reptok + reptoklen, suffix_length);
+       buf[j + suffix_length] = '\0';  /* j tracks the real length in all cases */
+       if (firstbasestr != basestr)    /* never free the caller's string */
+               free(__UNCONST(basestr));
+       basestr = buf;
+       prestr = buf + prefix_length;
+       if (special_case == REPEAT_ZERO) {
+               prestr -= atomlen;
+               ret++;
+       }
+       return ret;
+}
+
+/*
+ * Expand the bound {firstnum,secondnum} that follows atom into an
+ * equivalent ERE by delegating to replace_repeat().  secondnum < 0 means
+ * the bound is open-ended ({n,}).  Returns what replace_repeat() returns,
+ * or 0 when nothing was rewritten: {0,} and {1,} (expected to be handled
+ * before this is called) and the invalid firstnum > secondnum.
+ */
+static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+                 int atomlen, int firstnum, int secondnum)
+{
+       int how;
+
+       if (secondnum < 0) {
+               /* {n,}: n-1 extra copies of the atom, then a PLUS */
+               if (firstnum < 2)
+                       return 0;       /* 0 or 1: dealt with elsewhere */
+               how = REPEAT_PLUS_APPENDED;
+       } else if (firstnum > secondnum) {
+               return 0;               /* n > m: should not get here */
+       } else if (firstnum < secondnum) {
+               /* {n,m}: x{n,m} => xx...x followed by x?x?...x? */
+               how = REPEAT_WITH_Q;
+       } else if (firstnum == 0) {
+               /*
+                * {0} or {0,0}: unusual, since the replacement string can
+                * end up shorter than the original ERE.
+                */
+               how = REPEAT_ZERO;
+       } else {
+               /* {n} or {n,n} with n >= 1: n-1 extra copies of the atom */
+               how = REPEAT_SIMPLE;
+       }
+
+       return replace_repeat(reptok, reptoklen, atom, atomlen,
+                       firstnum, secondnum, how);
+}
 
 int relex(void)                /* lexical analyzer for reparse */
 {
@@ -879,6 +1011,11 @@
        uschar *bp;
        const struct charclass *cc;
        int i;
+       int num, m, commafound, digitfound;
+       const uschar *startreptok;
+
+rescan:
+       starttok = prestr;
 
        switch (c = *prestr++) {
        case '|': return OR;
@@ -937,6 +1074,40 @@
                                        }
                                } else
                                        *bp++ = c;
+                       } else if (c == '[' && *prestr == '.') {
+                               char collate_char;
+                               prestr++;
+                               collate_char = *prestr++;
+                               if (*prestr == '.' && prestr[1] == ']') {
+                                       prestr += 2;
+                                       /* Found it: map via locale TBD: for
+                                          now, simply return this char.  This
+                                          is sufficient to pass conformance
+                                          test awk.ex 156
+                                        */
+                                       if (*prestr == ']') {
+                                               prestr++;
+                                               rlxval = collate_char;
+                                               return CHAR;
+                                       }
+                               }
+                       } else if (c == '[' && *prestr == '=') {
+                               char equiv_char;
+                               prestr++;
+                               equiv_char = *prestr++;
+                               if (*prestr == '=' && prestr[1] == ']') {
+                                       prestr += 2;
+                                       /* Found it: map via locale TBD: for now
+                                          simply return this char. This is 
+                                          sufficient to pass conformance test
+                                          awk.ex 156
+                                        */
+                                       if (*prestr == ']') {
+                                               prestr++;
+                                               rlxval = equiv_char;
+                                               return CHAR;
+                                       }
+                               }
                        } else if (c == '\0') {
                                FATAL("nonterminated character class %.20s", lastre);
                        } else if (bp == buf) { /* 1st char is special */
@@ -951,6 +1122,75 @@
                        } else
                                *bp++ = c;
                }
+               break;
+       case '{': 
+               if (isdigit(*(prestr))) {
+                       num = 0;        /* Process as a repetition */
+                       n = -1; m = -1;
+                       commafound = 0;
+                       digitfound = 0;
+                       startreptok = prestr-1;
+                       /* Remember start of previous atom here ? */
+               } else {                /* just a { char, not a repetition */
+                       rlxval = c;
+                       return CHAR;
+                }
+               for (; ; ) {
+                       if ((c = *prestr++) == '}') {
+                               if (commafound) {
+                                       if (digitfound) { /* {n,m} */
+                                               m = num;
+                                               if (m<n)
+                                                       FATAL("illegal repetition expression: class %.20s",
+                                                               lastre);
+                                               if ((n==0) && (m==1)) {
+                                                       return QUEST;
+                                               }
+                                       } else {        /* {n,} */
+                                               if (n==0) return STAR;
+                                               if (n==1) return PLUS;
+                                       }



Home | Main Index | Thread Index | Old Index