summaryrefslogtreecommitdiff
path: root/Src/pattern.c
diff options
context:
space:
mode:
Diffstat (limited to 'Src/pattern.c')
-rw-r--r--Src/pattern.c673
1 files changed, 484 insertions, 189 deletions
diff --git a/Src/pattern.c b/Src/pattern.c
index 13e6ca250..04452187d 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -47,6 +47,26 @@
*
* Eagle-eyed readers will notice this is an altered version. Incredibly
* sharp-eyed readers might even find bits that weren't altered.
+ *
+ *
+ * And I experienced a sense that, like certain regular
+ * expressions, seemed to match the day from beginning to end, so
+ * that I did not need to identify the parenthesised subexpression
+ * that told of dawn, nor the group of characters that indicated
+ * the moment when my grandfather returned home with news of
+ * Swann's departure for Paris; and the whole length of the month
+ * of May, as if matched by a closure, fitted into the buffer of my
+ * life with no sign of overflowing, turning the days, like a
+ * procession of insects that could consist of this or that
+ * species, into a random and unstructured repetition of different
+ * sequences, anchored from the first day of the month to the last
+ * in the same fashion as the weeks when I knew I would not see
+ * Gilberte and would search in vain for any occurrences of the
+ * string in the avenue of hawthorns by Tansonville, without my
+ * having to delimit explicitly the start or finish of the pattern.
+ *
+ * M. Proust, "In Search of Lost Files",
+ * bk I, "The Walk by Bourne's Place".
*/
#include "zsh.mdh"
@@ -78,7 +98,7 @@ typedef union upat *Upat;
#define P_EXCSYNC 0x01 /* no Test if following exclude already failed */
#define P_EXCEND 0x02 /* no Test if exclude matched orig branch */
#define P_BACK 0x03 /* no Match "", "next" ptr points backward. */
-#define P_EXACTLY 0x04 /* str Match this string. */
+#define P_EXACTLY 0x04 /* lstr Match this string. */
#define P_NOTHING 0x05 /* no Match empty string. */
#define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */
#define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */
@@ -103,10 +123,14 @@ typedef union upat *Upat;
/* spaces left for P_OPEN+n,... for backreferences */
#define P_OPEN 0x80 /* no Mark this point in input as start of n. */
#define P_CLOSE 0x90 /* no Analogous to OPEN. */
-/* zl is the range type zrange_t: may be zlong or unsigned long
- * char is a single char
- * uc* is a pointer to unsigned char, used at run time and initialised
+/*
+ * no no argument
+ * zr the range type zrange_t: may be zlong or unsigned long
+ * char a single char
+ * uc* a pointer to unsigned char, used at run time and initialised
* to NULL.
+ * str null-terminated, metafied string
+ * lstr length as long then string, not null-terminated, unmetafied.
*/
/*
@@ -179,16 +203,24 @@ typedef union upat *Upat;
#define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30)
#define P_NOTDOT(p) ((p)->l & 0x40)
+/* Specific to lstr type, i.e. P_EXACTLY. */
+#define P_LS_LEN(p) ((p)[1].l) /* can be used as lvalue */
+#define P_LS_STR(p) ((char *)((p) + 2))
+
/* Flags needed when pattern is executed */
#define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */
#define P_HSTART 0x02 /* Starts with # or ##'d pattern. */
#define P_PURESTR 0x04 /* Can be matched with a strcmp */
-/* Next character after one which may be a Meta (x is any char *) */
-#define METANEXT(x) (*(x) == Meta ? (x)+2 : (x)+1)
/*
* Increment pointer which may be on a Meta (x is a pointer variable),
* returning the incremented value (i.e. like pre-increment).
+ *
+ * In future the following will need to refer to metafied multibyte
+ * characters. References to invidual characters are not turned
+ * into a macro when the characters is metafied (c.f. CHARREF()
+ * below then the character is not metafied) and will need tracking
+ * down.
*/
#define METAINC(x) ((x) += (*(x) == Meta) ? 2 : 1)
/*
@@ -254,13 +286,18 @@ static int patglobflags; /* globbing flags & approx */
/* Add n more characters, ensuring there is enough space. */
+enum {
+ PA_NOALIGN = 1,
+ PA_UNMETA = 2
+};
+
/**/
static void
-patadd(char *add, int ch, long n, int noalgn)
+patadd(char *add, int ch, long n, int paflags)
{
- /* Make sure everything gets aligned unless we get noalgn. */
+ /* Make sure everything gets aligned unless we get PA_NOALIGN. */
long newpatsize = patsize + n;
- if (!noalgn)
+ if (!(paflags & PA_NOALIGN))
newpatsize = (newpatsize + sizeof(union upat) - 1) &
~(sizeof(union upat) - 1);
if (patalloc < newpatsize) {
@@ -272,8 +309,25 @@ patadd(char *add, int ch, long n, int noalgn)
}
patsize = newpatsize;
if (add) {
- while (n--)
- *patcode++ = *add++;
+ if (paflags & PA_UNMETA) {
+ /*
+ * Unmetafy and untokenize the string as we go.
+ * The Meta characters in add aren't counted in n.
+ */
+ while (n--) {
+ if (itok(*add))
+ *patcode++ = ztokens[*add++ - Pound];
+ else if (*add == Meta) {
+ add++;
+ *patcode++ = *add++ ^ 32;
+ } else {
+ *patcode++ = *add++;
+ }
+ }
+ } else {
+ while (n--)
+ *patcode++ = *add++;
+ }
} else
*patcode++ = ch;
patcode = patout + patsize;
@@ -297,13 +351,22 @@ patcompstart(void)
patglobflags = GF_IGNCASE;
}
-/* Top level pattern compilation subroutine */
+/*
+ * Top level pattern compilation subroutine
+ * exp is a null-terminated, metafied string.
+ * inflags is an or of some PAT_* flags.
+ * endexp, if non-null, is set to a pointer to the end of the
+ * part of exp which was compiled. This is used when
+ * compiling patterns for directories which must be
+ * matched recursively.
+ */
/**/
mod_export Patprog
patcompile(char *exp, int inflags, char **endexp)
{
- int flags = 0, len = 0;
+ int flags = 0;
+ long len = 0;
long startoff;
Upat pscan;
char *lng, *strp = NULL;
@@ -324,7 +387,7 @@ patcompile(char *exp, int inflags, char **endexp)
* in struct is actual count of parentheses.
*/
patnpar = 1;
- patflags = inflags & ~PAT_PURES;
+ patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP);
patendseg = endseg;
patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM;
@@ -366,7 +429,12 @@ patcompile(char *exp, int inflags, char **endexp)
if (patcompswitch(0, &flags) == 0)
return NULL;
} else {
- /* Yes, copy the string and skip compilation altogether */
+ /*
+ * Yes, copy the string, and skip compilation altogether.
+ * Null terminate for the benefit of globbing.
+ * Leave metafied both for globbing and for our own
+ * efficiency.
+ */
patparse = strp;
len = strp - exp;
patadd(exp, 0, len + 1, 0);
@@ -404,19 +472,52 @@ patcompile(char *exp, int inflags, char **endexp)
for (; pscan; pscan = next) {
next = PATNEXT(pscan);
if (P_OP(pscan) == P_EXACTLY) {
- char *opnd = (char *)P_OPERAND(pscan);
- while ((*dst = *opnd++))
- dst++;
+ char *opnd = P_LS_STR(pscan), *mtest;
+ long oplen = P_LS_LEN(pscan), ilen;
+ int nmeta = 0;
+ /*
+ * Unfortunately we unmetafied the string
+ * and we need to put any metacharacters
+ * back now we know it's a pure string.
+ * This shouldn't happen too often, it's
+ * just that there are some cases such
+ * as . and .. in files where we really
+ * need a pure string even if there are
+ * pattern characters flying around.
+ */
+ for (mtest = opnd, ilen = oplen; ilen;
+ mtest++, ilen--)
+ if (imeta(*mtest))
+ nmeta++;
+ if (nmeta) {
+ char *oldpatout = patout;
+ patadd(NULL, 0, nmeta, 0);
+ /*
+ * Yuk.
+ */
+ p = (Patprog)patout;
+ opnd = patout + (opnd - oldpatout);
+ dst = patout + startoff;
+ }
+
+ while (oplen--) {
+ if (imeta(*opnd)) {
+ *dst++ = Meta;
+ *dst++ = *opnd ^ 32;
+ } else {
+ *dst++ = *opnd++;
+ }
+ }
}
}
- *dst++ = '\0';
p->size = dst - patout;
- /* patmlen is really strlen, don't include null byte */
- p->patmlen = p->size - startoff - 1;
+ /* patmlen is really strlen. We don't need a null. */
+ p->patmlen = p->size - startoff;
} else {
/* starting point info */
- if (P_OP(pscan) == P_EXACTLY && !p->globflags)
- p->patstartch = *(char *)P_OPERAND(pscan);
+ if (P_OP(pscan) == P_EXACTLY && !p->globflags &&
+ P_LS_LEN(pscan))
+ p->patstartch = *P_LS_STR(pscan);
/*
* Find the longest literal string in something expensive.
* This is itself not all that cheap if we have
@@ -427,9 +528,9 @@ patcompile(char *exp, int inflags, char **endexp)
len = 0;
for (; pscan; pscan = PATNEXT(pscan))
if (P_OP(pscan) == P_EXACTLY &&
- (int)strlen((char *)P_OPERAND(pscan)) >= len) {
- lng = (char *)P_OPERAND(pscan);
- len = strlen(lng);
+ P_LS_LEN(pscan) >= len) {
+ lng = P_LS_STR(pscan);
+ len = P_LS_LEN(pscan);
}
if (lng) {
p->mustoff = lng - patout;
@@ -844,9 +945,9 @@ static long
patcomppiece(int *flagp)
{
long starter = 0, next, pound, op;
- int flags, flags2, kshchar, len, ch, patch;
+ int flags, flags2, kshchar, len, ch, patch, nmeta;
union upat up;
- char *nptr, *str0, cbuf[2];
+ char *nptr, *str0, *ptr, cbuf[2];
zrange_t from, to;
flags = 0;
@@ -881,6 +982,9 @@ patcomppiece(int *flagp)
}
if (patparse > str0) {
+ long slen = patparse - str0;
+ int morelen;
+
/* Ordinary string: cancel kshchar lookahead */
kshchar = '\0';
/*
@@ -889,25 +993,40 @@ patcomppiece(int *flagp)
flags |= P_PURESTR;
DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece.");
/* more than one character matched? */
- len = str0 + (*str0 == Meta ? 2 : 1) < patparse;
+ morelen = str0 + (*str0 == Meta ? 2 : 1) < patparse;
/*
* If we have more than one character, a following hash only
* applies to the last, so decrement.
*/
- if (isset(EXTENDEDGLOB) && *patparse == Pound && len)
+ if (isset(EXTENDEDGLOB) && *patparse == Pound && morelen)
patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1;
/*
* If len is 1, we can't have an active # following, so doesn't
* matter that we don't make X in `XX#' simple.
*/
- if (!len)
+ if (!morelen)
flags |= P_SIMPLE;
starter = patnode(P_EXACTLY);
- /* add enough space including null byte */
- len = patparse - str0;
- patadd(str0, 0, len + 1, 0);
- nptr = (char *)P_OPERAND((Upat)patout + starter);
- nptr[len] = '\0';
+
+ /* Get length of string without metafication. */
+ nmeta = 0;
+ for (ptr = str0; ptr < patparse; ptr++) {
+ if (*ptr == Meta) {
+ nmeta++;
+ ptr++;
+ }
+ }
+ slen = (patparse - str0) - nmeta;
+ /* First add length, which is a long */
+ patadd((char *)&slen, 0, sizeof(long), 0);
+ /*
+ * Then the string, not null terminated.
+ * Unmetafy and untokenize; pass the final length,
+ * which is what we need to allocate, i.e. not including
+ * a count for each Meta in the string.
+ */
+ patadd(str0, 0, slen, PA_UNMETA);
+ nptr = P_LS_STR((Upat)patout + starter);
/*
* It's much simpler to turn off pure string mode for
* any case-insensitive or approximate matching; usually,
@@ -918,13 +1037,13 @@ patcomppiece(int *flagp)
* ..(#a1).. (i.e. the (#a1) has no effect), but if you're
* going to write funny patterns, you get no sympathy from me.
*/
- if (patglobflags &&
- (!(patflags & PAT_FILE) || (strcmp(nptr, ".") &&
- strcmp(nptr, ".."))))
- flags &= ~P_PURESTR;
- for (; *nptr; METAINC(nptr))
- if (itok(*nptr))
- *nptr = ztokens[*nptr - Pound];
+ if (patglobflags) {
+ if (!(patflags & PAT_FILE))
+ flags &= ~P_PURESTR;
+ else if (!(nptr[0] == '.' &&
+ (slen == 1 || (nptr[1] == '.' && slen == 2))))
+ flags &= ~P_PURESTR;
+ }
} else {
if (kshchar)
patparse++;
@@ -950,7 +1069,7 @@ patcomppiece(int *flagp)
starter = patnode(P_ANYOF);
if (*patparse == Outbrack) {
patparse++;
- patadd(NULL, ']', 1, 1);
+ patadd(NULL, ']', 1, PA_NOALIGN);
}
while (*patparse && *patparse != Outbrack) {
/* Meta is not a token */
@@ -990,7 +1109,7 @@ patcomppiece(int *flagp)
ch = PP_UNKWN;
patparse = nptr + 2;
if (ch != PP_UNKWN)
- patadd(NULL, STOUC(Meta+ch), 1, 1);
+ patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN);
continue;
}
if (itok(*patparse)) {
@@ -1003,15 +1122,17 @@ patcomppiece(int *flagp)
patparse++;
if (*patparse == '-' && patparse[1] != Outbrack) {
- patadd(NULL, STOUC(Meta+PP_RANGE), 1, 1);
- patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1);
+ patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN);
+ patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN);
if (itok(*++patparse)) {
- patadd(0, STOUC(ztokens[*patparse - Pound]), 1, 1);
+ patadd(0, STOUC(ztokens[*patparse - Pound]), 1,
+ PA_NOALIGN);
} else
- patadd(patparse, 0, (*patparse == Meta) ? 2 : 1, 1);
+ patadd(patparse, 0, (*patparse == Meta) ? 2 : 1,
+ PA_NOALIGN);
METAINC(patparse);
} else
- patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1);
+ patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN);
}
if (*patparse != Outbrack)
return 0;
@@ -1309,19 +1430,11 @@ static void patoptail(long p, long val)
*/
static char *patinstart; /* Start of input string */
static char *patinend; /* End of input string */
+static char *patinput; /* String input pointer */
static char *patinpath; /* Full path for use with ~ exclusions */
-
-/**/
-char *patinput; /* String input pointer */
-
-/*
- * Offset of string at which we are trying to match.
- * This is added in to the positions recorded in patbeginp and patendp
- * when we are looking for substrings. Currently this only happens
- * in the parameter substitution code.
- */
-/**/
-int patoffset;
+static int patinlen; /* Length of last successful match.
+ * Includes count of Meta characters.
+ */
static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */
static char *patendp[NSUBEXP]; /* Pointer to backref ends */
@@ -1330,8 +1443,23 @@ static int parsfound; /* parentheses (with backrefs) found */
static int globdots; /* Glob initial dots? */
/*
+ * Macros which are currently trivial but are likely to be less
+ * so when we handle multibyte characters. They operate on
+ * umetafied strings.
+ */
+
+/* Get a character from the start point in a string */
+#define CHARREF(x) (STOUC(*x))
+/* Get a pointer to the next character */
+#define CHARNEXT(x) (x+1)
+/* Increment a pointer past the current character. */
+#define CHARINC(x) (x++)
+/* Counter the number of characters between two pointers, largest first */
+#define CHARSUB(x,y) (x-y)
+
+/*
* The following need to be accessed in the globbing scanner for
- * a multi-component file path. See horror story there.
+ * a multi-component file path. See horror story in glob.c.
*/
/**/
int errsfound; /* Total error count so far */
@@ -1347,23 +1475,59 @@ pattrystart(void)
errsfound = 0;
}
+/*
+ * Test prog against null-terminated, metafied string.
+ */
+
/**/
mod_export int
pattry(Patprog prog, char *string)
{
- return pattryrefs(prog, string, NULL, NULL, NULL);
+ return pattryrefs(prog, string, -1, 0, NULL, NULL, NULL);
+}
+
+/*
+ * Test prog against string of given length, no null termination
+ * but still metafied at this point. offset gives an offset
+ * to include in reported match indices
+ */
+
+/**/
+mod_export int
+pattrylen(Patprog prog, char *string, int len, int offset)
+{
+ return pattryrefs(prog, string, len, offset, NULL, NULL, NULL);
}
-/* The last three arguments are used to report the positions for the
+/*
+ * Test prog against string with given length stringlen, which
+ * may be -1 to indicate a null-terminated string. The input
+ * string is metafied; the length is the raw string length, not the
+ * number of possibly metafied characters.
+ *
+ * offset is the position in the original string (not seen by
+ * the patter module) at which we are trying to match.
+ * This is added in to the positions recorded in patbeginp and patendp
+ * when we are looking for substrings. Currently this only happens
+ * in the parameter substitution code.
+ *
+ * Note this is a character offset, i.e. a metafied character
+ * counts as 1.
+ *
+ * The last three arguments are used to report the positions for the
* backreferences. On entry, *nump should contain the maximum number
- * positions to report. */
+ * of positions to report. In this case the match, mbegin, mend
+ * arrays are not altered.
+ */
/**/
mod_export int
-pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
+pattryrefs(Patprog prog, char *string, int stringlen, int patoffset,
+ int *nump, int *begp, int *endp)
{
- int i, maxnpos = 0, ret;
- char **sp, **ep;
+ int i, maxnpos = 0, ret, needfullpath, unmetalen, unmetalenp;
+ int origlen;
+ char **sp, **ep, *tryalloced, *ptr;
char *progstr = (char *)prog + prog->startoff;
if (nump) {
@@ -1374,12 +1538,87 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
if (*string == Nularg)
string++;
- patinstart = patinput = string;
- patinend = patinstart + strlen(patinstart);
+ if (stringlen < 0)
+ stringlen = strlen(string);
+ origlen = stringlen;
+
+ patflags = prog->flags;
+ /*
+ * For a top-level ~-exclusion, we will need the full
+ * path to exclude, so copy the path so far and append the
+ * current test string.
+ */
+ needfullpath = (patflags & PAT_HAS_EXCLUDP) && pathpos;
+
+ /* Get the length of the full string when unmetafied. */
+ unmetalen = ztrsub(string + stringlen, string);
+ if (needfullpath)
+ unmetalenp = ztrsub(pathbuf + pathpos, pathbuf);
+ else
+ unmetalenp = 0;
+
+ DPUTS(needfullpath && (patflags & (PAT_PURES|PAT_ANY)),
+ "rum sort of file exclusion");
+ /*
+ * Partly for efficiency, and partly for the convenience of
+ * globbing, we don't unmetafy pure string patterns, and
+ * there's no reason to if the pattern is just a *.
+ */
+ if (!(patflags & (PAT_PURES|PAT_ANY))
+ && (needfullpath || unmetalen != stringlen)) {
+ /*
+ * We need to copy if we need to prepend the path so far
+ * (in which case we copy both chunks), or if we have
+ * Meta characters.
+ */
+ char *dst;
+ int icopy, ncopy;
+
+ dst = tryalloced = zalloc(unmetalen + unmetalenp);
+
+ if (needfullpath) {
+ /* loop twice, copy path buffer first time */
+ ptr = pathbuf;
+ ncopy = unmetalenp;
+ } else {
+ /* just loop once, copy string with unmetafication */
+ ptr = string;
+ ncopy = unmetalen;
+ }
+ for (icopy = 0; icopy < 2; icopy++) {
+ for (i = 0; i < ncopy; i++) {
+ if (*ptr == Meta) {
+ ptr++;
+ *dst++ = *ptr++ ^ 32;
+ } else {
+ *dst++ = *ptr++;
+ }
+ }
+ if (!needfullpath)
+ break;
+ /* next time append test string to path so far */
+ ptr = string;
+ ncopy = unmetalen;
+ }
+
+ if (needfullpath) {
+ patinstart = tryalloced + unmetalenp;
+ patinpath = tryalloced;
+ } else {
+ patinstart = tryalloced;
+ patinpath = NULL;
+ }
+ stringlen = unmetalen;
+ } else {
+ patinstart = string;
+ tryalloced = patinpath = NULL;
+ }
+
+ patinend = patinstart + stringlen;
/*
* From now on we do not require NULL termination of
- * the test string. It is still metafied, as is string
- * data in the prog.
+ * the test string. There should also be no more references
+ * to the variable string.
*/
if (prog->flags & (PAT_PURES|PAT_ANY)) {
@@ -1399,11 +1638,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* Testing a pure string. See if initial
* components match.
*/
- int lendiff = (patinend - patinstart) - prog->patmlen;
+ int lendiff = stringlen - prog->patmlen;
if (lendiff < 0) {
/* No, the pattern string is too long. */
ret = 0;
- } else if (!memcmp(progstr, string, prog->patmlen)) {
+ } else if (!memcmp(progstr, patinstart, prog->patmlen)) {
/*
* Initial component matches. Matches either
* if lengths are the same or we are not anchored
@@ -1420,47 +1659,61 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* For files, we won't match initial "."s unless
* glob_dots is set.
*/
- if ((prog->flags & PAT_NOGLD) && *string == '.')
- return 0;
- /* in case used for ${..#..} etc. */
- patinput = string + prog->patmlen;
- /* if matching files, must update globbing flags */
- patglobflags = prog->globend;
- return 1;
- } else
- return 0;
+ if ((prog->flags & PAT_NOGLD) && *patinstart == '.') {
+ ret = 0;
+ } else {
+ /*
+ * Remember the length in case used for ${..#..} etc.
+ * In this case, we didn't unmetafy the string.
+ */
+ patinlen = (int)prog->patmlen;
+ /* if matching files, must update globbing flags */
+ patglobflags = prog->globend;
+ }
+ }
+
+ if (tryalloced)
+ zfree(tryalloced, unmetalen + unmetalenp);
+
+ return ret;
} else {
/*
* Test for a `must match' string, unless we're scanning for a match
* in which case we don't need to do this each time.
*/
+ ret = 1;
if (!(prog->flags & PAT_SCAN) && prog->mustoff)
{
char *testptr; /* start pointer into test string */
char *teststop; /* last point from which we can match */
char *patptr = (char *)prog + prog->mustoff;
- int patlen = strlen(patptr);
+ int patlen = prog->patmlen;
int found = 0;
- if (patlen > patinend - patinstart) {
+ if (patlen > stringlen) {
/* Too long, can't match. */
- return 0;
- }
- teststop = patinend - patlen;
+ ret = 0;
+ } else {
+ teststop = patinend - patlen;
- for (testptr = patinstart; testptr <= teststop; testptr++)
- {
- if (!memcmp(testptr, patptr, patlen)) {
- found = 1;
- break;
+ for (testptr = patinstart; testptr <= teststop; testptr++)
+ {
+ if (!memcmp(testptr, patptr, patlen)) {
+ found = 1;
+ break;
+ }
}
- }
- if (!found)
- return 0;
+ if (!found)
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ if (tryalloced)
+ zfree(tryalloced, unmetalen + unmetalenp);
+ return 0;
}
- patflags = prog->flags;
patglobflags = prog->globflags;
if (!(patflags & PAT_FILE)) {
forceerrs = -1;
@@ -1469,26 +1722,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
globdots = !(patflags & PAT_NOGLD);
parsfound = 0;
- if ((patflags & PAT_HAS_EXCLUDP) && pathpos) {
- /*
- * For a top-level ~-exclusion, we will need the full
- * path to exclude, so copy the path so far and append the
- * current test string.
- *
- * There are some advantages in making patinstart etc.
- * point into this new string; however, that gets confusing
- * if we need patinput outside this file. That's
- * not likely for files but I don't think it's worth
- * the risk.
- */
- int len = patinend - patinstart;
-
- patinpath = (char *)zalloc(pathpos + len);
- memcpy(patinpath, pathbuf, pathpos);
- memcpy(patinpath + pathpos, patinstart, len);
- } else {
- patinpath = NULL;
- }
+ patinput = patinstart;
if (patmatch((Upat)progstr)) {
/*
@@ -1496,6 +1730,23 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* failed, so set it now
*/
patglobflags = prog->globend;
+
+ /*
+ * Record length of successful match, including Meta
+ * characters. Do it here so that patmatchlen() can return
+ * it even if we delete the pattern strings.
+ */
+ patinlen = patinput - patinstart;
+ /*
+ * Optimization: if we didn't find any Meta characters
+ * to begin with, we don't need to look for them now.
+ */
+ if (unmetalen != origlen) {
+ for (ptr = patinstart; ptr < patinput; ptr++)
+ if (imeta(*ptr))
+ patinlen++;
+ }
+
/*
* Should we clear backreferences and matches on a failed
* match?
@@ -1504,15 +1755,18 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
/*
* m flag: for global match. This carries no overhead
* in the pattern matching part.
+ *
+ * Remember the test pattern is already unmetafied.
*/
char *str;
- int mlen = ztrsub(patinput, patinstart);
+ int mlen = CHARSUB(patinput, patinstart);
- str = ztrduppfx(patinstart, patinput - patinstart);
+ str = metafy(patinstart, patinput - patinstart, META_DUP);
setsparam("MATCH", str);
setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS)));
setiparam("MEND",
- (zlong)(mlen + patoffset + !isset(KSHARRAYS) - 1));
+ (zlong)(mlen + patoffset +
+ !isset(KSHARRAYS) - 1));
}
if (prog->patnpar && nump) {
/*
@@ -1527,9 +1781,10 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
for (i = 0; i < prog->patnpar && i < maxnpos; i++) {
if (parsfound & (1 << i)) {
if (begp)
- *begp++ = ztrsub(*sp, patinstart) + patoffset;
+ *begp++ = CHARSUB(*sp, patinstart) + patoffset;
if (endp)
- *endp++ = ztrsub(*ep, patinstart) + patoffset - 1;
+ *endp++ = CHARSUB(*ep, patinstart) + patoffset
+ - 1;
} else {
if (begp)
*begp++ = -1;
@@ -1557,7 +1812,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
for (i = 0; i < prog->patnpar; i++) {
if (parsfound & (1 << i)) {
- matcharr[i] = ztrduppfx(*sp, *ep - *sp);
+ matcharr[i] = metafy(*sp, *ep - *sp, META_DUP);
/*
* mbegin and mend give indexes into the string
* in the standard notation, i.e. respecting
@@ -1568,12 +1823,12 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* corresponds to indexing as ${foo[1,1]}.
*/
sprintf(numbuf, "%ld",
- (long)(ztrsub(*sp, patinstart) +
+ (long)(CHARSUB(*sp, patinstart) +
patoffset +
!isset(KSHARRAYS)));
mbeginarr[i] = ztrdup(numbuf);
sprintf(numbuf, "%ld",
- (long)(ztrsub(*ep, patinstart) +
+ (long)(CHARSUB(*ep, patinstart) +
patoffset +
!isset(KSHARRAYS) - 1));
mendarr[i] = ztrdup(numbuf);
@@ -1593,20 +1848,32 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
setaparam("mbegin", mbeginarr);
setaparam("mend", mendarr);
}
+
ret = 1;
} else
ret = 0;
- if (patinpath) {
- zfree(patinpath, pathpos + (patinend - patinstart));
- patinpath = NULL;
- }
+ if (tryalloced)
+ zfree(tryalloced, unmetalen + unmetalenp);
return ret;
}
}
/*
+ * Return length of previous succesful match. This is
+ * in metafied bytes, i.e. includes a count of Meta characters.
+ * Unusual and futile attempt at modular encapsulation.
+ */
+
+/**/
+int
+patmatchlen(void)
+{
+ return patinlen;
+}
+
+/*
* Match literal characters with case insensitivity test: the first
* comes from the input string, the second the current pattern.
*/
@@ -1627,8 +1894,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* exactpos is used to remember how far down an exact string we have
* matched, if we are doing approximation and can therefore redo from
* the same point; we never need to otherwise.
+ *
+ * exactend is a pointer to the end of the string, which isn't
+ * null-terminated.
*/
-static char *exactpos;
+static char *exactpos, *exactend;
/*
* Main matching routine.
@@ -1643,7 +1913,7 @@ patmatch(Upat prog)
{
/* Current and next nodes */
Upat scan = prog, next, opnd;
- char *start, *save, *chrop, *compend;
+ char *start, *save, *chrop, *chrend, *compend;
int savglobflags, op, no, min, nextch, fail = 0, saverrsfound;
zrange_t from, to, comp;
@@ -1659,45 +1929,52 @@ patmatch(Upat prog)
if (patinput == patinend)
fail = 1;
else
- METAINC(patinput);
+ CHARINC(patinput);
break;
case P_EXACTLY:
/*
* acts as nothing if *chrop is null: this is used by
* approx code.
*/
- chrop = exactpos ? exactpos : (char *)P_OPERAND(scan);
+ if (exactpos) {
+ chrop = exactpos;
+ chrend = exactend;
+ } else {
+ chrop = P_LS_STR(scan);
+ chrend = chrop + P_LS_LEN(scan);
+ }
exactpos = NULL;
- while (*chrop && patinput < patinend) {
- int chin = STOUC(UNMETA(patinput));
- int chpa = STOUC(UNMETA(chrop));
+ while (chrop < chrend && patinput < patinend) {
+ int chin = CHARREF(patinput);
+ int chpa = CHARREF(chrop);
if (!CHARMATCH(chin, chpa)) {
fail = 1;
break;
}
- METAINC(chrop);
- METAINC(patinput);
+ CHARINC(chrop);
+ CHARINC(patinput);
}
- if (*chrop) {
+ if (chrop < chrend) {
exactpos = chrop;
+ exactend = chrend;
fail = 1;
}
break;
case P_ANYOF:
if (patinput == patinend ||
!patmatchrange((char *)P_OPERAND(scan),
- STOUC(UNMETA(patinput))))
+ CHARREF(patinput)))
fail = 1;
else
- METAINC(patinput);
+ CHARINC(patinput);
break;
case P_ANYBUT:
if (patinput == patinend ||
patmatchrange((char *)P_OPERAND(scan),
- STOUC(UNMETA(patinput))))
+ CHARREF(patinput)))
fail = 1;
else
- METAINC(patinput);
+ CHARINC(patinput);
break;
case P_NUMRNG:
case P_NUMFROM:
@@ -1771,7 +2048,8 @@ patmatch(Upat prog)
return 1;
}
if (!no && P_OP(next) == P_EXACTLY &&
- !idigit(STOUC(*(char *)P_OPERAND(next))) &&
+ (!P_LS_LEN(next) ||
+ !idigit(STOUC(*P_LS_STR(next)))) &&
!(patglobflags & 0xff))
return 0;
patinput = --save;
@@ -1791,7 +2069,7 @@ patmatch(Upat prog)
case P_NUMANY:
/* This is <->: any old set of digits, don't bother comparing */
start = patinput;
- while (patinput < patinend && idigit(STOUC(*patinput)))
+ while (patinput < patinend && idigit(CHARREF(patinput)))
patinput++;
save = patinput;
no = 0;
@@ -1799,7 +2077,8 @@ patmatch(Upat prog)
if (patmatch(next))
return 1;
if (!no && P_OP(next) == P_EXACTLY &&
- !idigit(STOUC(*(char *)P_OPERAND(next))) &&
+ (!P_LS_LEN(next) ||
+ !idigit(CHARREF(P_LS_STR(next)))) &&
!(patglobflags & 0xff))
return 0;
patinput = --save;
@@ -1963,7 +2242,7 @@ patmatch(Upat prog)
origpatinend = patinend;
while ((ret = patmatch(P_OPERAND(scan)))) {
unsigned char *syncpt;
- char *savpatinstart, *savpatinend;
+ char *savpatinstart;
int savforce = forceerrs;
int savpatflags = patflags, synclen;
forceerrs = -1;
@@ -1998,7 +2277,6 @@ patmatch(Upat prog)
patflags |= PAT_NOTEND;
}
savpatinstart = patinstart;
- savpatinend = patinend;
next = PATNEXT(scan);
while (next && P_ISEXCLUDE(next)) {
patinput = save;
@@ -2013,14 +2291,15 @@ patmatch(Upat prog)
opnd = P_OPERAND(next) + 1;
if (P_OP(next) == P_EXCLUDP && patinpath) {
/*
- * top level exclusion with a file,
+ * Top level exclusion with a file,
* applies to whole path so add the
- * segments already matched
+ * segments already matched.
+ * We copied these in front of the
+ * test pattern, so patinend doesn't
+ * need moving.
*/
DPUTS(patinput != patinstart,
"BUG: not at start excluding path");
- patinend = patinpath + pathpos +
- (patinend - patinstart);
patinput = patinstart = patinpath;
}
if (patmatch(opnd)) {
@@ -2036,7 +2315,6 @@ patmatch(Upat prog)
patinput = savpatinstart +
(patinput - patinstart);
patinstart = savpatinstart;
- patinend = savpatinend;
}
if (!ret)
break;
@@ -2146,7 +2424,7 @@ patmatch(Upat prog)
/* Note that no counts possibly metafied characters */
start = patinput;
if (op == P_STAR) {
- for (no = 0; patinput < patinend; METAINC(patinput))
+ for (no = 0; patinput < patinend; CHARINC(patinput))
no++;
/* simple optimization for reasonably common case */
if (P_OP(next) == P_END)
@@ -2156,7 +2434,7 @@ patmatch(Upat prog)
"BUG: wrong backtracking with approximation.");
if (!globdots && P_NOTDOT(P_OPERAND(scan)) &&
patinput == patinstart && patinput < patinend &&
- *patinput == '.')
+ CHARREF(patinput) == '.')
return 0;
no = patrepeat(P_OPERAND(scan));
}
@@ -2165,8 +2443,9 @@ patmatch(Upat prog)
* Lookahead to avoid useless matches. This is not possible
* with approximation.
*/
- if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) {
- char *nextop = (char *)P_OPERAND(next);
+ if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) &&
+ !(patglobflags & 0xff)) {
+ char *nextop = P_LS_STR(next);
/*
* If that P_EXACTLY is last (common in simple patterns,
* such as *.c), then it can be only be matched at one
@@ -2175,21 +2454,20 @@ patmatch(Upat prog)
if (P_OP(PATNEXT(next)) == P_END &&
!(patflags & PAT_NOANCH)) {
int ptlen = patinend - patinput;
- int oplen = strlen(nextop);
- int lenmatch = patinend - (min ? METANEXT(start) : start);
+ int lenmatch = patinend - (min ? CHARNEXT(start) : start);
/* Are we in the right range? */
- if (oplen > lenmatch || oplen < ptlen)
+ if (P_LS_LEN(next) > lenmatch || P_LS_LEN(next) < ptlen)
return 0;
/* Yes, just position appropriately and test. */
- patinput += ptlen - oplen;
- if (patinput > start && patinput[-1] == Meta) {
- /* doesn't align properly, no go */
- return 0;
- }
+ patinput += ptlen - P_LS_LEN(next);
+ /*
+ * Here we will need to be careful that patinput is not
+ * in the middle of a multibyte character.
+ */
/* Continue loop with P_EXACTLY test. */
break;
}
- nextch = STOUC(UNMETA(nextop));
+ nextch = CHARREF(nextop);
} else
nextch = -1;
save = patinput;
@@ -2199,14 +2477,17 @@ patmatch(Upat prog)
int charmatch_cache;
if (nextch < 0 ||
(patinput < patinend &&
- CHARMATCH_EXPR(STOUC(UNMETA(patinput)), nextch))) {
+ CHARMATCH_EXPR(CHARREF(patinput), nextch))) {
if (patmatch(next))
return 1;
}
no--;
save--;
- if (save > start && save[-1] == Meta)
- save--;
+ /*
+ * Here we will need to make sure save is
+ * decremented properly to the start of
+ * the preceeding multibyte character.
+ */
patinput = save;
patglobflags = savglobflags;
errsfound = saverrsfound;
@@ -2270,7 +2551,7 @@ patmatch(Upat prog)
/* Try omitting a character from the input string */
if (patinput < patinend) {
- METAINC(patinput);
+ CHARINC(patinput);
/* If we are not on an exact match, then this is
* our last gasp effort, so we can optimize out
* the recursive call.
@@ -2285,11 +2566,11 @@ patmatch(Upat prog)
char *nextexact = savexact;
DPUTS(!savexact || !*savexact,
"BUG: exact match has not set exactpos");
- METAINC(nextexact);
+ CHARINC(nextexact);
if (save < patinend) {
char *nextin = save;
- METAINC(nextin);
+ CHARINC(nextin);
patglobflags = savglobflags;
errsfound = saverrsfound;
exactpos = savexact;
@@ -2299,18 +2580,18 @@ patmatch(Upat prog)
* exactpos
*/
if (save < patinend && nextin < patinend &&
- *nextexact) {
- int cin0 = UNMETA(save);
- int cpa0 = UNMETA(exactpos);
- int cin1 = UNMETA(nextin);
- int cpa1 = UNMETA(nextexact);
+ nextexact < exactend) {
+ int cin0 = CHARREF(save);
+ int cpa0 = CHARREF(exactpos);
+ int cin1 = CHARREF(nextin);
+ int cpa1 = CHARREF(nextexact);
if (CHARMATCH(cin0, cpa1) &&
CHARMATCH(cin1, cpa0)) {
patinput = nextin;
- METAINC(patinput);
+ CHARINC(patinput);
exactpos = nextexact;
- METAINC(exactpos);
+ CHARINC(exactpos);
if (patmatch(scan))
return 1;
@@ -2333,12 +2614,13 @@ patmatch(Upat prog)
exactpos = savexact;
}
+ DPUTS(exactpos == exactend, "approximating too far");
/*
* Try moving up the exact match pattern.
* This must be the last attempt, so just loop
* instead of calling recursively.
*/
- METAINC(exactpos);
+ CHARINC(exactpos);
continue;
}
}
@@ -2358,6 +2640,11 @@ patmatchrange(char *range, int ch)
{
int r1, r2;
+ /*
+ * Careful here: unlike other strings, range is a NULL-terminated,
+ * metafied string, because we need to treat the Posix and hyphenated
+ * ranges specially.
+ */
for (; *range; range++) {
if (imeta(STOUC(*range))) {
switch (STOUC(*range)-STOUC(Meta)) {
@@ -2459,23 +2746,24 @@ static int patrepeat(Upat p)
break;
#endif
case P_EXACTLY:
- tch = STOUC(UNMETA(opnd));
+ DPUTS(P_LS_LEN(p) != 1, "closure following more than one character");
+ tch = CHARREF(P_LS_STR(p));
while (scan < patinend &&
- CHARMATCH_EXPR(STOUC(UNMETA(scan)), tch)) {
+ CHARMATCH_EXPR(CHARREF(scan), tch)) {
count++;
- METAINC(scan);
+ CHARINC(scan);
}
break;
case P_ANYOF:
- while (scan < patinend && patmatchrange(opnd, STOUC(UNMETA(scan)))) {
+ while (scan < patinend && patmatchrange(opnd, CHARREF(scan))) {
count++;
- METAINC(scan);
+ CHARINC(scan);
}
break;
case P_ANYBUT:
- while (scan < patinend && !patmatchrange(opnd, STOUC(UNMETA(scan)))) {
+ while (scan < patinend && !patmatchrange(opnd, CHARREF(scan))) {
count++;
- METAINC(scan);
+ CHARINC(scan);
}
break;
#ifdef DEBUG
@@ -2528,7 +2816,14 @@ patdump(Patprog r)
next = PATNEXT(up);
printf("(%d)", next ? next-codestart : 0);
s += sizeof(union upat);
- if (op == P_ANYOF || op == P_ANYBUT || op == P_EXACTLY) {
+ if (op == P_EXACTLY) {
+ long llen = *(long *)s;
+ s += sizeof(long);
+ while (llen--) {
+ putchar(CHARREF(s));
+ CHARINC(s);
+ }
+ } else if (op == P_ANYOF || op == P_ANYBUT) {
while (*s != '\0') {
if (itok(*s)) {
if (*s == Meta + PP_RANGE) {