summaryrefslogtreecommitdiff
path: root/Src
diff options
context:
space:
mode:
Diffstat (limited to 'Src')
-rw-r--r--Src/Zle/comp.h41
-rw-r--r--Src/Zle/complete.c188
-rw-r--r--Src/Zle/compmatch.c735
-rw-r--r--Src/Zle/computil.c293
-rw-r--r--Src/pattern.c456
-rw-r--r--Src/zsh.h42
6 files changed, 1468 insertions, 287 deletions
diff --git a/Src/Zle/comp.h b/Src/Zle/comp.h
index bcad3df05..7ac051c25 100644
--- a/Src/Zle/comp.h
+++ b/Src/Zle/comp.h
@@ -162,12 +162,49 @@ struct cmatcher {
#define CMF_RIGHT 4
#define CMF_INTER 8
+/*
+ * Types of cpattern structure.
+ * Note freecpattern() assumes any <= CPAT_EQUIV have string.
+ */
+enum {
+ CPAT_CCLASS, /* [...]: ordinary character class */
+ CPAT_NCLASS, /* [!...]: ordinary character class, negated */
+ CPAT_EQUIV, /* {...}: equivalence class */
+ CPAT_ANY, /* ?: any character */
+ CPAT_CHAR /* Single character given explicitly */
+};
+
+/*
+ * A pattern element in a matcher specification.
+ * Unlike normal patterns this only presents one character in
+ * either the test completion or the word on the command line.
+ */
struct cpattern {
Cpattern next; /* next sub-pattern */
- unsigned char tab[256]; /* table of matched characters */
- int equiv; /* if this is a {...} class */
+ int tp; /* type of object as above */
+ union {
+ char *str; /* if a character class, the objects
+ * in it in a similar form to normal
+ * pattern matching (a metafied string
+ * with tokens).
+ * Note the allocated length may be longer
+ * than the null-terminated string.
+ */
+ int chr; /* if a single character, it
+ * TODO: eventually should be a
+ * convchar_t.
+ */
+ } u;
};
+/*
+ * For now this just handles single-byte characters.
+ * TODO: this will change.
+ */
+#define PATMATCHRANGE(r, c, ip, mtp) patmatchrange(r, c, ip, mtp)
+#define PATMATCHINDEX(r, i, cp, mtp) patmatchindex(r, i, cp, mtp)
+#define CONVCAST(c) (c)
+
/* This is a special return value for parse_cmatcher(), *
* signalling an error. */
diff --git a/Src/Zle/complete.c b/Src/Zle/complete.c
index 36c4c8a3d..4b947834b 100644
--- a/Src/Zle/complete.c
+++ b/Src/Zle/complete.c
@@ -122,13 +122,15 @@ freecpattern(Cpattern p)
while (p) {
n = p->next;
+ if (p->tp <= CPAT_EQUIV)
+ free(p->u.str);
zfree(p, sizeof(struct cpattern));
p = n;
}
}
-/* Copy a completion matcher list. */
+/* Copy a completion matcher list into permanent storage. */
/**/
mod_export Cmatcher
@@ -157,22 +159,51 @@ cpcmatcher(Cmatcher m)
return r;
}
+/*
+ * Copy a single entry in a matcher pattern.
+ * If useheap is 1, it comes from the heap.
+ */
+
+/**/
+mod_export Cpattern
+cp_cpattern_element(Cpattern o)
+{
+ Cpattern n = zalloc(sizeof(struct cpattern));
+
+ n->next = NULL;
+
+ n->tp = o->tp;
+ switch (o->tp)
+ {
+ case CPAT_CCLASS:
+ case CPAT_NCLASS:
+ case CPAT_EQUIV:
+ n->u.str = ztrdup(o->u.str);
+ break;
+
+ case CPAT_CHAR:
+ n->u.chr = o->u.chr;
+ break;
+
+ default:
+ /* just to keep compiler quiet */
+ break;
+ }
+
+ return n;
+}
+
/* Copy a completion matcher pattern. */
/**/
static Cpattern
cpcpattern(Cpattern o)
{
- Cpattern r = NULL, *p = &r, n;
+ Cpattern r = NULL, *p = &r;
while (o) {
- *p = n = (Cpattern) zalloc(sizeof(struct cpattern));
-
- n->next = NULL;
- memcpy(n->tab, o->tab, 256);
- n->equiv = o->equiv;
-
- p = &(n->next);
+ *p = cp_cpattern_element(o);
+ p = &((*p)->next);
o = o->next;
}
return r;
@@ -331,14 +362,26 @@ parse_cmatcher(char *name, char *s)
return ret;
}
-/* Parse a pattern for matcher control. */
+/*
+ * Parse a pattern for matcher control.
+ * name is the name of the builtin from which this is called, for errors.
+ * *sp is the input string and will be updated to the end of the parsed
+ * pattern.
+ * *lp will be set to the number of characters (possibly multibyte)
+ * that the pattern will match. This must be deterministic, given
+ * the syntax allowed here.
+ * e, if non-zero, is the ASCII end character to match; if zero,
+ * stop on a blank.
+ * *err is set to 1 to indicate an error, else to 0.
+ */
/**/
static Cpattern
parse_pattern(char *name, char **sp, int *lp, char e, int *err)
{
Cpattern ret = NULL, r = NULL, n;
- unsigned char *s = (unsigned char *) *sp;
+ char *s = *sp;
+ int inchar;
int l = 0;
*err = 0;
@@ -346,25 +389,18 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
while (*s && (e ? (*s != e) : !inblank(*s))) {
n = (Cpattern) hcalloc(sizeof(*n));
n->next = NULL;
- n->equiv = 0;
- if (*s == '[') {
- s = parse_class(n, s + 1, ']');
- if (!*s) {
- *err = 1;
- zwarnnam(name, "unterminated character class");
- return NULL;
- }
- } else if (*s == '{') {
- n->equiv = 1;
- s = parse_class(n, s + 1, '}');
+ if (*s == '[' || *s == '{') {
+ s = parse_class(n, s);
if (!*s) {
*err = 1;
zwarnnam(name, "unterminated character class");
return NULL;
}
+ s++;
} else if (*s == '?') {
- memset(n->tab, 1, 256);
+ n->tp = CPAT_ANY;
+ s++;
} else if (*s == '*' || *s == '(' || *s == ')' || *s == '=') {
*err = 1;
zwarnnam(name, "invalid pattern character `%c'", *s);
@@ -373,8 +409,13 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
if (*s == '\\' && s[1])
s++;
- memset(n->tab, 0, 256);
- n->tab[*s] = 1;
+ if (*s == Meta)
+ inchar = STOUC(*++s) ^ 32;
+ else
+ inchar = STOUC(*s);
+ s++;
+ n->tp = CPAT_CHAR;
+ n->u.chr = inchar;
}
if (ret)
r->next = n;
@@ -384,7 +425,6 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
r = n;
l++;
- s++;
}
*sp = (char *) s;
*lp = l;
@@ -394,28 +434,86 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
/* Parse a character class for matcher control. */
/**/
-static unsigned char *
-parse_class(Cpattern p, unsigned char *s, unsigned char e)
+static char *
+parse_class(Cpattern p, char *iptr)
{
- int n = 0, i = 1, j, eq = (e == '}'), k = 1;
-
- if (!eq && (*s == '!' || *s == '^') && s[1] != e) { n = 1; s++; }
-
- memset(p->tab, n, 256);
-
- n = !n;
- while (*s && (k || *s != e)) {
- if (s[1] == '-' && s[2] && s[2] != e) {
- /* a run of characters */
- for (j = (int) *s; j <= (int) s[2]; j++)
- p->tab[j] = (eq ? i++ : n);
-
- s += 3;
+ int endchar, firsttime = 1;
+ char *optr, *nptr;
+
+ if (*iptr++ == '[') {
+ endchar = ']';
+ /* TODO: surely [^]] is valid? */
+ if ((*iptr == '!' || *iptr == '^') && iptr[1] != ']') {
+ p->tp = CPAT_NCLASS;
+ iptr++;
} else
- p->tab[*s++] = (eq ? i++ : n);
- k = 0;
+ p->tp = CPAT_CCLASS;
+ } else {
+ endchar = '}';
+ p->tp = CPAT_EQUIV;
}
- return s;
+
+ /* find end of class. End character can appear literally first. */
+ for (optr = iptr; optr == iptr || *optr != endchar; optr++)
+ if (!*optr)
+ return optr;
+ /*
+ * We can always fit the parsed class within the same length
+ * because of the tokenization (including a null byte).
+ *
+ * As the input string is metafied, but shouldn't contain shell
+ * tokens, we can just add our own tokens willy nilly.
+ */
+ optr = p->u.str = zalloc((optr-iptr) + 1);
+
+ while (firsttime || *iptr != endchar) {
+ int ch;
+
+ if (*iptr == '[' && iptr[1] == ':' &&
+ (nptr = strchr((char *)iptr + 2, ':')) && nptr[1] == ']') {
+ /* Range type */
+ iptr += 2;
+ ch = range_type((char *)iptr, nptr-iptr);
+ iptr = nptr + 2;
+ if (ch != PP_UNKWN)
+ *optr++ = STOUC(Meta) + ch;
+ } else {
+ /* characters stay metafied */
+ char *ptr1 = iptr;
+ if (*iptr == Meta)
+ iptr++;
+ iptr++;
+ if (*iptr == '-' && iptr[1] && iptr[1] != endchar) {
+ /* a run of characters */
+ iptr++;
+ /* range token */
+ *optr++ = Meta + PP_RANGE;
+
+ /* start of range character */
+ if (*ptr1 == Meta) {
+ *optr++ = Meta;
+ *optr++ = ptr1[1] ^ 32;
+ } else
+ *optr++ = *ptr1;
+
+ if (*iptr == Meta) {
+ *optr++ = *iptr++;
+ *optr++ = *iptr++;
+ } else
+ *optr++ = *iptr++;
+ } else {
+ if (*ptr1 == Meta) {
+ *optr++ = Meta;
+ *optr++ = ptr1[1] ^ 32;
+ } else
+ *optr++ = *ptr1;
+ }
+ }
+ firsttime = 0;
+ }
+
+ *optr = '\0';
+ return iptr;
}
/**/
diff --git a/Src/Zle/compmatch.c b/Src/Zle/compmatch.c
index bf9d2cb0d..e59443c01 100644
--- a/Src/Zle/compmatch.c
+++ b/Src/Zle/compmatch.c
@@ -30,37 +30,68 @@
#include "complete.mdh"
#include "compmatch.pro"
-/* This compares two cpattern lists and returns non-zero if they are
- * equal. */
+/*
+ * This compares two cpattern lists and returns non-zero if they are
+ * equal (N.B. opposite sense to usual *cmp()).
+ *
+ * The old version of this didn't worry about whether the lists
+ * were the same length. This one does. It's hard to see how
+ * that can be wrong even if it's unnecessary.
+ */
/**/
static int
-cmp_cpatterns(Cpattern a, Cpattern b)
+cpatterns_same(Cpattern a, Cpattern b)
{
while (a) {
- if (a->equiv != b->equiv || memcmp(a->tab, b->tab, 256))
+ if (!b)
+ return 0;
+ if (a->tp != b->tp)
return 0;
+ switch (a->tp) {
+ case CPAT_CCLASS:
+ case CPAT_NCLASS:
+ case CPAT_EQUIV:
+ /*
+ * Patterns can actually match the same even if
+ * the range strings don't compare differently, but
+ * I don't think we need to handle that subtlety.
+ */
+ if (strcmp(a->u.str, b->u.str) != 0)
+ return 0;
+ break;
+
+ case CPAT_CHAR:
+ if (a->u.chr != b->u.chr)
+ return 0;
+ break;
+
+ default:
+ /* here to silence compiler */
+ break;
+ }
+
a = a->next;
b = b->next;
}
- return 1;
+ return !b;
}
/* This compares two cmatchers and returns non-zero if they are equal. */
/**/
static int
-cmp_cmatchers(Cmatcher a, Cmatcher b)
+cmatchers_same(Cmatcher a, Cmatcher b)
{
return (a == b ||
(a->flags == b->flags &&
a->llen == b->llen && a->wlen == b->wlen &&
- (!a->llen || cmp_cpatterns(a->line, b->line)) &&
- (a->wlen <= 0 || cmp_cpatterns(a->word, b->word)) &&
+ (!a->llen || cpatterns_same(a->line, b->line)) &&
+ (a->wlen <= 0 || cpatterns_same(a->word, b->word)) &&
(!(a->flags & (CMF_LEFT | CMF_RIGHT)) ||
(a->lalen == b->lalen && a->ralen == b->ralen &&
- (!a->lalen || cmp_cpatterns(a->left, b->left)) &&
- (!a->ralen || cmp_cpatterns(a->right, b->right))))));
+ (!a->lalen || cpatterns_same(a->left, b->left)) &&
+ (!a->ralen || cpatterns_same(a->right, b->right))))));
}
/* Add the given matchers to the bmatcher list. */
@@ -97,7 +128,7 @@ update_bmatchers(void)
t = 0;
for (ms = mstack; ms && !t; ms = ms->next)
for (mp = ms->matcher; mp && !t; mp = mp->next)
- t = cmp_cmatchers(mp, p->matcher);
+ t = cmatchers_same(mp, p->matcher);
p = p->next;
if (!t) {
@@ -449,7 +480,7 @@ add_match_sub(Cmatcher m, char *l, int ll, char *w, int wl)
}
}
-/* This tests if the string from the line l matches the word w. In bp
+/* This tests if the string from the line l matches the word w. In *bpp
* the offset for the brace is returned, in rwlp the length of the
* matched prefix or suffix, not including the stuff before or after
* the last anchor is given. When sfx is non-zero matching is done from
@@ -1113,55 +1144,330 @@ comp_match(char *pfx, char *sfx, char *w, Patprog cp, Cline *clp, int qu,
return r;
}
-/* Check if the given pattern matches the given string. *
+
+/*
+ * Guts of a single pattern for pattern_match().
+ * Return non-zero if match successful.
+ * If the class was an equivalence, return 1 + the index into
+ * the equivalence class (see pattern.c for how this is calculated).
+ */
+
+/**/
+mod_export int
+pattern_match1(Cpattern p, int c, int *mtp)
+{
+ /* TODO: should become convchar_t */
+ int ind;
+
+ *mtp = 0;
+ switch (p->tp) {
+ case CPAT_CCLASS:
+ return PATMATCHRANGE(p->u.str, CONVCAST(c), NULL, NULL);
+
+ case CPAT_NCLASS:
+ return !PATMATCHRANGE(p->u.str, CONVCAST(c), NULL, NULL);
+
+ case CPAT_EQUIV:
+ if (PATMATCHRANGE(p->u.str, CONVCAST(c), &ind, mtp))
+ return ind + 1;
+ else
+ return 0;
+
+ case CPAT_ANY:
+ return 1;
+
+ case CPAT_CHAR:
+ return (p->u.chr == c);
+
+ default:
+ DPUTS(1, "bad matcher pattern type");
+ return 0;
+ }
+}
+
+
+/*
+ * Use an equivalence to deduce the line character from the word, or
+ * vice versa. (If vice versa, then "line" and "word" are reversed
+ * in what follows. The logic is symmetric.)
+ * lp is the line pattern.
+ * wind is the index returned by a pattern match on the word pattern,
+ * with type wmtp.
+ * wchr is the word character.
+ * Return -1 if no matching character, else the character.
+ *
+ * Only makes sense if lp->tp == CPAT_EQUIV and the (unseen) word
+ * pattern also has that type.
+ */
+static int
+pattern_match_equivalence(Cpattern lp, int wind, int wmtp, int wchr)
+{
+ int lchr, lmtp;
+
+ if (!PATMATCHINDEX(lp->u.str, wind-1, &lchr, &lmtp)) {
+ /*
+ * No equivalent. No possible match; give up.
+ */
+ return -1;
+ }
+ /*
+ * If we matched an exact character rather than a range
+ * type, return it.
+ */
+ if (lchr != -1)
+ return lchr;
+
+ /*
+ * Check the match types. We may want a case-changed
+ * version of the word character.
+ */
+ if (wmtp == PP_UPPER && lmtp == PP_LOWER)
+ return tulower(wchr);
+ else if (wmtp == PP_LOWER && lmtp == PP_UPPER)
+ return tuupper(wchr);
+ else if (wmtp == lmtp) {
+ /*
+ * Be lenient and allow identical replacements
+ * for character classes, although in fact this
+ * doesn't give special functionality for equivalence
+ * classes.
+ */
+ return wchr;
+ } else {
+ /*
+ * Non-matching generic types; this can't work.
+ */
+ return -1;
+ }
+}
+
+/*
+ * Check if the given pattern matches the given string.
* p and s are either anchor or line pattern and string;
* wp and ws are word (candidate) pattern and string
*
- * If only one pattern is given, we just check if characters match
+ * If only one pattern is given, we just check if characters match.
* If both line and word are given, we check that characters match
- * for {...} classes by comparing relative numbers in sequence.
+ * for {...} classes by comparing positions in the strings.
*
* Patterns and strings are always passed in pairs, so it is enough
* to check for non-NULL wp. p should always be present.
+ *
+ * If prestrict is not NULL, it is a chain of patterns at least as long
+ * as the line string. In this case we are still assembling the line at
+ * s (which has been allocated but doesn't yet contain anything useful)
+ * and must continue to do so as we go along; prestrict gives
+ * restrictions on the line character to be applied along side the other
+ * patterns. In the simple case a restriction is a character to be put
+ * in place; otherwise it is a set of possible characters and we have to
+ * deduce an actual matching character. Note prestrict is never an
+ * equivalence class. In extreme cases we can't deduce a unique
+ * character; then the match fails.
*/
/**/
mod_export int
-pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
+pattern_match_restrict(Cpattern p, char *s, Cpattern wp, char *ws,
+ Cpattern prestrict)
{
- unsigned char c;
- unsigned char wc;
+ int c, ind;
+ int wc, wind;
+ int len, wlen, mt, wmt;
while (p && wp && *s && *ws) {
- c = p->tab[*((unsigned char *) s)];
- wc = wp->tab[*((unsigned char *) ws)];
-
- if (!c || !wc || c != wc)
+ /* First test the word character */
+ if (*ws == Meta) {
+ wc = STOUC(ws[1]) ^ 32;
+ wlen = 2;
+ } else {
+ wc = STOUC(*ws);
+ wlen = 1;
+ }
+ wind = pattern_match1(wp, wc, &wmt);
+ if (!wind)
return 0;
- s++;
- ws++;
+ /*
+ * Now the line character; deal with the case where
+ * we don't yet have it, only a restriction on it.
+ */
+ if (prestrict) {
+ if (prestrict->tp == CPAT_CHAR) {
+ /*
+ * Easy case: restricted to an exact character on
+ * the line. Procede as normal.
+ */
+ c = prestrict->u.chr;
+ } else {
+ if (p->tp == CPAT_CHAR) {
+ /*
+ * Normal line pattern is an exact character: as
+ * long as this matches prestrict, we can proceed
+ * as usual.
+ */
+ c = p->u.chr;
+ } else if (p->tp == CPAT_EQUIV) {
+ /*
+ * An equivalence, so we can deduce the character
+ * backwards from the word pattern and see if it
+ * matches prestrict.
+ */
+ if ((c = pattern_match_equivalence(p, wind, wmt, wc)) == -1)
+ return 0;
+ } else {
+ /*
+ * Not an equivalence, so that means we must match
+ * the word (not just the word pattern), so grab it
+ * and make sure it fulfills our needs. I think.
+ * Not 100% sure about that, but what else can
+ * we do? We haven't actually been passed a string
+ * from the command line.
+ */
+ c = wc;
+ }
+ /* Character so deduced must match the restriction. */
+ if (!pattern_match1(prestrict, c, &mt))
+ return 0;
+ }
+ len = imeta(c) ? 2 : 1;
+ } else {
+ /* We have the character itself. */
+ if (*s == Meta) {
+ c = STOUC(s[1]) ^ 32;
+ len = 2;
+ } else {
+ c = STOUC(*s);
+ len = 1;
+ }
+ }
+ /*
+ * If either is "?", they match each other; no further tests.
+ * Apply this even if the character wasn't convertable;
+ * there's no point trying to be clever in that case.
+ */
+ if (p->tp != CPAT_ANY || wp->tp != CPAT_ANY)
+ {
+ ind = pattern_match1(p, c, &mt);
+ if (!ind)
+ return 0;
+ if (ind != wind)
+ return 0;
+ if (mt != wmt) {
+ /*
+ * Special case if matching lower vs. upper or
+ * vice versa. The transformed characters must match.
+ * We don't need to check the transformation is
+ * the appropriate one for each character separately,
+ * since that was done in pattern_match1(), so just
+ * compare lower-cased versions of both.
+ */
+ if ((mt == PP_LOWER || mt == PP_UPPER) &&
+ (wmt == PP_LOWER || wmt == PP_UPPER)) {
+ if (tulower(c) != tulower(wc))
+ return 0;
+ } else {
+ /* Other different classes can't match. */
+ return 0;
+ }
+ }
+ }
+
+ if (prestrict) {
+ /* We need to assemble the line */
+ if (imeta(c)) {
+ *s++ = Meta;
+ *s++ = c ^ 32;
+ } else {
+ *s++ = c;
+ }
+ prestrict = prestrict->next;
+ } else
+ s += len;
+ ws += wlen;
p = p->next;
wp = wp->next;
}
while (p && *s) {
- if (!p->tab[*((unsigned char *) s)])
+ if (prestrict) {
+ /*
+ * As above, but with even less info to go on.
+ * (Can this happen?) At least handle the cases where
+ * one of our patterns has given us a specific character.
+ */
+ if (prestrict->tp == CPAT_CHAR) {
+ c = prestrict->u.chr;
+ } else {
+ if (p->tp == CPAT_CHAR) {
+ c = p->u.chr;
+ } else {
+ /*
+ * OK. Here we are in a function with just a line
+ * pattern and another pattern to restrict the
+ * characters that can go on the line, and no actual
+ * characters. We're matching two patterns against
+ * one another to generate a character to insert.
+ * This is a bit too psychedelic, so I'm going to
+ * bale out now. See you on the ground.
+ */
+ return 0;
+ }
+ if (!pattern_match1(prestrict, c, &mt))
+ return 0;
+ }
+ } else {
+ if (*s == Meta) {
+ c = STOUC(s[1]) ^ 32;
+ len = 2;
+ } else {
+ c = STOUC(*s);
+ len = 1;
+ }
+ }
+ if (!pattern_match1(p, c, &mt))
return 0;
p = p->next;
- s++;
+ if (prestrict) {
+ if (imeta(c)) {
+ *s++ = Meta;
+ *s++ = c ^ 32;
+ } else {
+ *s++ = c;
+ }
+ prestrict = prestrict->next;
+ } else
+ s += len;
}
while (wp && *ws) {
- if (!wp->tab[*((unsigned char *) ws)])
+ /* No funny business when we only have the word pattern. */
+ if (*ws == Meta) {
+ wc = STOUC(ws[1]) ^ 32;
+ wlen = 2;
+ } else {
+ wc = STOUC(*ws);
+ wlen = 1;
+ }
+ if (!pattern_match1(wp, wc, &wmt))
return 0;
wp = wp->next;
- ws++;
+ ws += wlen;
}
return 1;
}
+/*
+ * The usual version of pattern matching, without the line string
+ * being handled by restriction.
+ */
+/**/
+mod_export int
+pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
+{
+ return pattern_match_restrict(p, s, wp, ws, NULL);
+}
+
/* This splits the given string into a list of cline structs, separated
* at those places where one of the anchors of an `*' pattern was found.
* plen gives the number of characters on the line that matched this
@@ -1256,11 +1562,11 @@ bld_parts(char *str, int len, int plen, Cline *lp, Cline *lprem)
return ret;
}
-/* This builds all the possible line patterns for the pattern pat in the
- * buffer line. Initially line is the same as lp, but during recursive
- * calls lp is incremented for storing successive characters. Whenever
- * a full possible string is build, we test if this line matches the
- * string given by wlen and word.
+
+/*
+ * This builds all the possible line patterns for the pattern pat in the
+ * buffer line. Then we test if this line matches the string given by
+ * wlen and word.
*
* wpat contains pattern that matched previously
* lpat contains the pattern for line we build
@@ -1269,91 +1575,297 @@ bld_parts(char *str, int len, int plen, Cline *lp, Cline *lprem)
*
* The return value is the length of the string matched in the word, it
* is zero if we couldn't build a line that matches the word.
+ *
+ * TODO: a lot of the nastiness associated with variable string
+ * lengths can go when we switch to wide characters. (Why didn't
+ * I just keep line unmetafied and metafy into place at the end? Er...)
*/
-
/**/
static int
-bld_line(Cpattern wpat, Cpattern lpat, char *line, char *lp,
- char *mword, char *word, int wlen, int sfx)
+bld_line(Cmatcher mp, char **linep, char *mword, char *word, int wlen, int sfx)
{
- if (lpat) {
- /* Still working on the pattern. */
-
- int i, l;
- unsigned char c = 0;
-
- /* Get the number of the character for a correspondence class
- * if it has a corresponding class. */
- if (lpat->equiv)
- if (wpat && *mword) {
- c = wpat->tab[STOUC(*mword)];
- wpat = wpat->next;
- mword++;
+ Cpattern lpat = mp->line;
+ Cpattern wpat = mp->word;
+ Cpattern curgenpat;
+ VARARR(struct cpattern, genpatarr, mp->llen);
+ Cmlist ms;
+ int llen, rl;
+ char *oword = word, *line = *linep;
+
+ /*
+ * Loop over all characters. At this stage, line is an empty
+ * space of length llen (not counting the null byte) which we assemble as
+ * we go along.
+ *
+ * However, first we need to know what characters can appear at each
+ * point in the line. For this we assemble an list genpatarr of the
+ * same length as the line. (It's convenient to store this as an
+ * array but it's linked as a list, too.) If there are equivalences
+ * we use mword to derive the equivalent character; when we've
+ * reached the end of mword, equivalences are treated just like
+ * ordinary character classes. For character classes we just attach
+ * the class to the genpatarr list and apply it as a restriction
+ * when we finally match the line against the set of matchers.
+ */
+ curgenpat = genpatarr;
+ while (lpat) {
+ int wchr = (*mword == Meta) ? STOUC(mword[1]) ^ 32 : STOUC(*mword);
+ int wmtp, wind;
+ /*
+ * If the line pattern is an equivalence, query wpat to find the
+ * word part of the equivalence. If we don't find one we don't try
+ * equivalencing but use lpat as an ordinary match. (It's not
+ * entirely clear to me this is the correct behaviour on a
+ * failed character match within the equivalence, but that was
+ * the behaviour of the old logic that this replaces.)
+ */
+ if (lpat->tp == CPAT_EQUIV && wpat && *mword) {
+ wind = pattern_match1(wpat, wchr, &wmtp);
+ wpat = wpat->next;
+ mword += (*mword == Meta) ? 2 : 1;
+ } else
+ wind = 0;
+ if (wind) {
+ /*
+ * Successful match for word side of equivalence.
+ * Find the line equivalent.
+ */
+ int lchr;
+ if ((lchr = pattern_match_equivalence(lpat, wind, wmtp, wchr))
+ == -1) {
+ /*
+ * No equivalent. No possible match; give up.
+ */
+ return 0;
+ }
+ /*
+ * We now have an exact character to match,
+ * so make up a pattern element for it.
+ */
+ curgenpat->tp = CPAT_CHAR;
+ curgenpat->u.chr = lchr;
+ } else {
+ /*
+ * Not an equivalence class, so we just keep the
+ * test in the lpat as it is.
+ */
+ curgenpat->tp = lpat->tp;
+ if (lpat->tp == CPAT_CHAR)
+ curgenpat->u.chr = lpat->u.chr;
+ else if (lpat->tp != CPAT_ANY) {
+ /*
+ * The string isn't modified and is only needed in calls from
+ * this function, so we don't even need to copy it.
+ */
+ curgenpat->u.str = lpat->u.str;
}
+ }
+ lpat = lpat->next;
+ /*
+ * This linked list is defined above as an array.
+ * We could get away with just keeping it as an array
+ * and passing it down as such, but that's a bit icky
+ * since the generic linkage of Cpatterns is as a linked
+ * list and we should keep our local memory management
+ * problems to ourselvess.
+ */
+ if (lpat)
+ curgenpat->next = curgenpat+1;
+ else
+ curgenpat->next = NULL;
+ curgenpat++;
+ }
+ /*
+ * We now know how to match the word with the line patterns; let's
+ * see if it does. We will use the information in curgenpat if we
+ * are successful to work out what character goes on the line. This
+ * is a bit hairy, as in "the Yeti is a creature that is a bit
+ * hairy".
+ */
+ llen = mp->llen;
+ rl = 0;
- /* Walk through the table in the pattern and try the characters
- * that may appear in the current position. */
- for (i = 0; i < 256; i++)
- if ((lpat->equiv && c) ? (c == lpat->tab[i]) : lpat->tab[i]) {
- *lp = i;
- /* We stored the character, now call ourselves to build
- * the rest. */
- if ((l = bld_line(wpat, lpat->next, line, lp + 1,
- mword, word, wlen, sfx)))
- return l;
- }
- } else {
- /* We reached the end, i.e. the line string is fully build, now
- * see if it matches the given word. */
+ if (sfx)
+ {
+ /*
+ * We need to work backwards from the end of both the
+ * word and the line strings.
+ *
+ * Position at the end of the word by counting characters.
+ */
+ int l = wlen;
+ while (l--)
+ word += (*word == Meta) ? 2 : 1;
- Cmlist ms;
- Cmatcher mp;
- int l = lp - line, t, rl = 0, ind, add;
+ /*
+ * We construct the line from the end. We've left
+ * enough space for possible Meta's.
+ */
+ line += 2 * llen;
+ *line = '\0';
+ curgenpat = genpatarr + llen;
+ } else
+ curgenpat = genpatarr;
- /* Quick test if the strings are exactly the same. */
- if (l == wlen && !strncmp(line, word, l))
- return l;
+ /* we now reuse mp, lpat, wpat for the global matchers */
+ while (llen && wlen) {
+ int wchr, wmtp;
+ char *wp;
+ Cpattern tmpgenpat;
if (sfx) {
- line = lp; word += wlen;
- ind = -1; add = -1;
- } else {
- ind = 0; add = 1;
- }
- /* We loop through the whole line string built. */
- while (l && wlen) {
- if (word[ind] == line[ind]) {
- /* The same character in both strings, skip over. */
- line += add; word += add;
- l--; wlen--; rl++;
+ if (word > oword + 1 && word[-2] == Meta)
+ wp = word - 2;
+ else
+ wp = word - 1;
+ curgenpat--;
+ } else
+ wp = word;
+ if (*wp == Meta)
+ wchr = STOUC(wp[1]) ^ 32;
+ else
+ wchr = STOUC(*wp);
+ if (pattern_match1(curgenpat, wchr, &wmtp))
+ {
+ int lchr;
+ /*
+ * We can match the line character directly with the word
+ * character. If the line character is a fixed one,
+ * keep it, since we went to all that trouble above,
+ * else if it's generic, keep the word character,
+ * since we have no choice.
+ */
+ if (curgenpat->tp == CPAT_CHAR)
+ lchr = curgenpat->u.chr;
+ else
+ lchr = wchr;
+ if (imeta(lchr)) {
+ if (sfx)
+ line -= 2;
+ line[0] = Meta;
+ line[1] = lchr ^ 32;
+ if (!sfx)
+ line += 2;
} else {
- t = 0;
- for (ms = bmatchers; ms && !t; ms = ms->next) {
- mp = ms->matcher;
- if (mp && !mp->flags && mp->wlen <= wlen && mp->llen <= l &&
- pattern_match(mp->line, (sfx ? line - mp->llen : line),
- mp->word, (sfx ? word - mp->wlen : word))) {
- /* Both the line and the word pattern matched,
- * now skip over the matched portions. */
+ if (sfx)
+ line--;
+ line[0] = lchr;
+ if (!sfx)
+ line++;
+ }
+
+ llen--;
+ wlen--;
+ rl++;
+
+ if (sfx)
+ word = wp;
+ else {
+ if (llen)
+ curgenpat++;
+ word += (*word == Meta) ? 2 : 1;
+ }
+ }
+ else
+ {
+ char *lp;
+ /*
+ * Need to loop over pattern matchers.
+ */
+ for (ms = bmatchers; ms; ms = ms->next) {
+ mp = ms->matcher;
+ /*
+ * This is the nightmare case: we have line and
+ * and word matchers and some pattern which restricts
+ * the value on the line without us knowing exactly
+ * what it is. Despatch to the special function
+ * for that.
+ */
+ if (mp && !mp->flags && mp->wlen <= wlen &&
+ mp->llen <= llen)
+ {
+ if (sfx) {
+ /*
+ * We haven't assembled the line yet, and with
+ * Meta characters we don't yet know the length.
+ * We'll fix this up later.
+ */
+ lp = line - 2 * mp->llen;
+ } else
+ lp = line;
+ wp = word;
+ if (sfx) {
+ int l = mp->wlen;
+ while (l--) {
+ if (wp > oword + 1 && wp[-2] == Meta)
+ wp -= 2;
+ else
+ wp--;
+ }
+
+ tmpgenpat = curgenpat - mp->llen;
+ } else
+ tmpgenpat = curgenpat;
+ if (pattern_match_restrict(mp->line, lp,
+ mp->word, wp, tmpgenpat)) {
+ /*
+ * Matched: advance over as many characters
+ * of the patterns and strings as
+ * we've done matches.
+ */
if (sfx) {
- line -= mp->llen; word -= mp->wlen;
+ int imove = mp->llen, nchar;
+ char *pmove = lp;
+ word = wp;
+
+ /* Close the gap we left in the line string */
+ while (imove--)
+ pmove += (*pmove == Meta) ? 2 : 1;
+ /* Number of bytes to move */
+ nchar = (int)(pmove - lp);
+ /* The size of the gap */
+ imove = 2 * mp->llen - nchar;
+ if (imove) {
+ lp = line - imove;
+ /* Moving up, so start at the top */
+ while (nchar--)
+ *--line = *--lp;
+ /* line is at the start of the moved text */
+ }
+
+ curgenpat = tmpgenpat;
} else {
- line += mp->llen; word += mp->wlen;
+ int cnt = mp->llen;
+ while (cnt--) {
+ line += (*line == Meta) ? 2 : 1;
+ }
+
+ cnt = mp->wlen;
+ while (cnt--)
+ word += (*word == Meta) ? 2 : 1;
+
+ curgenpat += mp->llen;
}
- l -= mp->llen; wlen -= mp->wlen; rl += mp->wlen;
- t = 1;
+ llen -= mp->llen;
+ wlen -= mp->wlen;
+ rl += mp->wlen;
+ break;
}
}
- if (!t)
- /* Didn't match, give up. */
- return 0;
}
+ if (!ms)
+ return 0; /* Didn't match, give up */
}
- if (!l)
- /* Unmatched portion in the line built, return matched length. */
- return rl;
+ }
+ if (!llen) {
+ /* Unmatched portion in the line built, return matched length. */
+ if (sfx)
+ *linep = line;
+ else
+ *line = '\0';
+ return rl;
}
return 0;
}
@@ -1386,8 +1898,10 @@ join_strs(int la, char *sa, int lb, char *sb)
if ((t = pattern_match(mp->word, sa, NULL, NULL)) ||
pattern_match(mp->word, sb, NULL, NULL)) {
/* It matched one of the strings, t says which one. */
- VARARR(char, line, mp->llen + 1);
- char **ap, **bp;
+ /* TODO: double to allow Meta, not necessary
+ when properly unmetafied */
+ VARARR(char, linearr, 2*mp->llen + 1);
+ char **ap, **bp, *line = linearr;
int *alp, *blp;
if (t) {
@@ -1399,10 +1913,8 @@ join_strs(int la, char *sa, int lb, char *sb)
}
/* Now try to build a string that matches the other
* string. */
- if ((bl = bld_line(mp->word, mp->line, line, line,
- *ap, *bp, *blp, 0))) {
+ if ((bl = bld_line(mp, &line, *ap, *bp, *blp, 0))) {
/* Found one, put it into the return string. */
- line[mp->llen] = '\0';
if (rr <= mp->llen) {
char *or = rs;
@@ -1444,7 +1956,11 @@ join_strs(int la, char *sa, int lb, char *sb)
return rs;
}
-/* This compares the anchors stored in two top-level clines. */
+/*
+ * This compares the anchors stored in two top-level clines.
+ * It returns 1 if the anchors are the same, 2 if they are
+ * compatible (and have been combined in "o"), 0 otherwise.
+ */
/**/
static int
@@ -1591,9 +2107,11 @@ join_sub(Cmdata md, char *str, int len, int *mlen, int sfx, int join)
NULL, NULL)) ||
pattern_match(mp->word, nw - (sfx ? mp->wlen : 0),
NULL, NULL))) {
- VARARR(char, line, mp->llen + 1);
+ /* TODO: doubled to allow Meta, not necessary
+ * when properly unmetafied */
+ VARARR(char, linearr, 2*mp->llen + 1);
int bl;
- char *mw;
+ char *mw, *line = linearr;
/* Then build all the possible lines and see
* if one of them matches the other string. */
@@ -1602,11 +2120,10 @@ join_sub(Cmdata md, char *str, int len, int *mlen, int sfx, int join)
else
mw = nw - (sfx ? mp->wlen : 0);
- if ((bl = bld_line(mp->word, mp->line, line, line,
- mw, (t ? nw : ow), (t ? nl : ol), sfx))) {
+ if ((bl = bld_line(mp, &line, mw, (t ? nw : ow),
+ (t ? nl : ol), sfx))) {
/* Yep, one of the lines matched the other
* string. */
- line[mp->llen] = '\0';
if (t) {
ol = mp->wlen; nl = bl;
diff --git a/Src/Zle/computil.c b/Src/Zle/computil.c
index 9d116b93a..2b3efa776 100644
--- a/Src/Zle/computil.c
+++ b/Src/Zle/computil.c
@@ -3997,6 +3997,239 @@ cfp_test_exact(LinkList names, char **accept, char *skipped)
return (found ? ret : NULL);
}
+
+/*
+ * This code constructs (from heap) and returns a string that
+ * corresponds to a series of matches; when compiled as a pattern, at
+ * each position it matches either the character from the string "add"
+ * or the corresponding single-character match from the set of matchers.
+ * To take a simple case, if add is "a" and the single matcher for the
+ * character position matches "[0-9]", the pattern returned is "[0-9a]".
+ * We take account of equivalences between the word and line, too.
+ *
+ * As there are virtually no comments in this file, I don't really
+ * know why we're doing this, but it's to do with a matcher which
+ * is passed as an argument to the utility compfiles -p/-P.
+ */
+static char *
+cfp_matcher_range(Cmatcher *ms, char *add)
+{
+ Cmatcher *mp, m;
+ int len = 0, mt;
+ char *ret = NULL, *p = NULL, *adds = add;
+
+ /*
+ * Do this twice: once to work out the length of the
+ * string in len, the second time to build it in ret.
+ * This is probably worthwhile because otherwise memory
+ * management is difficult.
+ */
+ for (;;) {
+ for (mp = ms; *add; add++, mp++) {
+ if (!(m = *mp)) {
+ /*
+ * No matcher, so just match the character
+ * itself.
+ *
+ * TODO: surely this needs quoting if it's a
+ * metacharacter?
+ */
+ if (ret) {
+ if (imeta(*add)) {
+ *p++ = Meta;
+ *p++ = *add ^ 32;
+ } else
+ *p++ = *add;
+ } else
+ len += imeta(*add) ? 2 : 1;
+ } else if (m->flags & CMF_RIGHT) {
+ /*
+ * Right-anchored: match anything followed
+ * by the character itself.
+ */
+ if (ret) {
+ *p++ = '*';
+ /* TODO: quote again? */
+ if (imeta(*add)) {
+ *p++ = Meta;
+ *p++ = *add ^ 32;
+ } else
+ *p++ = *add;
+ } else
+ len += imeta(*add) ? 3 : 2;
+ } else {
+ /* The usual set of matcher possibilities. */
+ int ind;
+ if (m->line->tp == CPAT_EQUIV &&
+ m->word->tp == CPAT_EQUIV) {
+ /*
+ * Genuine equivalence. Add the character to match
+ * and the equivalent character from the word
+ * pattern.
+ *
+ * TODO: we could be more careful here with special
+ * cases as we are in the basic character class
+ * code below.
+ */
+ if (ret) {
+ *p++ = '[';
+ if (imeta(*add)) {
+ *p++ = Meta;
+ *p++ = *add ^ 32;
+ } else
+ *p++ = *add;
+ } else
+ len += imeta(*add) ? 3 : 2;
+ if (PATMATCHRANGE(m->line->u.str, CONVCAST(*add),
+ &ind, &mt)) {
+ /*
+ * Find the equivalent match for ind in the
+ * word pattern.
+ */
+ if ((ind = pattern_match_equivalence
+ (m->word, ind, mt, CONVCAST(*add))) != -1) {
+ if (ret) {
+ if (imeta(ind)) {
+ *p++ = Meta;
+ *p++ = ind ^ 32;
+ } else
+ *p++ = ind;
+ } else
+ len += imeta(ind) ? 2 : 1;
+ }
+ }
+ if (ret)
+ *p++ = ']';
+ else
+ len++;
+ } else {
+ int newlen, addadd;
+
+ switch (m->word->tp) {
+ case CPAT_NCLASS:
+ /*
+ * TODO: the old logic implies that we need to
+ * match *add, i.e. it should be deleted from
+ * the set of character's we're not allowed to
+ * match. That's too much like hard work for
+ * now. Indeed, in general it's impossible
+ * without trickery. Consider *add == 'A',
+ * range == "[^[:upper:]]": we would have to
+ * resort to something like "(A|[^[:upper:]])";
+ * and in an expression like that *add may or
+ * may not need backslashing. So we're deep
+ * into see-if-we-can-get-away-without
+ * territory.
+ */
+ if (ret) {
+ *p++ = '[';
+ *p++ = '^';
+ } else
+ len += 2;
+ /*
+ * Convert the compiled range string back
+ * to an ordinary string.
+ */
+ newlen =
+ pattern_range_to_string(m->word->u.str, p);
+ DPUTS(!newlen, "empty character range");
+ if (ret) {
+ p += newlen;
+ *p++ = ']';
+ } else
+ len += newlen + 1;
+ break;
+
+ case CPAT_CCLASS:
+ /*
+ * If there is an equivalence only on one
+ * side it's not equivalent to anything.
+ * Treat it as an ordinary character class.
+ */
+ case CPAT_EQUIV:
+ case CPAT_CHAR:
+ if (ret)
+ *p++ = '[';
+ else
+ len++;
+ /*
+ * We needed to add *add specially only if
+ * it is not covered by the range. This
+ * is necessary for correct syntax---consider
+ * if *add is ] and ] is also the first
+ * character in the range.
+ */
+ addadd = !pattern_match1(m->word, CONVCAST(*add), &mt);
+ if (addadd && *add == ']') {
+ if (ret)
+ *p++ = *add;
+ else
+ len++;
+ }
+ if (m->word->tp == CPAT_CHAR) {
+ /*
+ * The matcher just matches a single
+ * character, but we need to be able
+ * to match *add, too, hence we do
+ * this as a [...].
+ */
+ if (ret) {
+ if (imeta(m->word->u.chr)) {
+ *p++ = Meta;
+ *p++ = m->word->u.chr ^ 32;
+ } else
+ *p++ = m->word->u.chr;
+ } else
+ len += imeta(m->word->u.chr) ? 2 : 1;
+ } else {
+ /*
+ * Convert the compiled range string back
+ * to an ordinary string.
+ */
+ newlen =
+ pattern_range_to_string(m->word->u.str, p);
+ DPUTS(!newlen, "empty character range");
+ if (ret)
+ p += newlen;
+ else
+ len += newlen;
+ }
+ if (addadd && *add != ']') {
+ if (ret) {
+ if (imeta(*add)) {
+ *p++ = Meta;
+ *p++ = *add ^ 32;
+ } else
+ *p++ = *add;
+ } else
+ len += imeta(*add) ? 2 : 1;
+ }
+ if (ret)
+ *p++ = ']';
+ else
+ len++;
+ break;
+
+ case CPAT_ANY:
+ if (ret)
+ *p++ = '?';
+ else
+ len++;
+ break;
+ }
+ }
+ }
+ }
+ if (ret) {
+ *p = '\0';
+ return ret;
+ }
+ p = ret = zhalloc(len + 1);
+ add = adds;
+ }
+}
+
+
static char *
cfp_matcher_pats(char *matcher, char *add)
{
@@ -4064,64 +4297,8 @@ cfp_matcher_pats(char *matcher, char *add)
break;
}
}
- if (*add) {
- char *ret = "", buf[259];
-
- for (mp = ms; *add; add++, mp++) {
- if (!(m = *mp)) {
- buf[0] = *add;
- buf[1] = '\0';
- } else if (m->flags & CMF_RIGHT) {
- buf[0] = '*';
- buf[1] = *add;
- buf[2] = '\0';
- } else {
- unsigned char *t, c;
- char *p = buf;
- int i;
-
- for (i = 256, t = m->word->tab; i--; t++)
- if (*t)
- break;
- if (i) {
- t = m->word->tab;
- *p++ = '[';
- if (m->line->equiv && m->word->equiv) {
- *p++ = *add;
- c = m->line->tab[STOUC(*add)];
- for (i = 0; i < 256; i++)
- if (m->word->tab[i] == c) {
- *p++ = (char) i;
- break;
- }
- } else {
- if (*add == ']' || t[STOUC(']')])
- *p++ = ']';
- for (i = 0; i < 256; i++, t++)
- if (*t && ((char) i) != *add &&
- i != ']' && i != '-' &&
- i != '^' && i != '!')
- *p++ = (char) i;
- *p++ = *add;
- t = m->word->tab;
- if (*add != '^' && t[STOUC('^')])
- *p++ = '^';
- if (*add != '!' && t[STOUC('!')])
- *p++ = '!';
- if (*add != '-' && t[STOUC('-')])
- *p++ = '-';
- }
- *p++ = ']';
- *p = '\0';
- } else {
- *p = '?';
- p[1] = '\0';
- }
- }
- ret = dyncat(ret, buf);
- }
- return ret;
- }
+ if (*add)
+ return cfp_matcher_range(ms, add);
}
return add;
}
diff --git a/Src/pattern.c b/Src/pattern.c
index f7ef7774e..81e4bce8b 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -193,25 +193,6 @@ typedef union upat *Upat;
* v v ^
* ------------------------
*/
-#define PP_ALPHA 1
-#define PP_ALNUM 2
-#define PP_ASCII 3
-#define PP_BLANK 4
-#define PP_CNTRL 5
-#define PP_DIGIT 6
-#define PP_GRAPH 7
-#define PP_LOWER 8
-#define PP_PRINT 9
-#define PP_PUNCT 10
-#define PP_SPACE 11
-#define PP_UPPER 12
-#define PP_XDIGIT 13
-#define PP_IDENT 14
-#define PP_IFS 15
-#define PP_IFSSPACE 16
-#define PP_WORD 17
-#define PP_UNKWN 18
-#define PP_RANGE 19
#define P_OP(p) ((p)->l & 0xff)
#define P_NEXT(p) ((p)->l >> 8)
@@ -1057,6 +1038,127 @@ patgetglobflags(char **strp, long *assertp, int *ignore)
return 1;
}
+
+static const char *colon_stuffs[] = {
+ "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph",
+ "lower", "print", "punct", "space", "upper", "xdigit", "IDENT",
+ "IFS", "IFSSPACE", "WORD", NULL
+};
+
+/*
+ * Handle the guts of a [:stuff:] character class element.
+ * start is the beginning of "stuff" and len is its length.
+ * This code is exported for the benefit of completion matching.
+ */
+
+/**/
+mod_export int
+range_type(char *start, int len)
+{
+ const char **csp;
+
+ for (csp = colon_stuffs; *csp; csp++) {
+ if (!strncmp(start, *csp, len))
+ return (csp - colon_stuffs) + PP_FIRST;
+ }
+
+ return PP_UNKWN;
+}
+
+
+/*
+ * Convert the contents of a [...] or [^...] expression (just the
+ * ... part) back into a string. This is used by compfiles -p/-P
+ * for some reason. The compiled form (a metafied string) is
+ * passed in rangestr.
+ *
+ * If outstr is non-NULL the compiled form is placed there. It
+ * must be sufficiently long. A terminating NULL is appended.
+ *
+ * Return the length required, not including the terminating NULL.
+ *
+ * TODO: this is non-multibyte for now. It will need to be defined
+ * appropriately with MULTIBYTE_SUPPORT when the completion matching
+ * code catches up.
+ */
+
+/**/
+mod_export int
+pattern_range_to_string(char *rangestr, char *outstr)
+{
+ int len = 0;
+
+ while (*rangestr) {
+ if (imeta(STOUC(*rangestr))) {
+ int swtype = STOUC(*rangestr) - STOUC(Meta);
+
+ if (swtype == 0) {
+ /* Ordindary metafied character */
+ if (outstr)
+ {
+ *outstr++ = Meta;
+ *outstr++ = rangestr[1] ^ 32;
+ }
+ len += 2;
+ rangestr += 2;
+ } else if (swtype == PP_RANGE) {
+ /* X-Y range */
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ if (*rangestr == Meta) {
+ if (outstr) {
+ *outstr++ = Meta;
+ *outstr++ = rangestr[1];
+ }
+ len += 2;
+ rangestr += 2;
+ } else {
+ if (outstr)
+ *outstr++ = *rangestr;
+ len++;
+ rangestr++;
+ }
+
+ if (i == 0) {
+ if (outstr)
+ *outstr++ = '-';
+ len++;
+ }
+ }
+ } else if (swtype >= PP_FIRST && swtype <= PP_LAST) {
+ /* [:stuff:]; we need to output [: and :] */
+ const char *found = colon_stuffs[swtype - PP_FIRST];
+ int newlen = strlen(found);
+ if (outstr) {
+ strcpy(outstr, "[:");
+ outstr += 2;
+ memcpy(outstr, found, newlen);
+ outstr += newlen;
+ strcpy(outstr, ":]");
+ outstr += 2;
+ }
+ len += newlen + 4;
+ rangestr++;
+ } else {
+ /* shouldn't happen */
+ DPUTS(1, "BUG: unknown PP_ code in pattern range");
+ rangestr++;
+ }
+ } else {
+ /* ordinary character, guaranteed no Meta handling needed */
+ if (outstr)
+ *outstr++ = *rangestr;
+ len++;
+ rangestr++;
+ }
+ }
+
+ if (outstr)
+ *outstr = '\0';
+ return len;
+}
+
/*
* compile a chunk such as a literal string or a [...] followed
* by a possible hash operator
@@ -1230,45 +1332,10 @@ patcomppiece(int *flagp)
/* Posix range. */
patparse += 2;
len = nptr - patparse;
- if (!strncmp(patparse, "alpha", len))
- ch = PP_ALPHA;
- else if (!strncmp(patparse, "alnum", len))
- ch = PP_ALNUM;
- else if (!strncmp(patparse, "ascii", len))
- ch = PP_ASCII;
- else if (!strncmp(patparse, "blank", len))
- ch = PP_BLANK;
- else if (!strncmp(patparse, "cntrl", len))
- ch = PP_CNTRL;
- else if (!strncmp(patparse, "digit", len))
- ch = PP_DIGIT;
- else if (!strncmp(patparse, "graph", len))
- ch = PP_GRAPH;
- else if (!strncmp(patparse, "lower", len))
- ch = PP_LOWER;
- else if (!strncmp(patparse, "print", len))
- ch = PP_PRINT;
- else if (!strncmp(patparse, "punct", len))
- ch = PP_PUNCT;
- else if (!strncmp(patparse, "space", len))
- ch = PP_SPACE;
- else if (!strncmp(patparse, "upper", len))
- ch = PP_UPPER;
- else if (!strncmp(patparse, "xdigit", len))
- ch = PP_XDIGIT;
- else if (!strncmp(patparse, "IDENT", len))
- ch = PP_IDENT;
- else if (!strncmp(patparse, "IFS", len))
- ch = PP_IFS;
- else if (!strncmp(patparse, "IFSSPACE", len))
- ch = PP_IFSSPACE;
- else if (!strncmp(patparse, "WORD", len))
- ch = PP_WORD;
- else
- ch = PP_UNKWN;
+ ch = range_type(patparse, len);
patparse = nptr + 2;
if (ch != PP_UNKWN)
- patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN);
+ patadd(NULL, STOUC(Meta) + ch, 1, PA_NOALIGN);
continue;
}
charstart = patparse;
@@ -1276,7 +1343,7 @@ patcomppiece(int *flagp)
if (*patparse == '-' && patparse[1] &&
patparse[1] != Outbrack) {
- patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN);
+ patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN);
if (itok(*charstart)) {
patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
PA_NOALIGN);
@@ -2369,19 +2436,19 @@ patmatch(Upat prog)
wchar_t cr = CHARREF(patinput, patinend);
char *scanop = (char *)P_OPERAND(scan);
if (patglobflags & GF_MULTIBYTE) {
- if (mb_patmatchrange(scanop, cr) ^
+ if (mb_patmatchrange(scanop, cr, NULL, NULL) ^
(P_OP(scan) == P_ANYOF))
fail = 1;
else
CHARINC(patinput, patinend);
- } else if (patmatchrange(scanop, (int)cr) ^
+ } else if (patmatchrange(scanop, (int)cr, NULL, NULL) ^
(P_OP(scan) == P_ANYOF))
fail = 1;
else
CHARINC(patinput, patinend);
#else
if (patmatchrange((char *)P_OPERAND(scan),
- CHARREF(patinput, patinend)) ^
+ CHARREF(patinput, patinend), NULL, NULL) ^
(P_OP(scan) == P_ANYOF))
fail = 1;
else
@@ -3122,12 +3189,33 @@ patmatch(Upat prog)
/**/
#ifdef MULTIBYTE_SUPPORT
+/*
+ * See if character ch matches a pattern range specification.
+ * The null-terminated specification is in range; the test
+ * character is in ch.
+ *
+ * indptr is used by completion matching, which is why this
+ * function is exported. If indptr is not NULL we set *indptr
+ * to the index of the character in the range string, adjusted
+ * in the case of "A-B" ranges such that A would count as its
+ * normal index (say IA), B would count as IA + (B-A), and any
+ * character within the range as appropriate. We're not strictly
+ * guaranteed this fits within a wint_t, but if this is Unicode
+ * in 32 bits we have a fair amount of distance left over.
+ *
+ * mtp is used in the same circumstances. *mtp returns the match type:
+ * 0 for a standard character, else the PP_ index. It's not
+ * useful if the match failed.
+ */
+
/**/
-static int
-mb_patmatchrange(char *range, wchar_t ch)
+mod_export int
+mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp)
{
wchar_t r1, r2;
+ if (indptr)
+ *indptr = 0;
/*
* Careful here: unlike other strings, range is a NULL-terminated,
* metafied string, because we need to treat the Posix and hyphenated
@@ -3135,7 +3223,10 @@ mb_patmatchrange(char *range, wchar_t ch)
*/
while (*range) {
if (imeta(STOUC(*range))) {
- switch (STOUC(*range++) - STOUC(Meta)) {
+ int swtype = STOUC(*range++) - STOUC(Meta);
+ if (mtp)
+ *mtp = swtype;
+ switch (swtype) {
case 0:
/* ordinary metafied character */
range--;
@@ -3214,8 +3305,19 @@ mb_patmatchrange(char *range, wchar_t ch)
case PP_RANGE:
r1 = metacharinc(&range);
r2 = metacharinc(&range);
- if (r1 <= ch && ch <= r2)
+ if (r1 <= ch && ch <= r2) {
+ if (indptr)
+ *indptr += ch - r1;
return 1;
+ }
+ /* Careful not to screw up counting with bogus range */
+ if (indptr && r1 < r2) {
+ /*
+ * This gets incremented again below to get
+ * us past the range end. This is correct.
+ */
+ *indptr += r2 - r1;
+ }
break;
case PP_UNKWN:
DPUTS(1, "BUG: unknown posix range passed through.\n");
@@ -3224,21 +3326,130 @@ mb_patmatchrange(char *range, wchar_t ch)
DPUTS(1, "BUG: unknown metacharacter in range.");
break;
}
- } else if (metacharinc(&range) == ch)
+ } else if (metacharinc(&range) == ch) {
+ if (mtp)
+ *mtp = 0;
return 1;
+ }
+ if (indptr)
+ (*indptr)++;
}
return 0;
}
+
+#if 0
+/*
+ * This is effectively the reverse of mb_patmatchrange().
+ * Given a range descriptor of the same form, and an index into it,
+ * try to determine the character that is matched. If the index
+ * points to a [:...:] generic style match, set chr to WEOF and
+ * return the type in mtp instead. Return 1 if successful, 0 if
+ * there was no corresponding index. Note all pointer arguments
+ * must be non-null.
+ *
+ * TODO: for now the completion matching code does not handle
+ * multibyte. When it does, we will need either this, or
+ * patmatchindex(), but not both---unlike user-initiated pattern
+ * matching, multibyte mode in the line editor is always on when available.
+ */
+
/**/
+mod_export int
+mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp)
+{
+ wchar_t r1, r2, rchr;
+ wint_t rdiff;
+
+ *chr = WEOF;
+ *mtp = 0;
+
+ while (*range) {
+ if (imeta(STOUC(*range))) {
+ int swtype = STOUC(*range++) - STOUC(Meta);
+ switch (swtype) {
+ case 0:
+ range--;
+ rchr = metacharinc(&range);
+ if (!ind) {
+ *chr = (wint_t) rchr;
+ return 1;
+ }
+ break;
+
+ case PP_ALPHA:
+ case PP_ALNUM:
+ case PP_ASCII:
+ case PP_BLANK:
+ case PP_CNTRL:
+ case PP_DIGIT:
+ case PP_GRAPH:
+ case PP_LOWER:
+ case PP_PRINT:
+ case PP_PUNCT:
+ case PP_SPACE:
+ case PP_UPPER:
+ case PP_XDIGIT:
+ case PP_IDENT:
+ case PP_IFS:
+ case PP_IFSSPACE:
+ case PP_WORD:
+ if (!ind) {
+ *mtp = swtype;
+ return 1;
+ }
+ break;
+
+ case PP_RANGE:
+ r1 = metacharinc(&range);
+ r2 = metacharinc(&range);
+ rdiff = (wint_t)r2 - (wint_t)r1;
+ if (rdiff >= ind) {
+ *chr = (wint_t)r1 + ind;
+ return 1;
+ }
+ /* note the extra decrement to ind below */
+ ind -= rdiff;
+ break;
+ case PP_UNKWN:
+ DPUTS(1, "BUG: unknown posix range passed through.\n");
+ break;
+ default:
+ DPUTS(1, "BUG: unknown metacharacter in range.");
+ break;
+ }
+ } else {
+ rchr = metacharinc(&range);
+ if (!ind) {
+ *chr = (wint_t)rchr;
+ return 1;
+ }
+ }
+ if (!ind--)
+ break;
+ }
+
+ /* No corresponding index. */
+ return 0;
+}
#endif
/**/
-static int
-patmatchrange(char *range, int ch)
+#endif
+
+/*
+ * Identical function to mb_patmatchrange() above for single-byte
+ * characters.
+ */
+
+/**/
+mod_export int
+patmatchrange(char *range, int ch, int *indptr, int *mtp)
{
int r1, r2;
+ if (indptr)
+ *indptr = 0;
/*
* Careful here: unlike other strings, range is a NULL-terminated,
* metafied string, because we need to treat the Posix and hyphenated
@@ -3246,7 +3457,10 @@ patmatchrange(char *range, int ch)
*/
for (; *range; range++) {
if (imeta(STOUC(*range))) {
- switch (STOUC(*range)-STOUC(Meta)) {
+ int swtype = STOUC(*range) - STOUC(Meta);
+ if (mtp)
+ *mtp = swtype;
+ switch (swtype) {
case 0:
if (STOUC(*++range ^ 32) == ch)
return 1;
@@ -3326,8 +3540,13 @@ patmatchrange(char *range, int ch)
r2 = STOUC(UNMETA(range));
if (*range == Meta)
range++;
- if (r1 <= ch && ch <= r2)
+ if (r1 <= ch && ch <= r2) {
+ if (indptr)
+ *indptr += ch - r1;
return 1;
+ }
+ if (indptr && r1 < r2)
+ *indptr += r2 - r1;
break;
case PP_UNKWN:
DPUTS(1, "BUG: unknown posix range passed through.\n");
@@ -3336,9 +3555,100 @@ patmatchrange(char *range, int ch)
DPUTS(1, "BUG: unknown metacharacter in range.");
break;
}
- } else if (STOUC(*range) == ch)
+ } else if (STOUC(*range) == ch) {
+ if (mtp)
+ *mtp = 0;
return 1;
+ }
+ if (indptr)
+ (*indptr)++;
+ }
+ return 0;
+}
+
+/*
+ * Identical function to mb_patmatchindex() above for single-byte
+ * characters. Here -1 represents a character that needs a special type.
+ */
+
+/**/
+mod_export int
+patmatchindex(char *range, int ind, int *chr, int *mtp)
+{
+ int r1, r2, rdiff, rchr;
+
+ *chr = -1;
+ *mtp = 0;
+
+ for (; *range; range++) {
+ if (imeta(STOUC(*range))) {
+ int swtype = STOUC(*range) - STOUC(Meta);
+ switch (swtype) {
+ case 0:
+ /* ordinary metafied character */
+ rchr = STOUC(*++range) ^ 32;
+ if (!ind) {
+ *chr = rchr;
+ return 1;
+ }
+ break;
+
+ case PP_ALPHA:
+ case PP_ALNUM:
+ case PP_ASCII:
+ case PP_BLANK:
+ case PP_CNTRL:
+ case PP_DIGIT:
+ case PP_GRAPH:
+ case PP_LOWER:
+ case PP_PRINT:
+ case PP_PUNCT:
+ case PP_SPACE:
+ case PP_UPPER:
+ case PP_XDIGIT:
+ case PP_IDENT:
+ case PP_IFS:
+ case PP_IFSSPACE:
+ case PP_WORD:
+ if (!ind) {
+ *mtp = swtype;
+ return 1;
+ }
+ break;
+
+ case PP_RANGE:
+ range++;
+ r1 = STOUC(UNMETA(range));
+ METACHARINC(range);
+ r2 = STOUC(UNMETA(range));
+ if (*range == Meta)
+ range++;
+ rdiff = r2 - r1;
+ if (rdiff >= ind) {
+ *chr = r1 + ind;
+ return 1;
+ }
+ /* note the extra decrement to ind below */
+ ind -= rdiff;
+ break;
+ case PP_UNKWN:
+ DPUTS(1, "BUG: unknown posix range passed through.\n");
+ break;
+ default:
+ DPUTS(1, "BUG: unknown metacharacter in range.");
+ break;
+ }
+ } else {
+ if (!ind) {
+ *chr = STOUC(*range);
+ return 1;
+ }
+ }
+ if (!ind--)
+ break;
}
+
+ /* No corresponding index. */
return 0;
}
@@ -3382,14 +3692,14 @@ static int patrepeat(Upat p, char *charstart)
#ifdef MULTIBYTE_SUPPORT
wchar_t cr = CHARREF(scan, patinend);
if (patglobflags & GF_MULTIBYTE) {
- if (mb_patmatchrange(opnd, cr) ^
+ if (mb_patmatchrange(opnd, cr, NULL, NULL) ^
(P_OP(p) == P_ANYOF))
break;
- } else if (patmatchrange(opnd, (int)cr) ^
+ } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^
(P_OP(p) == P_ANYOF))
break;
#else
- if (patmatchrange(opnd, CHARREF(scan, patinend)) ^
+ if (patmatchrange(opnd, CHARREF(scan, patinend), NULL, NULL) ^
(P_OP(p) == P_ANYOF))
break;
#endif
diff --git a/Src/zsh.h b/Src/zsh.h
index e40761199..b5de54bc9 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -1307,6 +1307,48 @@ struct patprog {
#define PAT_HAS_EXCLUDP 0x0800 /* (internal): top-level path1~path2. */
#define PAT_LCMATCHUC 0x1000 /* equivalent to setting (#l) */
+/*
+ * Special match types used in character classes. These
+ * are represented as tokens, with Meta added. The character
+ * class is represented as a metafied string, with only these
+ * tokens special. Note that an active leading "!" or "^" for
+ * negation is not part of the string but is flagged in the
+ * surrounding context.
+ *
+ * These types are also used in character and equivalence classes
+ * in completion matching.
+ *
+ * This must be kept ordered by the array colon_stuffs in pattern.c.
+ */
+/* Special value for first definition */
+#define PP_FIRST 1
+/* POSIX-defined types: [:alpha:] etc. */
+#define PP_ALPHA 1
+#define PP_ALNUM 2
+#define PP_ASCII 3
+#define PP_BLANK 4
+#define PP_CNTRL 5
+#define PP_DIGIT 6
+#define PP_GRAPH 7
+#define PP_LOWER 8
+#define PP_PRINT 9
+#define PP_PUNCT 10
+#define PP_SPACE 11
+#define PP_UPPER 12
+#define PP_XDIGIT 13
+/* Zsh additions: [:IDENT:] etc. */
+#define PP_IDENT 14
+#define PP_IFS 15
+#define PP_IFSSPACE 16
+#define PP_WORD 17
+/* Special value for last definition */
+#define PP_LAST 17
+
+/* Unknown type. Not used in a valid token. */
+#define PP_UNKWN 18
+/* Range: token followed by the (possibly multibyte) start and end */
+#define PP_RANGE 19
+
/* Globbing flags: lower 8 bits gives approx count */
#define GF_LCMATCHUC 0x0100
#define GF_IGNCASE 0x0200