summary refs log tree commit diff
diff options
context:
space:
mode:
author Peter Stephenson <pws@users.sourceforge.net> 2006-06-27 16:28:45 +0000
committer Peter Stephenson <pws@users.sourceforge.net> 2006-06-27 16:28:45 +0000
commit f95a6a913c885932827e9c0219221f7de7ccdd79 (patch)
tree 88e6d63cc94c4d5f0b934e12e0ad6b399bda9660
parent ec1274f0a3c7f4d65ba7d1cda95a209159ed9ad4 (diff)
download zsh-f95a6a913c885932827e9c0219221f7de7ccdd79.tar.gz
zsh-f95a6a913c885932827e9c0219221f7de7ccdd79.zip
22524: searchable parameter subscripts with multibyte chars,
a few other fixes
-rw-r--r-- ChangeLog 3
-rw-r--r-- Src/params.c 191
-rw-r--r-- Test/D07multibyte.ztst 36
3 files changed, 206 insertions, 24 deletions
diff --git a/ChangeLog b/ChangeLog
index f293c01ff..34fc634ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2006-06-27 Peter Stephenson <pws@csr.com>
+ * 22524: Src/params.c, Test/D07multibyte.ztst: searchable
+ subscripts with multibyte characters.
+
* 22522: Src/utils.c: infinite loop with invalid character
in mb_metastrlen().
diff --git a/Src/params.c b/Src/params.c
index c5bfc79f5..f589a740e 100644
--- a/Src/params.c
+++ b/Src/params.c
@@ -934,11 +934,13 @@ isident(char *s)
* If supplied they are set to the length of the character before
* the index position and the one at the index position. If
* multibyte characters are not in use they are set to 1 for
- * consistency.
+ * consistency. Note they aren't fully handled if a2 is non-zero,
+ * since they aren't needed.
*
* Returns a raw offset into the value from the start or end (i.e.
* after the arithmetic for Meta and possible multibyte characters has
- * been taken into account).
+ * been taken into account). This actually gives the offset *after*
+ * the character in question; subtract *prevcharlen if necessary.
*/
/**/
@@ -1178,16 +1180,23 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w,
t += (lastcharlen = MB_METACHARLEN(t));
/* for consistency, keep any remainder off the end */
r = (zlong)(t - s) + nchars;
- if (prevcharlen)
+ if (prevcharlen && !nchars /* ignore if off the end */)
*prevcharlen = lastcharlen;
if (nextcharlen && *t)
*nextcharlen = MB_METACHARLEN(t);
+ } else if (r == 0) {
+ if (prevcharlen)
+ *prevcharlen = 0;
+ if (nextcharlen && *s) {
+ MB_METACHARINIT();
+ *nextcharlen = MB_METACHARLEN(s);
+ }
} else {
zlong nchars = (zlong)MB_METASTRLEN(s) + r;
if (nchars < 0) {
- /* invalid but keep index anyway */
- r = nchars;
+ /* make sure this isn't valid as a raw pointer */
+ r -= (zlong)strlen(s);
} else {
MB_METACHARINIT();
for (t = s; nchars && *t; nchars--)
@@ -1300,57 +1309,188 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w,
}
return a2 ? -1 : 0;
} else {
+ /* Searching characters */
+ int slen;
d = getstrvalue(v);
if (!d || !*d)
return 0;
- len = strlen(d);
+ /*
+ * beg and len are character counts, not raw offsets.
+ * Remember we need to return a raw offset.
+ */
+ len = MB_METASTRLEN(d);
+ slen = strlen(d);
if (beg < 0)
beg += len;
+ MB_METACHARINIT();
if (beg >= 0 && beg < len) {
- char *de = d + len;
+ char *de = d + slen;
if (a2) {
+ /*
+ * Second argument: we don't need to
+ * handle prevcharlen or nextcharlen, but
+ * we do need to handle characters appropriately.
+ */
if (down) {
+ int nmatches = 0;
+ char *lastpos = NULL;
+
if (!hasbeg)
beg = len;
- for (r = beg, t = d + beg; t >= d; r--, t--) {
+
+ /*
+ * See below: we have to move forward,
+ * but need to count from the end.
+ */
+ for (t = d, r = 0; r <= beg; r++) {
sav = *t;
*t = '\0';
- if (pattry(pprog, d)
- && !--num) {
- *t = sav;
- return r;
+ if (pattry(pprog, d)) {
+ nmatches++;
+ lastpos = t;
}
*t = sav;
+ if (t == de)
+ break;
+ t += MB_METACHARLEN(t);
}
- } else
- for (r = beg, t = d + beg; t <= de; r++, t++) {
+
+ if (nmatches >= num) {
+ if (num > 1) {
+ nmatches -= num;
+ MB_METACHARINIT();
+ for (t = d, r = 0; ; r++) {
+ sav = *t;
+ *t = '\0';
+ if (pattry(pprog, d) &&
+ nmatches-- == 0) {
+ lastpos = t;
+ *t = sav;
+ break;
+ }
+ *t = sav;
+ t += MB_METACHARLEN(t);
+ }
+ }
+ /* else lastpos is already OK */
+
+ return lastpos - d;
+ }
+ } else {
+ /*
+ * This handling of the b flag
+ * gives odd results, but this is the
+ * way it's always worked.
+ */
+ for (t = d; beg && t <= de; beg--)
+ t += MB_METACHARLEN(t);
+ for (;;) {
sav = *t;
*t = '\0';
- if (pattry(pprog, d) &&
- !--num) {
+ if (pattry(pprog, d) && !--num) {
*t = sav;
- return r;
+ /*
+ * This time, don't increment
+ * pointer, since it's already
+ * after everything we matched.
+ */
+ return t - d;
}
*t = sav;
+ if (t == de)
+ break;
+ t += MB_METACHARLEN(t);
}
+ }
} else {
+ /*
+ * First argument: this is the only case
+ * where we need prevcharlen and nextcharlen.
+ */
+ int lastcharlen;
+
if (down) {
+ int nmatches = 0;
+ char *lastpos = NULL;
+
if (!hasbeg)
beg = len;
+
+ /*
+ * We can only move forward through
+ * multibyte strings, so record the
+ * matches.
+ * Unfortunately the count num works
+ * from the end, so it's easy to get the
+ * last one but we need to repeat if
+ * we want another one.
+ */
+ for (t = d, r = 0; r <= beg; r++) {
+ if (pattry(pprog, t)) {
+ nmatches++;
+ lastpos = t;
+ }
+ if (t == de)
+ break;
+ t += MB_METACHARLEN(t);
+ }
+
+ if (nmatches >= num) {
+ if (num > 1) {
+ /*
+ * Need to start again and repeat
+ * to get the right match.
+ */
+ nmatches -= num;
+ MB_METACHARINIT();
+ for (t = d, r = 0; ; r++) {
+ if (pattry(pprog, t) &&
+ nmatches-- == 0) {
+ lastpos = t;
+ break;
+ }
+ t += MB_METACHARLEN(t);
+ }
+ }
+ /* else lastpos is already OK */
+
+ /* return pointer after matched char */
+ lastpos +=
+ (lastcharlen = MB_METACHARLEN(lastpos));
+ if (prevcharlen)
+ *prevcharlen = lastcharlen;
+ if (nextcharlen)
+ *nextcharlen = MB_METACHARLEN(lastpos);
+ return lastpos - d;
+ }
+
for (r = beg + 1, t = d + beg; t >= d; r--, t--) {
if (pattry(pprog, t) &&
!--num)
return r;
}
- } else
- for (r = beg + 1, t = d + beg; t <= de; r++, t++)
- if (pattry(pprog, t) &&
- !--num)
- return r;
+ } else {
+ for (t = d; beg && t <= de; beg--)
+ t += MB_METACHARLEN(t);
+ for (;;) {
+ if (pattry(pprog, t) && !--num) {
+ /* return pointer after matched char */
+ t += (lastcharlen = MB_METACHARLEN(t));
+ if (prevcharlen)
+ *prevcharlen = lastcharlen;
+ if (nextcharlen)
+ *nextcharlen = MB_METACHARLEN(t);
+ return t - d;
+ }
+ if (t == de)
+ break;
+ t += MB_METACHARLEN(t);
+ }
+ }
}
}
- return down ? 0 : len + 1;
+ return down ? 0 : slen + 1;
}
}
}
@@ -1429,9 +1569,12 @@ getindex(char **pptr, Value v, int dq)
}
}
/* if start was too big, keep the difference */
- start = nstart + (target - p) + startprevlen;
+ start = nstart + (target - p) + 1;
} else {
zlong startoff = start + strlen(t);
+#ifdef DEBUG
+ dputs("BUG: can't have negative inverse offsets???");
+#endif
if (startoff < 0) {
/* invalid: keep index but don't dereference */
start = startoff;
diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst
index 56d7fb7fd..a1937cc68 100644
--- a/Test/D07multibyte.ztst
+++ b/Test/D07multibyte.ztst
@@ -82,6 +82,42 @@
>x
>9 9 x t
+ s=é
+ print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
+0:Out of range subscripts with multibyte characters
+>AA BéB CéC DéD EE
+
print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
0:Reverse indexing with multibyte characters
>2 4 éné
+
+ print ${a[(r)én,(r)éb]}
+0:Subscript searching with multibyte characters
+>énéb
+
+ print ${a[(rb:1:)é,-1]}
+ print ${a[(rb:2:)é,-1]}
+ print ${a[(rb:3:)é,-1]}
+ print ${a[(rb:4:)é,-1]}
+ print ${a[(rb:5:)é,-1]}
+0:Subscript searching with initial offset
+>énébreux
+>énébreux
+>ébreux
+>ébreux
+>
+
+ print ${a[(rn:1:)é,-1]}
+ print ${a[(rn:2:)é,-1]}
+ print ${a[(rn:3:)é,-1]}
+0:Subscript searching with count
+>énébreux
+>ébreux
+>
+
+ print ${a[(R)én,(R)éb]}
+0:Backward subscript searching with multibyte characters
+>énéb
+
+# Starting offsets with (R) seem to be so strange as to be hardly
+# worth testing.