summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog4
-rw-r--r--Src/params.c152
-rw-r--r--Src/utils.c106
-rw-r--r--Src/zsh.h6
-rw-r--r--Test/B02typeset.ztst17
5 files changed, 259 insertions, 26 deletions
diff --git a/ChangeLog b/ChangeLog
index 5a95c227a..4e1c50e2c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2006-06-26 Peter Stephenson <pws@csr.com>
+ * 22518: Src/params.c, Src/utils.c, Src/zsh.h,
+ Test/B02typeset.ztst: Initial go at making parameter subscripts
+ use multibyte characters.
+
* 22516: Src/parse.c: error evaluating "func()" didn't pop
the command stack.
diff --git a/Src/params.c b/Src/params.c
index 7deee4288..c5bfc79f5 100644
--- a/Src/params.c
+++ b/Src/params.c
@@ -918,9 +918,33 @@ isident(char *s)
return !ss[1];
}
+/*
+ * Parse a single argument to a parameter subscript.
+ * The subscript starts at *str; *str is updated (input/output)
+ *
+ * *inv is set to indicate if the subscript is reversed (output)
+ * v is the Value for the parameter being accessed (input; note
+ * v->isarr may be modified, and if v is a hash the parameter will
+ * be updated to the element of the hash)
+ * a2 is 1 if this is the second subscript of a range (input)
+ * *w is only set if we need to find the end of a word (input; should
+ * be set to 0 by the caller).
+ *
+ * The final two arguments are to support multibyte characters.
+ * If supplied they are set to the length of the character before
+ * the index position and the one at the index position. If
+ * multibyte characters are not in use they are set to 1 for
+ * consistency.
+ *
+ * Returns a raw offset into the value from the start or end (i.e.
+ * after the arithmetic for Meta and possible multibyte characters has
+ * been taken into account).
+ */
+
/**/
static zlong
-getarg(char **str, int *inv, Value v, int a2, zlong *w)
+getarg(char **str, int *inv, Value v, int a2, zlong *w,
+ int *prevcharlen, int *nextcharlen)
{
int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash;
int keymatch = 0, needtok = 0;
@@ -929,6 +953,10 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w)
Patprog pprog = NULL;
ishash = (v->pm && PM_TYPE(v->pm->node.flags) == PM_HASHED);
+ if (prevcharlen)
+ *prevcharlen = 1;
+ if (nextcharlen)
+ *nextcharlen = 1;
/* first parse any subscription flags */
if (v->pm && (*s == '(' || *s == Inpar)) {
@@ -1133,17 +1161,43 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w)
return (a2 ? s : d + 1) - t;
} else if (!v->isarr && !word) {
+ int lastcharlen = 1;
s = getstrvalue(v);
+ /*
+ * Note for the confused (= pws): the index r we
+ * have so far is that specified by the user. The value
+ * passed back is an offset from the start or end of
+ * the string. Hence it needs correcting at least
+ * for Meta characters and maybe for multibyte characters.
+ */
if (r > 0) {
- for (t = s + r - 1; *s && s < t;)
- if (*s++ == Meta)
- s++, t++, r++;
+ zlong nchars = r;
+
+ MB_METACHARINIT();
+ for (t = s; nchars && *t; nchars--)
+ t += (lastcharlen = MB_METACHARLEN(t));
+ /* for consistency, keep any remainder off the end */
+ r = (zlong)(t - s) + nchars;
+ if (prevcharlen)
+ *prevcharlen = lastcharlen;
+ if (nextcharlen && *t)
+ *nextcharlen = MB_METACHARLEN(t);
} else {
- r += ztrlen(s);
- for (t = s + r; *s && s < t; r--)
- if (*s++ == Meta)
- t++, r++;
- r -= strlen(s);
+ zlong nchars = (zlong)MB_METASTRLEN(s) + r;
+
+ if (nchars < 0) {
+ /* invalid but keep index anyway */
+ r = nchars;
+ } else {
+ MB_METACHARINIT();
+ for (t = s; nchars && *t; nchars--)
+ t += (lastcharlen = MB_METACHARLEN(t));
+ r = - (zlong)strlen(t); /* keep negative */
+ if (prevcharlen)
+ *prevcharlen = lastcharlen;
+ if (nextcharlen && *t)
+ *nextcharlen = MB_METACHARLEN(t);
+ }
}
}
} else {
@@ -1338,19 +1392,57 @@ getindex(char **pptr, Value v, int dq)
s += 2;
} else {
zlong we = 0, dummy;
+ int startprevlen, startnextlen;
- start = getarg(&s, &inv, v, 0, &we);
+ start = getarg(&s, &inv, v, 0, &we, &startprevlen, &startnextlen);
if (inv) {
if (!v->isarr && start != 0) {
char *t, *p;
t = getstrvalue(v);
+ /*
+ * Note for the confused (= pws): this is an inverse
+ * offset so at this stage we need to convert from
+ * the immediate offset into the value that we have
+ * into a logical character position.
+ */
if (start > 0) {
- for (p = t + start - 1; p-- > t; )
- if (*p == Meta)
- start--;
- } else
- start = -ztrlen(t + start + strlen(t));
+ int nstart = 0;
+ char *target = t + start - startprevlen;
+
+ p = t;
+ MB_METACHARINIT();
+ while (*p) {
+ /*
+ * move up characters, counting how many we
+ * found
+ */
+ p += MB_METACHARLEN(p);
+ if (p < target)
+ nstart++;
+ else {
+ if (p == target)
+ nstart++;
+ else
+ p = target; /* pretend we hit exactly */
+ break;
+ }
+ }
+ /* if start was too big, keep the difference */
+ start = nstart + (target - p) + startprevlen;
+ } else {
+ zlong startoff = start + strlen(t);
+ if (startoff < 0) {
+ /* invalid: keep index but don't dereference */
+ start = startoff;
+ } else {
+ /* find start in full characters */
+ MB_METACHARINIT();
+ for (p = t; p < t + startoff;)
+ p += MB_METACHARLEN(p);
+ start = - MB_METASTRLEN(p);
+ }
+ }
}
if (start > 0 && (isset(KSHARRAYS) || (v->pm->node.flags & PM_HASHED)))
start--;
@@ -1373,15 +1465,21 @@ getindex(char **pptr, Value v, int dq)
if ((com = (*s == ','))) {
s++;
- end = getarg(&s, &inv, v, 1, &dummy);
+ end = getarg(&s, &inv, v, 1, &dummy, NULL, NULL);
} else {
end = we ? we : start;
}
- if (start != end) com = 1;
+ if (start != end)
+ com = 1;
+ /*
+ * Somehow the logic sometimes forces us to use the previous
+ * or next character to what we would expect, which is
+ * why we had to calculate them in getarg().
+ */
if (start > 0)
- start--;
+ start -= startprevlen;
else if (start == 0 && end == 0)
- end++;
+ end = startnextlen;
if (s == tbrack) {
s++;
if (v->isarr && !com &&
@@ -1578,13 +1676,19 @@ getstrvalue(Value v)
if (v->start < 0)
v->start = 0;
}
- if (v->end < 0)
- v->end += strlen(s) + 1;
+ if (v->end < 0) {
+ v->end += strlen(s);
+ if (v->end >= 0) {
+ char *eptr = s + v->end;
+ if (*eptr)
+ v->end += MB_METACHARLEN(eptr);
+ }
+ }
s = (v->start > (int)strlen(s)) ? dupstring("") : dupstring(s + v->start);
if (v->end <= v->start)
s[0] = '\0';
else if (v->end - v->start <= (int)strlen(s))
- s[v->end - v->start + (s[v->end - v->start - 1] == Meta)] = '\0';
+ s[v->end - v->start] = '\0';
return s;
}
@@ -2791,7 +2895,7 @@ char *
tiedarrgetfn(Param pm)
{
struct tieddata *dptr = (struct tieddata *)pm->u.data;
- return *dptr->arrptr ? zjoin(*dptr->arrptr, dptr->joinchar, 1) : "";
+ return *dptr->arrptr ? zjoin(*dptr->arrptr, STOUC(dptr->joinchar), 1) : "";
}
/**/
@@ -3463,7 +3567,7 @@ arrfixenv(char *s, char **t)
return;
if (pm->node.flags & PM_TIED)
- joinchar = ((struct tieddata *)pm->u.data)->joinchar;
+ joinchar = STOUC(((struct tieddata *)pm->u.data)->joinchar);
else
joinchar = ':';
diff --git a/Src/utils.c b/Src/utils.c
index 2d7bf2ec6..583945ac5 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -3683,6 +3683,112 @@ mb_width(const char *s)
return width;
}
+static mbstate_t mb_shiftstate;
+
+/*
+ * Reset the shared multibyte shift state (mb_shiftstate) to all zeroes:
+ * call this before starting a sequence of mb_metacharlen() calls.
+ */
+
+/**/
+void
+mb_metacharinit(void)
+{
+ memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+}
+
+/*
+ * Length in bytes of the next multibyte character in metafied string s,
+ * Meta bytes included.  If no valid character is found (invalid input, or
+ * incomplete at end of string) the shift state is reset and a single
+ * (possibly metafied) byte is counted, so the result never reaches past
+ * the end of a correctly metafied string.  Uses the shared mb_shiftstate.
+ */
+
+/**/
+int
+mb_metacharlen(char *s)
+{
+ char inchar, *ptr;
+ size_t ret;
+ wchar_t wc;
+
+ if (!isset(MULTIBYTE))
+ return 1 + (*s == Meta); /* one byte, plus the Meta marker if present */
+
+ ret = MB_INVALID;
+ for (ptr = s; *ptr; ) {
+ if (*ptr == Meta)
+ inchar = *++ptr ^ 32; /* byte following Meta is XORed with 32 */
+ else
+ inchar = *ptr;
+ ptr++;
+ ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate); /* feed one byte at a time */
+
+ if (ret == MB_INVALID)
+ break;
+ if (ret == MB_INCOMPLETE)
+ continue; /* need more bytes to complete this character */
+ return ptr - s; /* complete character: bytes consumed, Meta included */
+ }
+
+ /* No valid multibyte sequence: discard any partial conversion state */
+ memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+ if (ptr > s)
+ return 1 + (*s == Meta); /* Treat as single byte character */
+ else
+ return 0; /* s was empty; probably shouldn't happen */
+}
+
+/*
+ * Number of multibyte characters in metafied string ptr: the same count
+ * as calling mb_metacharlen() repeatedly until the end of the string,
+ * with each invalid byte and each trailing incomplete byte counting as one.
+ */
+
+/**/
+int
+mb_metastrlen(char *ptr)
+{
+ char inchar, *laststart;
+ size_t ret;
+ wchar_t wc;
+ int num, num_in_char;
+
+ if (!isset(MULTIBYTE))
+ return ztrlen(ptr);
+
+ laststart = ptr; /* start of the character currently being decoded */
+ ret = MB_INVALID;
+ num = num_in_char = 0;
+
+ memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+ while (*ptr) {
+ if (*ptr == Meta)
+ inchar = *++ptr ^ 32; /* byte following Meta is XORed with 32 */
+ else
+ inchar = *ptr;
+ ptr++;
+ ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
+
+ if (ret == MB_INCOMPLETE) {
+ num_in_char++; /* bytes consumed so far of an unfinished character */
+ } else {
+ if (ret == MB_INVALID) {
+ /* Reset, treat as single character */
+ memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+ ptr = laststart + (*laststart == Meta) + 1;
+ }
+ laststart = ptr; /* always advance, else repeated invalid bytes loop forever */
+ num++;
+ num_in_char = 0;
+ }
+ }
+
+ /* If incomplete, treat remainder as trailing single bytes */
+ return num + num_in_char;
+}
+
/**/
#endif /* MULTIBYTE_SUPPORT */
diff --git a/Src/zsh.h b/Src/zsh.h
index 092e05c0c..31609d3c5 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -1926,6 +1926,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
#ifdef MULTIBYTE_SUPPORT
#define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0)
+#define MB_METACHARINIT() mb_metacharinit()
+#define MB_METACHARLEN(str) mb_metacharlen(str)
+#define MB_METASTRLEN(str) mb_metastrlen(str)
#define MB_INCOMPLETE ((size_t)-2)
#define MB_INVALID ((size_t)-1)
@@ -1946,6 +1949,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
#define ZWS(s) L ## s
#else
+#define MB_METACHARINIT()
+#define MB_METACHARLEN(str) (*(str) == Meta ? 2 : 1)
+#define MB_METASTRLEN(str) ztrlen(str)
/* Leave character or string as is. */
#define ZWC(c) c
diff --git a/Test/B02typeset.ztst b/Test/B02typeset.ztst
index fc9d9c161..bbc00a2ea 100644
--- a/Test/B02typeset.ztst
+++ b/Test/B02typeset.ztst
@@ -182,13 +182,26 @@
>l o c a l
>l:o:c:a l o c a
+ (setopt NO_multibyte cbases
+ LC_ALL=C 2>/dev/null
typeset -T SCALAR=$'l\x83o\x83c\x83a\x83l' array $'\x83'
print $array
typeset -U SCALAR
- print $SCALAR $array
+ for (( i = 1; i <= ${#SCALAR}; i++ )); do
+ char=$SCALAR[i]
+ print $(( [#16] #char ))
+ done
+ print $array)
0:Tied parameters and uniquified arrays with meta-character as separator
>l o c a l
->lƒoƒcƒa l o c a
+>0x6C
+>0x83
+>0x6F
+>0x83
+>0x63
+>0x83
+>0x61
+>l o c a
typeset -T SCALAR=$'l\000o\000c\000a\000l' array $'\000'
typeset -U SCALAR