summaryrefslogtreecommitdiff
path: root/Src/utils.c
diff options
context:
space:
mode:
authorJun-ichi Takimoto <takimoto-j@kba.biglobe.ne.jp>2023-09-27 01:56:47 +0900
committerJun-ichi Takimoto <takimoto-j@kba.biglobe.ne.jp>2023-09-27 01:56:47 +0900
commit02e33c54d85208c7d9b96d91a26d04069ff19ed2 (patch)
tree2d4feecc28498ffd5cac25dbec09a319de380109 /Src/utils.c
parente4e9afe373479076ee448b16944a421836ba5a40 (diff)
downloadzsh-02e33c54d85208c7d9b96d91a26d04069ff19ed2.tar.gz
zsh-02e33c54d85208c7d9b96d91a26d04069ff19ed2.zip
52169: a few more improvements of (#) flag
Fix (#X) in the C locale on FreeBSD, DragonFly and NetBSD. Negative values such as ${(#X):--1} are now an error. UCS4 is limited to < 0x8000_0000 (on OSes without __STDC_ISO_10646__).
Diffstat (limited to 'Src/utils.c')
-rw-r--r--Src/utils.c230
1 files changed, 114 insertions, 116 deletions
diff --git a/Src/utils.c b/Src/utils.c
index 7040d0954..7028c155f 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -6672,11 +6672,14 @@ dquotedzputs(char const *s, FILE *stream)
# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__)
/* Convert a character from UCS4 encoding to UTF-8 */
-static size_t
+static int
ucs4toutf8(char *dest, unsigned int wval)
{
- size_t len;
+ int len;
+ /* UCS4 is now equivalent to UTF-32 and limited to 0 - 0x10_FFFF.
+ * This function accepts 0 - 0x7FFF_FFFF (old range of UCS4) to be
+ * compatible with wctomb(3) (in UTF-8 locale) on Linux. */
if (wval < 0x80)
len = 1;
else if (wval < 0x800)
@@ -6687,8 +6690,12 @@ ucs4toutf8(char *dest, unsigned int wval)
len = 4;
else if (wval < 0x4000000)
len = 5;
- else
+ else if (wval < 0x80000000)
len = 6;
+ else {
+ zerr("character not in range");
+ return -1;
+ }
switch (len) { /* falls through except to the last case */
case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6;
@@ -6705,30 +6712,89 @@ ucs4toutf8(char *dest, unsigned int wval)
}
#endif
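+/* Convert a UCS4 value to a multibyte character in the current locale.
+ * The result is saved in buf (must be at least MB_CUR_MAX bytes long).
+ * Returns the number of bytes saved in buf, or -1 if the conversion
+ * fails; every failure path reports an error with zerr().
+ * NOTE(review): the UTF-8 and iconv paths may store up to 6 bytes;
+ * confirm that callers' buffers are large enough when MB_CUR_MAX < 6. */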
-/*
- * The following only occurs once or twice in the code, but in different
- * places depending how character set conversion is implemented.
- */
-#define CHARSET_FAILED() \
- if (how & GETKEY_DOLLAR_QUOTE) { \
- while ((*tdest++ = *++s)) { \
- if (how & GETKEY_UPDATE_OFFSET) { \
- if (s - sstart > *misc) \
- (*misc)++; \
- } \
- if (*s == Snull) { \
- *len = (s - sstart) + 1; \
- *tdest = '\0'; \
- return buf; \
- } \
- } \
- *len = tdest - buf; \
- return buf; \
- } \
- *t = '\0'; \
- *len = t - buf; \
- return buf
+/**/
+int
+ucs4tomb(unsigned int wval, char *buf)
+{
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
+ int count = wctomb(buf, (wchar_t)wval);
+ if (count == -1)
+ zerr("character not in range");
+ return count;
+#else /* !(HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__) */
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
+ if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
+ return ucs4toutf8(buf, wval);
+ } else {
+# ifdef HAVE_ICONV
+ iconv_t cd;
+ char inbuf[4], *bsave = buf;
+ ICONV_CONST char *inptr = inbuf;
+ size_t inbytes = 4, outbytes = 6;
+ const char *codesetstr = nl_langinfo(CODESET);
+ size_t count;
+ int i;
+
+ /*
+ * If the code set isn't handled, we'd better assume it's US-ASCII
+ * rather than just failing hopelessly. Solaris has a weird habit
+ * of returning 646. This is handled by the native iconv(), but
+ * not by GNU iconv; what's more, some versions of the native iconv
+ * don't handle standard names like ASCII.
+ *
+ * This should only be a problem if there's a mismatch between the
+ * NLS and the iconv in use, which probably only means if libiconv
+ * is in use. We checked at configure time if our libraries pulled
+ * in _libiconv_version, which should be a good test.
+ *
+ * It shouldn't ever be NULL, but while we're being paranoid...
+ */
+# ifdef ICONV_FROM_LIBICONV
+ if (!codesetstr || !*codesetstr)
+ codesetstr = "US-ASCII";
+# endif
+ cd = iconv_open(codesetstr, "UCS-4BE");
+# ifdef ICONV_FROM_LIBICONV
+ if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) {
+ codesetstr = "US-ASCII";
+ cd = iconv_open(codesetstr, "UCS-4BE");
+ }
+# endif
+ if (cd == (iconv_t)-1) {
+ zerr("cannot do charset conversion (iconv failed)");
+ return -1;
+ }
+
+ /* store value in big endian form */
+ for (i=3; i>=0; i--) {
+ inbuf[i] = wval & 0xff;
+ wval >>= 8;
+ }
+ count = iconv(cd, &inptr, &inbytes, &buf, &outbytes);
+ iconv_close(cd);
+ if (count) {
+ /* (size_t)-1 indicates an error. A positive value is the number
+ * of "invalid" (or "non-reversible") conversions, which we treat
+ * as "out-of-range" characters. */
+ zerr("character not in range");
+ return -1;
+ }
+ return buf - bsave;
+# else /* !HAVE_ICONV */
+ zerr("cannot do charset conversion (iconv not available)");
+ return -1;
+# endif /* HAVE_ICONV */
+ }
+# else /* !(HAVE_NL_LANGINFO && CODESET) */
+ zerr("cannot do charset conversion (NLS not supported)");
+ return -1;
+# endif /* HAVE_NL_LANGINFO && CODESET */
+#endif /* HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__ */
+}
/*
* Decode a key string, turning it into the literal characters.
@@ -6785,21 +6851,6 @@ getkeystring(char *s, int *len, int how, int *misc)
char *t, *tdest = NULL, *u = NULL, *sstart = s, *tbuf = NULL;
char svchar = '\0';
int meta = 0, control = 0, ignoring = 0;
- int i;
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
- wint_t wval;
- int count;
-#else
- unsigned int wval;
-# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
-# if defined(HAVE_ICONV)
- iconv_t cd;
- char inbuf[4];
- size_t inbytes, outbytes;
-# endif
- size_t count;
-# endif
-#endif
DPUTS((how & GETKEY_UPDATE_OFFSET) &&
(how & ~(GETKEYS_DOLLARS_QUOTE|GETKEY_UPDATE_OFFSET)),
@@ -6864,7 +6915,8 @@ getkeystring(char *s, int *len, int how, int *misc)
}
for (; *s; s++) {
if (*s == '\\' && s[1]) {
- int miscadded;
+ int miscadded, count, i;
+ unsigned int wval;
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) {
(*misc)--;
miscadded = 1;
@@ -6979,86 +7031,32 @@ getkeystring(char *s, int *len, int how, int *misc)
*misc = wval;
return s+1;
}
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
- count = wctomb(t, (wchar_t)wval);
+ count = ucs4tomb(wval, t);
if (count == -1) {
- zerr("character not in range");
- CHARSET_FAILED();
+ if (how & GETKEY_DOLLAR_QUOTE) {
+ while ((*tdest++ = *++s)) {
+ if (how & GETKEY_UPDATE_OFFSET) {
+ if (s - sstart > *misc)
+ (*misc)++;
+ }
+ if (*s == Snull) {
+ *len = (s - sstart) + 1;
+ *tdest = '\0';
+ return buf;
+ }
+ }
+ *len = tdest - buf;
+ }
+ else {
+ *t = '\0';
+ *len = t - buf;
+ }
+ return buf;
}
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
(*misc) += count;
t += count;
-# else
-# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
- if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
- count = ucs4toutf8(t, wval);
- t += count;
- if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
- (*misc) += count;
- } else {
-# ifdef HAVE_ICONV
- ICONV_CONST char *inptr = inbuf;
- const char *codesetstr = nl_langinfo(CODESET);
- inbytes = 4;
- outbytes = 6;
- /* store value in big endian form */
- for (i=3;i>=0;i--) {
- inbuf[i] = wval & 0xff;
- wval >>= 8;
- }
- /*
- * If the code set isn't handled, we'd better
- * assume it's US-ASCII rather than just failing
- * hopelessly. Solaris has a weird habit of
- * returning 646. This is handled by the
- * native iconv(), but not by GNU iconv; what's
- * more, some versions of the native iconv don't
- * handle standard names like ASCII.
- *
- * This should only be a problem if there's a
- * mismatch between the NLS and the iconv in use,
- * which probably only means if libiconv is in use.
- * We checked at configure time if our libraries
- * pulled in _libiconv_version, which should be
- * a good test.
- *
- * It shouldn't ever be NULL, but while we're
- * being paranoid...
- */
-#ifdef ICONV_FROM_LIBICONV
- if (!codesetstr || !*codesetstr)
- codesetstr = "US-ASCII";
-#endif
- cd = iconv_open(codesetstr, "UCS-4BE");
-#ifdef ICONV_FROM_LIBICONV
- if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) {
- codesetstr = "US-ASCII";
- cd = iconv_open(codesetstr, "UCS-4BE");
- }
-#endif
- if (cd == (iconv_t)-1) {
- zerr("cannot do charset conversion (iconv failed)");
- CHARSET_FAILED();
- }
- count = iconv(cd, &inptr, &inbytes, &t, &outbytes);
- iconv_close(cd);
- if (count == (size_t)-1) {
- zerr("character not in range");
- CHARSET_FAILED();
- }
- if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
- (*misc) += count;
-# else
- zerr("cannot do charset conversion (iconv not available)");
- CHARSET_FAILED();
-# endif
- }
-# else
- zerr("cannot do charset conversion (NLS not supported)");
- CHARSET_FAILED();
-# endif
-# endif
if (how & GETKEY_DOLLAR_QUOTE) {
char *t2;
for (t2 = tbuf; t2 < t; t2++) {