5 files changed, 82 insertions, 14 deletions
diff --git a/ChangeLog b/ChangeLog
index 222789bc2..a3f194dee 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2010-03-22  Peter Stephenson  <p.w.stephenson@ntlworld.com>
+
+	* 27812: Doc/Zsh/zle.yo, Src/Zle/zle.h, Src/Zle/zle_refresh.c,
+	Src/Zle/zle_utils.c: when wchar_t contains Unicode code points,
+	use private area to put bytes that don't form characters for
+	special display.
+
 2010-03-22  Peter Stephenson  <pws@csr.com>
 
 	* 27822: Src/hist.c, Src/lex.c, Src/zle_params.c,
@@ -12949,5 +12956,5 @@
 
 *****************************************************
 * This is used by the shell to define $ZSH_PATCHLEVEL
-* $Revision: 1.4941 $
+* $Revision: 1.4942 $
 *****************************************************
diff --git a/Doc/Zsh/zle.yo b/Doc/Zsh/zle.yo
index 91c13a563..0e2fea5bd 100644
--- a/Doc/Zsh/zle.yo
+++ b/Doc/Zsh/zle.yo
@@ -2286,6 +2286,20 @@ angle brackets.  The number is the code point of the character in the wide
 character set; this may or may not be Unicode, depending on the operating
 system.
 )
+item(Invalid multibyte characters)(
+If the tt(MULTIBYTE) option is in effect, any sequence of one or more
+bytes that does not form a valid character in the current character
+set is treated as a series of bytes each shown as a special character.
+This case can be distinguished from other unprintable characters
+as the bytes are represented as two hexadecimal digits between angle
+brackets, as distinct from the four or eight digits that are used for
+unprintable characters that are nonetheless valid in the current
+character set.
+
+Not all systems support this: for it to work, the system's representation of
+wide characters must be code values from the Universal Character Set,
+as defined by ISO 10646 (also known as Unicode).
+)
 enditem()
 
 If tt(zle_highlight) is not set or no value applies to a particular
diff --git a/Src/Zle/zle.h b/Src/Zle/zle.h
index 577a4442f..32f3e59f6 100644
--- a/Src/Zle/zle.h
+++ b/Src/Zle/zle.h
@@ -419,6 +419,20 @@ typedef struct {
 typedef REFRESH_ELEMENT *REFRESH_STRING;
 
 
+#if defined(MULTIBYTE_SUPPORT) && defined(__STDC_ISO_10646__)
+#define ZSH_INVALID_WCHAR_BASE	(0xe000U) /* first of 256 wchar_t values that stand in for raw bytes 0x00-0xff */
+#define ZSH_INVALID_WCHAR_TEST(x) /* nonzero if x lies in BASE .. BASE+255 */			\
+    ((unsigned)(x) >= ZSH_INVALID_WCHAR_BASE &&		\
+     (unsigned)(x) <= (ZSH_INVALID_WCHAR_BASE + 255u))
+#define ZSH_INVALID_WCHAR_TO_CHAR(x) /* recover the byte as a char; x must pass the TEST macro */			\
+    ((char)((unsigned)(x) - ZSH_INVALID_WCHAR_BASE))
+#define ZSH_INVALID_WCHAR_TO_INT(x) /* recover the byte as an int offset from BASE */			\
+    ((int)((unsigned)(x) - ZSH_INVALID_WCHAR_BASE))
+#define ZSH_CHAR_TO_INVALID_WCHAR(x) /* map a byte (via STOUC) to its stand-in wchar_t */		\
+    ((wchar_t)(STOUC(x) + ZSH_INVALID_WCHAR_BASE))
+#endif /* MULTIBYTE_SUPPORT && __STDC_ISO_10646__ */
+
+
 #ifdef DEBUG
 #define METACHECK()		\
 	DPUTS(zlemetaline == NULL, "line not metafied")
diff --git a/Src/Zle/zle_refresh.c b/Src/Zle/zle_refresh.c
index 8604317f3..352dcf0d6 100644
--- a/Src/Zle/zle_refresh.c
+++ b/Src/Zle/zle_refresh.c
@@ -1263,7 +1263,11 @@ zrefresh(void)
 	    }
 	}
 #ifdef MULTIBYTE_SUPPORT
-	else if (iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
+	else if (
+#ifdef __STDC_ISO_10646__
+		 !ZSH_INVALID_WCHAR_TEST(*t) &&
+#endif
+		 iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
 	    int ichars;
 	    if (width > rpms.sen - rpms.s) {
 		int started = 0;
@@ -1367,6 +1371,12 @@ zrefresh(void)
 	    wchar_t wc;
 	    int started = 0;
 
+#ifdef __STDC_ISO_10646__
+	    if (ZSH_INVALID_WCHAR_TEST(*t)) {
+		int c = ZSH_INVALID_WCHAR_TO_INT(*t);
+		sprintf(dispchars, "<%.02x>", c);
+	    } else
+#endif
 	    if ((unsigned)*t > 0xffffU) {
 		sprintf(dispchars, "<%.08x>", (unsigned)*t);
 	    } else {
diff --git a/Src/Zle/zle_utils.c b/Src/Zle/zle_utils.c
index 2b2da7dcd..cc84eb8bb 100644
--- a/Src/Zle/zle_utils.c
+++ b/Src/Zle/zle_utils.c
@@ -120,11 +120,19 @@ zlecharasstring(ZLE_CHAR_T inchar, char *buf)
     size_t ret;
     char *ptr;
 
-    ret = wctomb(buf, inchar);
-    if (ret <= 0) {
-	/* Ick. */
-	buf[0] = '?';
-	return 1;
+#ifdef __STDC_ISO_10646__
+    if (ZSH_INVALID_WCHAR_TEST(inchar)) {
+	buf[0] = ZSH_INVALID_WCHAR_TO_CHAR(inchar);
+	ret = 1;
+    } else
+#endif
+    {
+	ret = wctomb(buf, inchar);
+	if (ret <= 0) {
+	    /* Ick. */
+	    buf[0] = '?';
+	    return 1;
+	}
     }
     ptr = buf + ret - 1;
     for (;;) {
@@ -196,13 +204,20 @@ zlelineasstring(ZLE_STRING_T instr, int inll, int incs, int *outllp,
     for (i=0; i < inll; i++, incs--) {
 	if (incs == 0)
 	    outcs = mb_len;
-	j = wcrtomb(s + mb_len, instr[i], &mbs);
-	if (j == -1) {
-	    /* invalid char; what to do? */
-	    s[mb_len++] = ZWC('?');
-	    memset(&mbs, 0, sizeof(mbs));
-	} else {
-	    mb_len += j;
+#ifdef __STDC_ISO_10646__
+	if (ZSH_INVALID_WCHAR_TEST(instr[i])) {
+	    s[mb_len++] = ZSH_INVALID_WCHAR_TO_CHAR(instr[i]);
+	} else
+#endif
+	{
+	    j = wcrtomb(s + mb_len, instr[i], &mbs);
+	    if (j == -1) {
+		/* invalid char */
+		s[mb_len++] = ZWC('?');
+		memset(&mbs, 0, sizeof(mbs));
+	    } else {
+		mb_len += j;
+	    }
 	}
     }
     if (incs == 0)
@@ -332,6 +347,13 @@ stringaszleline(char *instr, int incs, int *outll, int *outsz, int *outcs)
 	while (ll > 0) {
 	    size_t cnt = mbrtowc(outptr, inptr, ll, &mbs);
 
+#ifdef __STDC_ISO_10646__
+	    if (cnt == MB_INCOMPLETE || cnt == MB_INVALID) {
+		/* Use private encoding for invalid single byte */
+		*outptr = ZSH_CHAR_TO_INVALID_WCHAR(*inptr);
+		cnt = 1;
+	    }
+#else
 	    /*
 	     * At this point we don't handle either incomplete (-2) or
 	     * invalid (-1) multibyte sequences.  Use the current length
@@ -339,6 +361,7 @@ stringaszleline(char *instr, int incs, int *outll, int *outsz, int *outcs)
 	     */
 	    if (cnt == MB_INCOMPLETE || cnt == MB_INVALID)
 		break;
+#endif
 
 	    if (cnt == 0) {
 		/* Converting '\0' returns 0, but a '\0' is a real