20822: Initial code for Unicode/multibyte input

20823: Debugging test in stat wrong for 64-bit systems
2025-09-08 12:01:21 +02:00 · 2005-02-18 13:57:25 +00:00 · 2005-02-18 13:57:25 +00:00 · d7c13fb2c3
commit d7c13fb2c3
parent 294ef9e872
15 changed files with 479 additions and 162 deletions
--- a/10
+++ b/10
@ -1,5 +1,15 @@
 2005-02-18  Peter Stephenson  <pws@csr.com>

+	* 20823: Src/Modules/stat.c: debugging test failed on 64-bit
+	machines.
+
+	* 20822: system.h, Src/Zle/complist.c, Src/Zle/deltochar.c,
+	Src/Zle/zle.h, Src/Zle/zle_hist.c, Src/Zle/zle_keymap.c,
+	Src/Zle/zle_main.c, Src/Zle/zle_misc.c, Src/Zle/zle_move.c,
+	Src/Zle/zle_thingy.c, Src/Zle/zle_tricky.c, Src/Zle/zle_utils.c,
+	Src/Zle/zle_vi.c: improve input ready for multibyte/Unicode
+	handling, massaging use of getkey() and lastchar.
+
 	* Motoi Washida: users/8522: Completion/Darwin/Command/_defaults:
 	Fix quotation in Darwin completion.

--- a/Src/Modules/stat.c
+++ b/Src/Modules/stat.c
@ -239,7 +239,7 @@ statprint(struct stat *sbuf, char *outbuf, char *fname, int iwhich, int flags)
 #ifdef INO_T_IS_64_BIT
 	convbase(optr, sbuf->st_ino, 0);
 #else
-	DPUTS(sizeof(sbuf->st_ino) > 4,
+	DPUTS(sizeof(sbuf->st_ino) > sizeof(unsigned long),
 	      "Shell compiled with wrong ino_t size");
 	statulprint((unsigned long)sbuf->st_ino, optr);
 #endif
@ -269,7 +269,7 @@ statprint(struct stat *sbuf, char *outbuf, char *fname, int iwhich, int flags)
 #ifdef OFF_T_IS_64_BIT
 	convbase(optr, sbuf->st_size, 0);
 #else
-	DPUTS(sizeof(sbuf->st_size) > 4,
+	DPUTS(sizeof(sbuf->st_size) > sizeof(unsigned long),
 	      "Shell compiled with wrong off_t size");
 	statulprint((unsigned long)sbuf->st_size, optr);
 #endif
--- a/Src/Zle/complist.c
+++ b/Src/Zle/complist.c
@ -1869,6 +1869,10 @@ msearch(Cmatch **ptr, int ins, int back, int rep, int *wrapp)
    msearchpush(ptr, back);

    if (ins) {
+	/*
+	 * TODO: probably need to convert back to multibyte character
+	 * string?  Who knows...
+	 */
        s[0] = lastchar;
        s[1] = '\0';

@ -2802,9 +2806,7 @@ domenuselect(Hookdef dummy, Chdata dat)
                    }
                }
                if (cmd == Th(z_selfinsertunmeta)) {
-                    lastchar &= 0x7f;
-                    if (lastchar == '\r')
-                        lastchar = '\n';
+		    fixunmeta();
                }
                wrap = 0;
                np = msearch(p, ins, (ins ? (mode == MM_BSEARCH) : back),
--- a/Src/Zle/deltochar.c
+++ b/Src/Zle/deltochar.c
@ -37,7 +37,8 @@ static Widget w_zaptochar;
 static int
 deltochar(UNUSED(char **args))
 {
-    int c = getkey(0), dest = zlecs, ok = 0, n = zmult;
+    ZLE_INT_T c = getfullchar(0);
+    int dest = zlecs, ok = 0, n = zmult;
    int zap = (bindk->widget == w_zaptochar);

    if (n > 0) {
--- a/Src/Zle/zle.h
+++ b/Src/Zle/zle.h
@ -27,6 +27,75 @@
 *
 */

+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * Wide character build: the editor line is an array of wchar_t.
+ * ZLE_INT_T has to hold any ZLE_CHAR_T as well as the distinct
+ * end-of-input value ZLEEOF (WEOF), so it is wint_t, not wchar_t.
+ */
+typedef wchar_t ZLE_CHAR_T;
+typedef wchar_t *ZLE_STRING_T;
+typedef wint_t  ZLE_INT_T;
+#define ZLE_CHAR_SIZE	sizeof(wchar_t)
+
+/*
+ * MB_CUR_MAX is the maximum number of bytes that a single wide
+ * character will convert into.  We use it to keep strings
+ * sufficiently long.  It should always be defined, but if it isn't
+ * just assume we are using Unicode which requires 6 characters.
+ * (Note that it's not necessarily defined to a constant.)
+ */
+#ifndef MB_CUR_MAX
+#define MB_CUR_MAX 6
+#endif
+
+#define ZLENL	L'\n'
+#define ZLENUL	L'\0'
+#define ZLETAB	L'\t'
+
+#define DIGIT_1		L'1'
+#define DIGIT_9		L'9'
+#define LETTER_a	L'a'
+#define LETTER_z	L'z'
+#define LETTER_A	L'A'
+#define LETTER_Z	L'Z'
+#define LETTER_y	L'y'
+#define LETTER_n	L'n'
+
+#define ZLENULSTR	L""
+#define ZLEEOF	WEOF
+#define ZS_memcpy wmemcpy
+#define ZS_memmove wmemmove
+#define ZC_icntrl iswcntrl
+
+/* The most recent complete character: here the wide one. */
+#define LASTFULLCHAR	lastchar_wide
+
+#else  /* Not ZLE_UNICODE_SUPPORT: old single-byte code */
+
+/*
+ * Single-byte build: the line is an array of unsigned char and a
+ * plain int is wide enough for any character plus EOF.
+ */
+typedef int ZLE_CHAR_T;
+typedef unsigned char *ZLE_STRING_T;
+typedef int ZLE_INT_T;
+#define ZLE_CHAR_SIZE	sizeof(unsigned char)
+
+#define ZLENL	'\n'
+#define ZLENUL	'\0'
+#define ZLETAB	'\t'
+
+#define DIGIT_1		'1'
+#define DIGIT_9		'9'
+#define LETTER_a	'a'
+#define LETTER_z	'z'
+#define LETTER_A	'A'
+#define LETTER_Z	'Z'
+#define LETTER_y	'y'
+#define LETTER_n	'n'
+
+#define ZLENULSTR	""
+#define ZLEEOF	EOF
+#define ZS_memcpy memcpy
+#define ZS_memmove memmove
+#define ZC_icntrl icntrl
+
+/* The most recent complete character: here simply the last byte. */
+#define LASTFULLCHAR	lastchar
+
+#endif
+
+
 typedef struct widget *Widget;
 typedef struct thingy *Thingy;

--- a/Src/Zle/zle_hist.c
+++ b/Src/Zle/zle_hist.c
@ -420,11 +420,12 @@ endofhistory(UNUSED(char **args))
 int
 insertlastword(char **args)
 {
-    int n, nwords, histstep = -1, wordpos = 0, deleteword = 0;
+    int n, nwords, histstep = -1, wordpos = 0, deleteword = 0, len, sz;
    char *s, *t;
    Histent he = NULL;
    LinkList l = NULL;
    LinkNode node;
+    ZLE_STRING_T zs;

    static char *lastinsert;
    static int lasthist, lastpos, lastlen;
@ -554,7 +555,10 @@ insertlastword(char **args)
    memcpy(lastinsert, s, lastlen);
    n = zmult;
    zmult = 1;
-    doinsert(s);
+
+    zs = stringaszleline((unsigned char *)s, &len, &sz);
+    doinsert(zs, len);
+    zfree(zs, sz);
    zmult = n;
    *t = save;
    return 0;
@ -780,7 +784,7 @@ doisearch(char **args, int dir)
 	char *arg;
 	savekeys = kungetct;
 	arg = getkeystring(*args, &len, 2, NULL);
-	ungetkeys(arg, len);
+	ungetbytes(arg, len);
    }

    strcpy(ibuf, ISEARCH_PROMPT);
@ -951,18 +955,23 @@ doisearch(char **args, int dir)
 		sbuf[sbptr] = '^';
 		zrefresh();
 	    }
-	    if ((lastchar = getkey(0)) == EOF)
+	    if (getfullchar(0) == ZLEEOF)
 		feep = 1;
 	    else
 		goto ins;
 	} else {
 	    if(cmd == Th(z_selfinsertunmeta)) {
-		lastchar &= 0x7f;
-		if(lastchar == '\r')
-		    lastchar = '\n';
-	    } else if (cmd == Th(z_magicspace))
-		lastchar = ' ';
-	    else if (cmd != Th(z_selfinsert)) {
+		fixunmeta();
+	    } else if (cmd == Th(z_magicspace)) {
+		fixmagicspace();
+	    } else if (cmd == Th(z_selfinsert)) {
+#ifdef ZLE_UNICODE_SUPPORT
+		/*
+		 * lastchar may only be the first byte of a multibyte
+		 * character; read the remainder so that lastchar_wide
+		 * is valid.
+		 */
+		if (!lastchar_wide_valid)
+		    getrestchar(lastchar);
+#else
+		;
+#endif
+	    } else {
 		ungetkeycmd();
 		if (cmd == Th(z_sendbreak))
 		    sbptr = 0;
@ -979,6 +988,8 @@ doisearch(char **args, int dir)
 		sbuf = ibuf + FIRST_SEARCH_CHAR;
 		sibuf *= 2;
 	    }
+	    /* TODO: use lastchar_wide if available, convert back to
+	     * multibyte string.  Yuk.  */
 	    sbuf[sbptr++] = lastchar;
 	}
 	if (feep)
@ -1093,7 +1104,7 @@ getvisrchstr(void)
 	    break;
 	}
 	if(cmd == Th(z_magicspace)) {
-	    lastchar = ' ';
+	    fixmagicspace();
 	    cmd = Th(z_selfinsert);
 	}
 	if(cmd == Th(z_redisplay)) {
@ -1128,15 +1139,20 @@ getvisrchstr(void)
 		sbuf[sptr] = '^';
 		zrefresh();
 	    }
-	    if ((lastchar = getkey(0)) == EOF)
+	    if (getfullchar(0) == ZLEEOF)
 		feep = 1;
 	    else
 		goto ins;
 	} else if(cmd == Th(z_selfinsertunmeta) || cmd == Th(z_selfinsert)) {
 	    if(cmd == Th(z_selfinsertunmeta)) {
-		lastchar &= 0x7f;
-		if(lastchar == '\r')
-		    lastchar = '\n';
+		fixunmeta();
+	    } else {
+#ifdef ZLE_UNICODE_SUPPORT
+		if (!lastchar_wide_valid)
+		    getrestchar(lastchar);
+#else
+		;
+#endif
 	    }
 	  ins:
 	    if(sptr == ssbuf - 1) {
@ -1144,6 +1160,7 @@ getvisrchstr(void)
 		strcpy(newbuf, sbuf);
 		statusline = sbuf = newbuf;
 	    }
+	    /* TODO: may be wide char, convert back to multibyte string */
 	    sbuf[sptr++] = lastchar;
 	} else {
 	    feep = 1;
--- a/Src/Zle/zle_keymap.c
+++ b/Src/Zle/zle_keymap.c
@ -1272,7 +1272,21 @@ getkeymapcmd(Keymap km, Thingy *funcp, char **strp)

    keybuflen = 0;
    keybuf[0] = 0;
-    while((lastchar = getkeybuf(!!lastlen)) != EOF) {
+    /*
+     * getkeybuf returns multibyte strings, which may not
+     * yet correspond to complete wide characters, regardless
+     * of the locale.  This is because we can't be sure whether
+     * the key bindings and keyboard input always return such
+     * characters.  So we always look up bindings for each
+     * chunk of string.  Intelligence within self-insert tries
+     * to fix up insertion of real wide characters properly.
+     *
+     * Note that this does not stop the user binding wide characters to
+     * arbitrary functions, just so long as the string used in the
+     * argument to bindkey is in the correct form for the locale.
+     * That's beyond our control.
+     */
+    while(getkeybuf(!!lastlen) != EOF) {
 	char *s;
 	Thingy f;
 	int loc = 1;
@ -1296,7 +1310,7 @@ getkeymapcmd(Keymap km, Thingy *funcp, char **strp)
 	lastchar = lastc;
    if(lastlen != keybuflen) {
 	unmetafy(keybuf + lastlen, &keybuflen);
-	ungetkeys(keybuf+lastlen, keybuflen);
+	ungetbytes(keybuf+lastlen, keybuflen);
 	if(vichgflag)
 	    vichgbufptr -= keybuflen;
 	keybuf[lastlen] = 0;
@ -1306,11 +1320,24 @@ getkeymapcmd(Keymap km, Thingy *funcp, char **strp)
    return keybuf;
 }

+/*
+ * Add a (possibly metafied) byte to the key input so far.
+ * This handles individual bytes of a multibyte string separately;
+ * see note in getkeymapcmd.  Hence there is no wide character
+ * support at this level.
+ *
+ * TODO: Need to be careful about whether we return EOF in the
+ * middle of a wide character.  However, I think we're OK since
+ * EOF and 0xff are distinct and we're reading bytes from the
+ * lower level, so EOF really does mean something went wrong.  Even so,
+ * I'm worried enough to leave this note here for now.
+ */
+
 /**/
 static int
 getkeybuf(int w)
 {
-    int c = getkey(w);
+    int c = getbyte(w);

    if(c < 0)
 	return EOF;
@ -1332,7 +1359,7 @@ getkeybuf(int w)
 mod_export void
 ungetkeycmd(void)
 {
-    ungetkeys(keybuf, keybuflen);
+    ungetbytes(keybuf, keybuflen);
 }

 /* read a command from the current keymap, with widgets */
@ -1359,7 +1386,7 @@ getkeycmd(void)
 	    return NULL;
 	}
 	pb = unmetafy(ztrdup(str), &len);
-	ungetkeys(pb, len);
+	ungetbytes(pb, len);
 	zfree(pb, strlen(str) + 1);
 	goto sentstring;
    }
--- a/Src/Zle/zle_main.c
+++ b/Src/Zle/zle_main.c
@ -78,10 +78,30 @@ int done;
 /**/
 int mark;

-/* last character pressed */
+/*
+ * Last character pressed.
+ *
+ * Depending how far we are with processing, the last character may
+ * be a single byte read (lastchar_wide_valid is 0, lastchar_wide is not
+ * valid) or a full wide character.  This is needed because we can't be
+ * sure whether the user is typing old \M-style commands or multibyte
+ * input.
+ *
+ * Calling getfullchar or getrestchar is guaranteed to ensure we have
+ * a valid wide character (although this may be WEOF).  In many states
+ * we know this and don't need to test lastchar_wide_valid.
+ */

 /**/
-mod_export int lastchar;
+mod_export int
+lastchar;
+#ifdef ZLE_UNICODE_SUPPORT
+/**/
+mod_export ZLE_INT_T lastchar_wide;
+/**/
+mod_export int
+lastchar_wide_valid;
+#endif

 /* the bindings for the previous and for this key */

@ -148,7 +168,7 @@ mod_export struct modifier zmod;
 /**/
 int prefixflag;

-/* Number of characters waiting to be read by the ungetkeys mechanism */
+/* Number of characters waiting to be read by the ungetbytes mechanism */
 /**/
 int kungetct;

@ -196,7 +216,7 @@ zsetterm(void)
 	 * we can't set up the terminal for zle *at all* until
 	 * we are sure there is no more typeahead to come.  So
 	 * if there is typeahead, we set the flag delayzsetterm.
-	 * Then getkey() performs another FIONREAD call; if that is
+	 * Then getbyte() performs another FIONREAD call; if that is
 	 * 0, we have finally used up all the typeahead, and it is
 	 * safe to alter the terminal, which we do at that point.
 	 */
@ -266,7 +286,7 @@ zsetterm(void)
    ti.tio.c_cc[VMIN] = 1;
    ti.tio.c_cc[VTIME] = 0;
    ti.tio.c_iflag |= (INLCR | ICRNL);
- /* this line exchanges \n and \r; it's changed back in getkey
+ /* this line exchanges \n and \r; it's changed back in getbyte
 	so that the net effect is no change at all inside the shell.
 	This double swap is to allow typeahead in common cases, eg.

@ -275,12 +295,12 @@ zsetterm(void)
 	echo foo<return>  <--- typed before sleep returns

 	The shell sees \n instead of \r, since it was changed by the kernel
-	while zsh wasn't looking. Then in getkey() \n is changed back to \r,
+	while zsh wasn't looking. Then in getbyte() \n is changed back to \r,
 	and it sees "echo foo<accept line>", as expected. Without the double
 	swap the shell would see "echo foo\n", which is translated to
 	"echo fooecho foo<accept line>" because of the binding.
 	Note that if you type <line-feed> during the sleep the shell just sees
-	\n, which is translated to \r in getkey(), and you just get another
+	\n, which is translated to \r in getbyte(), and you just get another
 	prompt. For type-ahead to work in ALL cases you have to use
 	stty inlcr.

@ -321,9 +341,16 @@ zsetterm(void)
 static char *kungetbuf;
 static int kungetsz;

+/*
+ * Note on ungetbyte and ungetbytes for the confused (pws):
+ * these are low level and deal with bytes before they
+ * have been converted into (possibly wide) characters.
+ * Hence the names.
+ */
+
 /**/
 void
-ungetkey(int ch)
+ungetbyte(int ch)
 {
    if (kungetct == kungetsz)
 	kungetbuf = realloc(kungetbuf, kungetsz *= 2);
@ -332,11 +359,11 @@ ungetkey(int ch)

 /**/
 void
-ungetkeys(char *s, int len)
+ungetbytes(char *s, int len)
 {
    s += len;
    while (len--)
-	ungetkey(*--s);
+	ungetbyte(*--s);
 }

 #if defined(pyr) && defined(HAVE_SELECT)
@ -356,7 +383,7 @@ breakread(int fd, char *buf, int n)
 #endif

 static int
-raw_getkey(int keytmout, char *cptr)
+raw_getbyte(int keytmout, char *cptr)
 {
    long exp100ths;
    int ret;
@ -591,13 +618,22 @@ raw_getkey(int keytmout, char *cptr)

 /**/
 mod_export int
-getkey(int keytmout)
+getbyte(int keytmout)
 {
    char cc;
    unsigned int ret;
    int die = 0, r, icnt = 0;
    int old_errno = errno, obreaks = breaks;

+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * Reading a single byte always invalidates the status
+     * of lastchar_wide.  We may fix this up in getrestchar
+     * if this is the last byte of a wide character.
+     */
+    lastchar_wide_valid = 0;
+#endif
+
    if (kungetct)
 	ret = STOUC(kungetbuf[--kungetct]);
    else {
@ -612,10 +648,10 @@ getkey(int keytmout)
 	for (;;) {
 	    int q = queue_signal_level();
 	    dont_queue_signals();
-	    r = raw_getkey(keytmout, &cc);
+	    r = raw_getbyte(keytmout, &cc);
 	    restore_queue_signals(q);
 	    if (r == -2)	/* timeout */
-		return EOF;
+		return lastchar = EOF;
 	    if (r == 1)
 		break;
 	    if (r == 0) {
@ -642,7 +678,7 @@ getkey(int keytmout)
 		errflag = 0;
 		breaks = obreaks;
 		errno = old_errno;
-		return EOF;
+		return lastchar = EOF;
 	    } else if (errno == EWOULDBLOCK) {
 		fcntl(0, F_SETFL, 0);
 	    } else if (errno == EIO && !die) {
@ -665,15 +701,96 @@ getkey(int keytmout)

 	ret = STOUC(cc);
    }
+    /*
+     * TODO: if vichgbuf is to be characters instead of a multibyte
+     * string the following needs moving to getfullchar().
+     */
    if (vichgflag) {
 	if (vichgbufptr == vichgbufsz)
 	    vichgbuf = realloc(vichgbuf, vichgbufsz *= 2);
 	vichgbuf[vichgbufptr++] = ret;
    }
    errno = old_errno;
-    return ret;
+    return lastchar = ret;
 }

+
+/*
+ * Get a full character rather than just a single byte.
+ * (TODO: Strictly we ought to call this getkey, now that the above
+ * function is called getbyte.)
+ */
+
+/**/
+mod_export ZLE_INT_T
+getfullchar(int keytmout)
+{
+    int inchar = getbyte(keytmout);
+
+#ifdef ZLE_UNICODE_SUPPORT
+    return getrestchar(inchar);
+#else
+    return inchar;
+#endif
+}
+
+
+/**/
+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * Get the remainder of a character if we support multibyte
+ * input strings.  It may not require any more input, but
+ * we haven't yet checked.  The character previously returned
+ * by getbyte() is passed down as inchar.
+ *
+ * The completed wide character is stored in lastchar_wide and
+ * returned, and lastchar_wide_valid is set.  The result is WEOF if
+ * inchar is EOF, if input ends in the middle of a character, or if
+ * the bytes do not form a valid multibyte character.
+ */
+
+/**/
+mod_export ZLE_INT_T
+getrestchar(int inchar)
+{
+    char c;
+    wchar_t outchar;
+    size_t ret;
+    /* A zero-valued mbstate_t describes the initial conversion state. */
+    mbstate_t ps = {0};
+
+    /*
+     * We are guaranteed to set a valid wide last character,
+     * although it may be WEOF (which is technically not
+     * a wide character at all...)
+     */
+    lastchar_wide_valid = 1;
+
+    if (inchar == EOF)
+	return lastchar_wide = WEOF;
+
+    /*
+     * Feed the bytes to mbrtowc() one at a time.  When it returns
+     * (size_t)-2 the bytes seen so far have been absorbed into ps,
+     * so only the newly read byte may be passed on the next call.
+     *
+     * Return may be zero if we have a NULL; handle this like
+     * any other character.
+     */
+    c = (char)inchar;
+    while ((ret = mbrtowc(&outchar, &c, 1, &ps)) == (size_t)-2) {
+	/* No timeout here as we really need the character. */
+	inchar = getbyte(0);
+	/* getbyte() cleared the flag; we are completing the character. */
+	lastchar_wide_valid = 1;
+	if (inchar == EOF)
+	    return lastchar_wide = WEOF;
+	c = (char)inchar;
+    }
+    if (ret == (size_t)-1) {
+	/*
+	 * Invalid input.  Hmm, what's the right thing to do here?
+	 */
+	return lastchar_wide = WEOF;
+    }
+    return lastchar_wide = (wint_t)outchar;
+}
+/**/
+#endif
+
+
 /**/
 void
 zlecore(void)
@ -1445,7 +1562,7 @@ setup_(UNUSED(Module m))
    zlereadptr = zleread;
    zlesetkeymapptr = zlesetkeymap;

-    getkeyptr = getkey;
+    getkeyptr = getbyte;

    /* initialise the thingies */
    init_thingies();
--- a/Src/Zle/zle_misc.c
+++ b/Src/Zle/zle_misc.c
@ -34,13 +34,13 @@

 /**/
 void
-doinsert(char *str)
+doinsert(ZLE_STRING_T zstr, int len)
 {
-    char *s;
-    int len = ztrlen(str);
-    int c1 = *str == Meta ? STOUC(str[1])^32 : STOUC(*str);/* first character */
+    ZLE_STRING_T s;
+    ZLE_CHAR_T c1 = *zstr;	     /* first character */
    int neg = zmult < 0;             /* insert *after* the cursor? */
    int m = neg ? -zmult : zmult;    /* number of copies to insert */
+    int count;

    iremovesuffix(c1, 0);
    invalidatelist();
@ -50,8 +50,8 @@ doinsert(char *str)
    else if(zlecs + m * len > zlell)
 	spaceinline(zlecs + m * len - zlell);
    while(m--)
-	for(s = str; *s; s++)
-	    zleline[zlecs++] = *s == Meta ? *++s ^ 32 : *s;
+	for(s = zstr, count = len; count; s++, count--)
+	    zleline[zlecs++] = *s;
    if(neg)
 	zlecs += zmult * len;
 }
@ -60,25 +60,41 @@ doinsert(char *str)
 mod_export int
 selfinsert(UNUSED(char **args))
 {
-    char s[3], *p = s;
-
-    if(imeta(lastchar)) {
-	*p++ = Meta;
-	lastchar ^= 32;
-    }
-    *p++ = lastchar;
-    *p = 0;
-    doinsert(s);
+    /*
+     * Insert the last character typed at the cursor.  doinsert()
+     * takes a ZLE_STRING_T, so the character is copied into a local
+     * of the element type rather than passing the address of the
+     * ZLE_INT_T / int global directly.
+     */
+#ifdef ZLE_UNICODE_SUPPORT
+    ZLE_CHAR_T wc;
+
+    if (!lastchar_wide_valid)
+	getrestchar(lastchar);
+    /* Nothing sensible to insert for EOF or an invalid sequence. */
+    if (lastchar_wide == ZLEEOF)
+	return 1;
+    wc = (ZLE_CHAR_T)lastchar_wide;
+    doinsert(&wc, 1);
+#else
+    unsigned char s = lastchar;
+    doinsert(&s, 1);
+#endif
    return 0;
 }

+/**/
+mod_export void
+fixunmeta(void)
+{
+    lastchar &= 0x7f;		/* keep only the low seven bits */
+    if (lastchar == '\r')	/* carriage return is treated as newline */
+	lastchar = '\n';
+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * TODO: can we do this better?
+     * We need a wide character to insert.
+     * selfinsertunmeta is intrinsically problematic
+     * with multibyte input.
+     */
+    lastchar_wide = (ZLE_CHAR_T)lastchar;
+    lastchar_wide_valid = TRUE;	/* wide copy now matches lastchar */
+#endif
+}
+
 /**/
 mod_export int
 selfinsertunmeta(char **args)
 {
-    lastchar &= 0x7f;
-    if (lastchar == '\r')
-	lastchar = '\n';
+    fixunmeta();
    return selfinsert(args);
 }

@ -490,11 +506,11 @@ quotedinsert(char **args)
    sob.sg_flags = (sob.sg_flags | RAW) & ~ECHO;
    ioctl(SHTTY, TIOCSETN, &sob);
 #endif
-    lastchar = getkey(0);
+    getfullchar(0);
 #ifndef HAS_TIO
    zsetterm();
 #endif
-    if (lastchar < 0)
+    if (LASTFULLCHAR == ZLEEOF)
 	return 1;
    else
 	return selfinsert(args);
@ -506,9 +522,20 @@ digitargument(UNUSED(char **args))
 {
    int sign = (zmult < 0) ? -1 : 1;

+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * It's too dangerous to allow metafied input.  See
+     * universalargument for comments on (possibly suboptimal) handling
+     * of digits.  We are assuming ASCII is a subset of the multibyte
+     * encoding.
+     */
+    if (lastchar < '0' || lastchar > '9')
+	return 1;
+#else
    /* allow metafied as well as ordinary digits */
    if ((lastchar & 0x7f) < '0' || (lastchar & 0x7f) > '9')
 	return 1;
+#endif

    if (!(zmod.flags & MOD_TMULT))
 	zmod.tmult = 0;
@ -546,7 +573,22 @@ universalargument(char **args)
 	zmod.flags |= MOD_MULT;
 	return 0;
    }
-    while ((gotk = getkey(0)) != EOF) {
+    /*
+     * TODO: this is quite tricky to do when trying to maintain
+     * compatibility between the old input system and Unicode.
+     * We don't know what follows the digits, so if we try to
+     * read wide characters we may fail (e.g. we may come across an old
+     * \M-style binding).
+     *
+     * If we assume individual bytes are either explicitly ASCII or
+     * not (a la UTF-8), we get away with it; we can back up individual
+     * bytes and everything will work.  We may want to relax this
+     * assumption later.  ("Much later" - (C) Steven Singer,
+     * CSR BlueCore firmware, ca. 2000.)
+     *
+     * Hence for now this remains byte-by-byte.
+     */
+    while ((gotk = getbyte(0)) != EOF) {
 	if (gotk == '-' && !digcnt) {
 	    minus = -1;
 	    digcnt++;
@ -554,7 +596,7 @@ universalargument(char **args)
 	    pref = pref * 10 + (gotk & 0xf);
 	    digcnt++;
 	} else {
-	    ungetkey(gotk);
+	    ungetbyte(gotk);
 	    break;
 	}
    }
@ -765,24 +807,32 @@ executenamedcommand(char *prmt)
 	} else if(cmd == Th(z_viquotedinsert)) {
 	    *ptr = '^';
 	    zrefresh();
-	    lastchar = getkey(0);
-	    if(lastchar == EOF || !lastchar || len == NAMLEN)
+	    getfullchar(0);
+	    if(LASTFULLCHAR == ZLEEOF || !LASTFULLCHAR || len == NAMLEN)
 		feep = 1;
-	    else
+	    else {
+		/* TODO: convert back to multibyte string */
 		*ptr++ = lastchar, len++, curlist = 0;
+	    }
 	} else if(cmd == Th(z_quotedinsert)) {
-	    if((lastchar = getkey(0)) == EOF || !lastchar || len == NAMLEN)
+	    if(getfullchar(0) == ZLEEOF ||
+	       !LASTFULLCHAR || len == NAMLEN)
 		feep = 1;
-	    else
+	    else {
+		/* TODO: convert back to multibyte string */
 		*ptr++ = lastchar, len++, curlist = 0;
+	    }
 	} else if(cmd == Th(z_backwarddeletechar) ||
 	    	cmd == Th(z_vibackwarddeletechar)) {
-	    if (len)
+	    if (len) {
+		/* TODO: backward full character in multibyte string. Yuk. */
 		len--, ptr--, curlist = 0;
+	    }
 	} else if(cmd == Th(z_killregion) || cmd == Th(z_backwardkillword) ||
 		  cmd == Th(z_vibackwardkillword)) {
 	    if (len)
 		curlist = 0;
+	    /* TODO: backward full character in multibyte string. Yuk. */
 	    while (len && (len--, *--ptr != '-'));
 	} else if(cmd == Th(z_killwholeline) || cmd == Th(z_vikillline) ||
 	    	cmd == Th(z_backwardkillline)) {
@ -812,9 +862,7 @@ executenamedcommand(char *prmt)
 		unrefthingy(r);
 	    }
 	    if(cmd == Th(z_selfinsertunmeta)) {
-		lastchar &= 0x7f;
-		if(lastchar == '\r')
-		    lastchar = '\n';
+		fixunmeta();
 		cmd = Th(z_selfinsert);
 	    }
 	    if (cmd == Th(z_listchoices) || cmd == Th(z_deletecharorlist) ||
@ -867,11 +915,24 @@ executenamedcommand(char *prmt)
 		    len = cmdambig;
 		}
 	    } else {
-		if (len == NAMLEN || icntrl(lastchar) ||
-		    cmd != Th(z_selfinsert))
+		if (len == NAMLEN || cmd != Th(z_selfinsert))
 		    feep = 1;
-		else
-		    *ptr++ = lastchar, len++, curlist = 0;
+		else {
+#ifdef ZLE_UNICODE_SUPPORT
+		    /*
+		     * Complete the character starting with the byte
+		     * already read, then classify the wide character:
+		     * lastchar itself may by now be a trailing byte.
+		     */
+		    if (!lastchar_wide_valid)
+			getrestchar(lastchar);
+		    if (iswcntrl(lastchar_wide))
+#else
+		    if (icntrl(lastchar))
+#endif
+		    {
+			feep = 1;
+		    }
+		    else {
+			/* TODO: convert back to multibyte string */
+			*ptr++ = lastchar, len++, curlist = 0;
+		    }
+		}
 	    }
 	}
 	if (feep)
@ -911,6 +972,9 @@ executenamedcommand(char *prmt)
 /* Length of suffix to remove when inserting each possible character value.  *
 * suffixlen[256] is the length to remove for non-insertion editing actions. */

+/*
+ * TODO: Aargh, this is completely broken with wide characters.
+ */
 /**/
 mod_export int suffixlen[257];

@ -1000,7 +1064,7 @@ makesuffixstr(char *f, char *s, int n)

 /**/
 mod_export void
-iremovesuffix(int c, int keep)
+iremovesuffix(ZLE_CHAR_T c, int keep)
 {
    if (suffixfunc) {
 	Eprog prog = getshfunc(suffixfunc);
@ -1024,7 +1088,12 @@ iremovesuffix(int c, int keep)
 	zsfree(suffixfunc);
 	suffixfunc = NULL;
    } else {
+#ifdef ZLE_UNICODE_SUPPORT
+	/* TODO: best I can think of for now... */
+	int sl = (unsigned int)c < 256 ? suffixlen[c] : 0;
+#else
 	int sl = suffixlen[c];
+#endif
 	if(sl) {
 	    backdel(sl);
 	    if (!keep)
--- a/Src/Zle/zle_move.c
+++ b/Src/Zle/zle_move.c
@ -353,13 +353,14 @@ vibeginningofline(UNUSED(char **args))
    return 0;
 }

-static int vfindchar, vfinddir, tailadd;
+static ZLE_INT_T vfindchar;
+static int vfinddir, tailadd;

 /**/
 int
 vifindnextchar(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = 1;
 	tailadd = 0;
 	return virepeatfind(args);
@ -371,7 +372,7 @@ vifindnextchar(char **args)
 int
 vifindprevchar(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = -1;
 	tailadd = 0;
 	return virepeatfind(args);
@ -383,7 +384,7 @@ vifindprevchar(char **args)
 int
 vifindnextcharskip(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = 1;
 	tailadd = -1;
 	return virepeatfind(args);
@ -395,7 +396,7 @@ vifindnextcharskip(char **args)
 int
 vifindprevcharskip(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = -1;
 	tailadd = 1;
 	return virepeatfind(args);
@ -465,12 +466,12 @@ vifirstnonblank(UNUSED(char **args))
 int
 visetmark(UNUSED(char **args))
 {
-    int ch;
+    ZLE_INT_T ch;

-    ch = getkey(0);
-    if (ch < 'a' || ch > 'z')
+    ch = getfullchar(0);
+    if (ch < LETTER_a || ch > LETTER_z)
 	return 1;
-    ch -= 'a';
+    ch -= LETTER_a;
    vimarkcs[ch] = zlecs;
    vimarkline[ch] = histline;
    return 0;
@ -480,15 +481,15 @@ visetmark(UNUSED(char **args))
 int
 vigotomark(UNUSED(char **args))
 {
-    int ch;
+    ZLE_INT_T ch;
+    /*
+     * Key that invoked this widget.  It has to be saved here because
+     * getfullchar() overwrites lastchar with the key it reads, which
+     * would make the comparison below always true.
+     */
+    int prev = lastchar;

-    ch = getkey(0);
-    if (ch == lastchar)
+    ch = getfullchar(0);
+    if (ch == (ZLE_INT_T)prev)
 	ch = 26;
    else {
-	if (ch < 'a' || ch > 'z')
+	if (ch < LETTER_a || ch > LETTER_z)
 	    return 1;
-	ch -= 'a';
+	ch -= LETTER_a;
    }
    if (!vimarkline[ch])
 	return 1;
--- a/Src/Zle/zle_thingy.c
+++ b/Src/Zle/zle_thingy.c
@ -473,7 +473,7 @@ bin_zle_unget(char *name, char **args, UNUSED(Options ops), UNUSED(char func))
 	return 1;
    }
    while (p > b)
-	ungetkey((int) *--p);
+	ungetbyte((int) *--p);
    return 0;
 }

--- a/Src/Zle/zle_tricky.c
+++ b/Src/Zle/zle_tricky.c
@ -2297,13 +2297,28 @@ doexpandhist(void)
    return 0;
 }

+/**/
+void
+fixmagicspace(void)
+{
+    lastchar = ' ';		/* the character to insert is a space */
+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * This is redundant if the multibyte encoding extends ASCII,
+     * since lastchar is a full character, but it's safer anyway...
+     */
+    lastchar_wide = L' ';
+    lastchar_wide_valid = TRUE;	/* wide copy now matches lastchar */
+#endif
+}
+
 /**/
 int
 magicspace(char **args)
 {
    char *bangq;
    int ret;
-    lastchar = ' ';
+    fixmagicspace();
    for (bangq = (char *)zleline; (bangq = strchr(bangq, bangchar));
 	 bangq += 2)
 	if (bangq[1] == '"' && (bangq == (char *)zleline || bangq[-1] != '\\'))
--- a/Src/Zle/zle_utils.c
+++ b/Src/Zle/zle_utils.c
@ -510,7 +510,7 @@ hstrnstr(char *haystack, int pos, char *needle, int len, int dir, int sens)
 mod_export int
 getzlequery(int yesno)
 {
-    int c;
+    ZLE_INT_T c;
 #ifdef FIONREAD
    int val;

@ -525,18 +525,18 @@ getzlequery(int yesno)
 #endif

    /* get a character from the tty and interpret it */
-    c = getkey(0);
+    c = getfullchar(0);
    if (yesno) {
-	if (c == '\t')
-	    c = 'y';
+	if (c == ZLETAB)
+	    c = LETTER_y;
 	else if (icntrl(c) || c == EOF)
-	    c = 'n';
+	    c = LETTER_n;
 	else
 	    c = tulower(c);
    }
    /* echo response and return */
-    if (c != '\n')
-	putc(c, shout);
+    if (c != ZLENL)
+	putc(c, shout);		/* TODO: convert to multibyte */
    return c;
 }

--- a/Src/Zle/zle_vi.c
+++ b/Src/Zle/zle_vi.c
@ -50,6 +50,11 @@ int vilinerange;
 /**/
 int vichgbufsz, vichgbufptr, vichgflag;

+/*
+ * TODO: need consistent handling of vichgbuf: ZLE_STRING_T or
+ * char *?  Consequently, use of lastchar in this file needs fixing
+ * too.
+ */
 /**/
 char *vichgbuf;

@ -95,15 +100,15 @@ startvitext(int im)
 }

 /**/
-int
+ZLE_INT_T
 vigetkey(void)
 {
    Keymap mn = openkeymap("main");
    char m[3], *str;
    Thingy cmd;

-    if((lastchar = getkey(0)) == EOF)
-	return -1;
+    if(getbyte(0) == EOF)
+	return ZLEEOF;

    m[0] = lastchar;
    metafy(m, 1, META_NOALLOC);
@ -112,23 +117,35 @@ vigetkey(void)
    else
 	cmd = t_undefinedkey;

+    /*
+     * TODO: if this was bound to self-insert, we may
+     * be on the first character of a multibyte string
+     * and need to acquire the rest.
+     */
    if (!cmd || cmd == Th(z_sendbreak)) {
-	return -1;
+	return ZLEEOF;
    } else if (cmd == Th(z_quotedinsert)) {
-	if ((lastchar = getkey(0)) == EOF)
-	    return -1;
+	if (getfullchar(0) == ZLEEOF)
+	    return ZLEEOF;
    } else if(cmd == Th(z_viquotedinsert)) {
-	char sav = zleline[zlecs];
+	ZLE_CHAR_T sav = zleline[zlecs];

 	zleline[zlecs] = '^';
 	zrefresh();
-	lastchar = getkey(0);
+	getfullchar(0);
 	zleline[zlecs] = sav;
-	if(lastchar == EOF)
-	    return -1;
-    } else if (cmd == Th(z_vicmdmode))
-	return -1;
-    return lastchar;
+	if(LASTFULLCHAR == ZLEEOF)
+	    return ZLEEOF;
+    } else if (cmd == Th(z_vicmdmode)) {
+	return ZLEEOF;
+    }
+#ifdef ZLE_UNICODE_SUPPORT
+    if (!lastchar_wide_valid)
+    {
+	getrestchar(lastchar);
+    }
+#endif
+    return LASTFULLCHAR;
 }

 /**/
@ -489,7 +506,7 @@ vireplacechars(UNUSED(char **args))
 	return 1;
    }
    /* get key */
-    if((ch = vigetkey()) == -1) {
+    if((ch = vigetkey()) == ZLEEOF) {
 	vichgflag = 0;
 	return 1;
    }
@ -593,7 +610,7 @@ virepeatchange(UNUSED(char **args))
    }
    /* repeat the command */
    inrepeat = 1;
-    ungetkeys(vichgbuf, vichgbufptr);
+    ungetbytes(vichgbuf, vichgbufptr);
    return 0;
 }

@ -817,26 +834,35 @@ vicapslockpanic(UNUSED(char **args))
    statusline = "press a lowercase key to continue";
    statusll = strlen(statusline);
    zrefresh();
-    while (!islower(getkey(0)));
+#ifdef ZLE_UNICODE_SUPPORT
+    while (!iswlower(getfullchar(0)));
+#else
+    while (!islower(getfullchar(0)));
+#endif
    statusline = NULL;
    return 0;
 }

+#ifdef ZLE_UNICODE_SUPPORT
+#else
+#endif
+
 /**/
 int
 visetbuffer(UNUSED(char **args))
 {
-    int ch;
+    ZLE_INT_T ch;

    if ((zmod.flags & MOD_VIBUF) ||
-	(((ch = getkey(0)) < '1' || ch > '9') &&
-	 (ch < 'a' || ch > 'z') && (ch < 'A' || ch > 'Z')))
+	(((ch = getfullchar(0)) < DIGIT_1 || ch > DIGIT_9) &&
+	 (ch < LETTER_a || ch > LETTER_z) &&
+	 (ch < LETTER_A || ch > LETTER_Z)))
 	return 1;
-    if (ch >= 'A' && ch <= 'Z')	/* needed in cut() */
+    if (ch >= LETTER_A && ch <= LETTER_Z)	/* needed in cut() */
 	zmod.flags |= MOD_VIAPP;
    else
 	zmod.flags &= ~MOD_VIAPP;
-    zmod.vibuf = tulower(ch) + (idigit(ch) ? -'1' + 26 : -'a');
+    zmod.vibuf = tulower(ch) + (idigit(ch) ? - DIGIT_1 + 26 : -LETTER_a);
    zmod.flags |= MOD_VIBUF;
    prefixflag = 1;
    return 0;
@ -897,12 +923,12 @@ viquotedinsert(char **args)
    sob.sg_flags = (sob.sg_flags | RAW) & ~ECHO;
    ioctl(SHTTY, TIOCSETN, &sob);
 #endif
-    lastchar = getkey(0);
+    getfullchar(0);
 #ifndef HAS_TIO
    zsetterm();
 #endif
    foredel(1);
-    if(lastchar < 0)
+    if(LASTFULLCHAR == ZLEEOF)
 	return 1;
    else
 	return selfinsert(args);
--- a/Src/system.h
+++ b/Src/system.h
@ -705,40 +705,3 @@ extern short ospeed;
 #   endif
 # endif
 #endif
-
-#ifdef ZLE_UNICODE_SUPPORT
-typedef wchar_t ZLE_CHAR_T;
-typedef wchar_t *ZLE_STRING_T;
-#define ZLE_CHAR_SIZE	sizeof(wchar_t)
-
-/*
- * MB_CUR_MAX is the maximum number of bytes that a single wide
- * character will convert into.  We use it to keep strings
- * sufficiently long.  It should always be defined, but if it isn't
- * just assume we are using Unicode which requires 6 characters.
- * (Note that it's not necessarily defined to a constant.)
- */
-#ifndef MB_CUR_MAX
-#define MB_CUR_MAX 6
-#endif
-
-#define ZLENL	L'\n'
-#define ZLENUL	L'\0'
-#define ZLETAB	L'\t'
-#define ZLENULSTR	L""
-#define ZS_memcpy wmemcpy
-#define ZS_memmove wmemmove
-#define ZC_icntrl iswcntrl
-#else
-typedef int ZLE_CHAR_T;
-typedef unsigned char *ZLE_STRING_T;
-#define ZLE_CHAR_SIZE	sizeof(unsigned char)
-
-#define ZLENL	'\n'
-#define ZLENUL	'\0'
-#define ZLETAB	'\t'
-#define ZLENULSTR	""
-#define ZS_memcpy memcpy
-#define ZS_memmove memmove
-#define ZC_icntrl icntrl
-#endif