1
0
Fork 0
mirror of git://git.code.sf.net/p/zsh/code synced 2025-05-18 21:51:02 +02:00

52169: a few more improvements of (#) flag

fix (#X) in C locale in FreeBSD, DragonFly, NetBSD.
Negative values such as ${(#X):--1} are now an error.
UCS4 is limited to < 0x8000_0000 (in OSes without __STDC_ISO_10646__).
This commit is contained in:
Jun-ichi Takimoto 2023-09-27 01:56:47 +09:00
parent e4e9afe373
commit 02e33c54d8
3 changed files with 126 additions and 125 deletions

View file

@ -1,3 +1,7 @@
2023-09-27 Jun-ichi Takimoto <takimoto-j@kba.biglobe.ne.jp>
* 52169: Src/subst.c, Src/utils.c: a few more improvements of (#)
2023-09-23 Bart Schaefer <schaefer@BartMAC2014.local>
* 52180: Doc/Zsh/expn.yo: clarify array behavior of ${|var|...}

View file

@ -1501,16 +1501,15 @@ substevalchar(char *ptr)
return noerrs ? dupstring(""): NULL;
}
errflag |= saved_errflag;
#ifdef MULTIBYTE_SUPPORT
if (isset(MULTIBYTE) && ires > 127) {
/* '\\' + 'U' + 8 bytes of character + '\0' */
char buf[11];
/* inefficient: should separate out \U handling from getkeystring */
sprintf(buf, "\\U%.8x", (unsigned int)ires & 0xFFFFFFFFu);
ptr = getkeystring(buf, &len, GETKEYS_BINDKEY, NULL);
if (ires < 0) {
zerr("character not in range");
}
if (len == 0)
#ifdef MULTIBYTE_SUPPORT
else if (isset(MULTIBYTE) && ires > 127) {
ptr = zhalloc(MB_CUR_MAX);
len = ucs4tomb((unsigned int)ires & 0xffffffff, ptr);
}
if (len <= 0)
#endif
{
ptr = zhalloc(2);

View file

@ -6672,11 +6672,14 @@ dquotedzputs(char const *s, FILE *stream)
# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__)
/* Convert a character from UCS4 encoding to UTF-8 */
static size_t
static int
ucs4toutf8(char *dest, unsigned int wval)
{
size_t len;
int len;
/* UCS4 is now equivalent to UTF-32 and limited to 0 - 0x10_FFFF.
* This function accepts 0 - 0x7FFF_FFFF (old range of UCS4) to be
* compatible with wctomb(3) (in UTF-8 locale) on Linux. */
if (wval < 0x80)
len = 1;
else if (wval < 0x800)
@ -6687,8 +6690,12 @@ ucs4toutf8(char *dest, unsigned int wval)
len = 4;
else if (wval < 0x4000000)
len = 5;
else
else if (wval < 0x80000000)
len = 6;
else {
zerr("character not in range");
return -1;
}
switch (len) { /* falls through except to the last case */
case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6;
@ -6705,30 +6712,89 @@ ucs4toutf8(char *dest, unsigned int wval)
}
#endif
/* Convert UCS4 to a multibyte character in current locale.
* Result is saved in buf (must be at least MB_CUR_MAX bytes long).
* Returns the number of bytes saved in buf, or -1 if conversion fails. */
/*
* The following only occurs once or twice in the code, but in different
* places depending how character set conversion is implemented.
*/
#define CHARSET_FAILED() \
if (how & GETKEY_DOLLAR_QUOTE) { \
while ((*tdest++ = *++s)) { \
if (how & GETKEY_UPDATE_OFFSET) { \
if (s - sstart > *misc) \
(*misc)++; \
} \
if (*s == Snull) { \
*len = (s - sstart) + 1; \
*tdest = '\0'; \
return buf; \
} \
} \
*len = tdest - buf; \
return buf; \
} \
*t = '\0'; \
*len = t - buf; \
return buf
/**/
int
ucs4tomb(unsigned int wval, char *buf)
{
#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
int count = wctomb(buf, (wchar_t)wval);
if (count == -1)
zerr("character not in range");
return count;
#else /* !(HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__) */
# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
return ucs4toutf8(buf, wval);
} else {
# ifdef HAVE_ICONV
iconv_t cd;
char inbuf[4], *bsave = buf;
ICONV_CONST char *inptr = inbuf;
size_t inbytes = 4, outbytes = 6;
const char *codesetstr = nl_langinfo(CODESET);
size_t count;
int i;
/*
* If the code set isn't handled, we'd better assume it's US-ASCII
* rather than just failing hopelessly. Solaris has a weird habit
* of returning 646. This is handled by the native iconv(), but
* not by GNU iconv; what's more, some versions of the native iconv
* don't handle standard names like ASCII.
*
* This should only be a problem if there's a mismatch between the
* NLS and the iconv in use, which probably only means if libiconv
* is in use. We checked at configure time if our libraries pulled
* in _libiconv_version, which should be a good test.
*
* It shouldn't ever be NULL, but while we're being paranoid...
*/
# ifdef ICONV_FROM_LIBICONV
if (!codesetstr || !*codesetstr)
codesetstr = "US-ASCII";
# endif
cd = iconv_open(codesetstr, "UCS-4BE");
# ifdef ICONV_FROM_LIBICONV
if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) {
codesetstr = "US-ASCII";
cd = iconv_open(codesetstr, "UCS-4BE");
}
# endif
if (cd == (iconv_t)-1) {
zerr("cannot do charset conversion (iconv failed)");
return -1;
}
/* store value in big endian form */
for (i=3; i>=0; i--) {
inbuf[i] = wval & 0xff;
wval >>= 8;
}
count = iconv(cd, &inptr, &inbytes, &buf, &outbytes);
iconv_close(cd);
if (count) {
/* -1 indicates error. Positive value means number of "invalid"
* (or "non-reversible") conversions, which we consider as
* "out-of-range" characters. */
zerr("character not in range");
return -1;
}
return buf - bsave;
# else /* !HAVE_ICONV */
zerr("cannot do charset conversion (iconv not available)");
return -1;
# endif /* HAVE_ICONV */
}
# else /* !(HAVE_NL_LANGINFO && CODESET) */
zerr("cannot do charset conversion (NLS not supported)");
return -1;
# endif /* HAVE_NL_LANGINFO && CODESET */
#endif /* HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__ */
}
/*
* Decode a key string, turning it into the literal characters.
@ -6785,21 +6851,6 @@ getkeystring(char *s, int *len, int how, int *misc)
char *t, *tdest = NULL, *u = NULL, *sstart = s, *tbuf = NULL;
char svchar = '\0';
int meta = 0, control = 0, ignoring = 0;
int i;
#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
wint_t wval;
int count;
#else
unsigned int wval;
# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
# if defined(HAVE_ICONV)
iconv_t cd;
char inbuf[4];
size_t inbytes, outbytes;
# endif
size_t count;
# endif
#endif
DPUTS((how & GETKEY_UPDATE_OFFSET) &&
(how & ~(GETKEYS_DOLLARS_QUOTE|GETKEY_UPDATE_OFFSET)),
@ -6864,7 +6915,8 @@ getkeystring(char *s, int *len, int how, int *misc)
}
for (; *s; s++) {
if (*s == '\\' && s[1]) {
int miscadded;
int miscadded, count, i;
unsigned int wval;
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) {
(*misc)--;
miscadded = 1;
@ -6979,86 +7031,32 @@ getkeystring(char *s, int *len, int how, int *misc)
*misc = wval;
return s+1;
}
#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
count = wctomb(t, (wchar_t)wval);
count = ucs4tomb(wval, t);
if (count == -1) {
zerr("character not in range");
CHARSET_FAILED();
if (how & GETKEY_DOLLAR_QUOTE) {
while ((*tdest++ = *++s)) {
if (how & GETKEY_UPDATE_OFFSET) {
if (s - sstart > *misc)
(*misc)++;
}
if (*s == Snull) {
*len = (s - sstart) + 1;
*tdest = '\0';
return buf;
}
}
*len = tdest - buf;
}
else {
*t = '\0';
*len = t - buf;
}
return buf;
}
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
(*misc) += count;
t += count;
# else
# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
count = ucs4toutf8(t, wval);
t += count;
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
(*misc) += count;
} else {
# ifdef HAVE_ICONV
ICONV_CONST char *inptr = inbuf;
const char *codesetstr = nl_langinfo(CODESET);
inbytes = 4;
outbytes = 6;
/* store value in big endian form */
for (i=3;i>=0;i--) {
inbuf[i] = wval & 0xff;
wval >>= 8;
}
/*
* If the code set isn't handled, we'd better
* assume it's US-ASCII rather than just failing
* hopelessly. Solaris has a weird habit of
* returning 646. This is handled by the
* native iconv(), but not by GNU iconv; what's
* more, some versions of the native iconv don't
* handle standard names like ASCII.
*
* This should only be a problem if there's a
* mismatch between the NLS and the iconv in use,
* which probably only means if libiconv is in use.
* We checked at configure time if our libraries
* pulled in _libiconv_version, which should be
* a good test.
*
* It shouldn't ever be NULL, but while we're
* being paranoid...
*/
#ifdef ICONV_FROM_LIBICONV
if (!codesetstr || !*codesetstr)
codesetstr = "US-ASCII";
#endif
cd = iconv_open(codesetstr, "UCS-4BE");
#ifdef ICONV_FROM_LIBICONV
if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) {
codesetstr = "US-ASCII";
cd = iconv_open(codesetstr, "UCS-4BE");
}
#endif
if (cd == (iconv_t)-1) {
zerr("cannot do charset conversion (iconv failed)");
CHARSET_FAILED();
}
count = iconv(cd, &inptr, &inbytes, &t, &outbytes);
iconv_close(cd);
if (count == (size_t)-1) {
zerr("character not in range");
CHARSET_FAILED();
}
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
(*misc) += count;
# else
zerr("cannot do charset conversion (iconv not available)");
CHARSET_FAILED();
# endif
}
# else
zerr("cannot do charset conversion (NLS not supported)");
CHARSET_FAILED();
# endif
# endif
if (how & GETKEY_DOLLAR_QUOTE) {
char *t2;
for (t2 = tbuf; t2 < t; t2++) {