1
0
Fork 0
mirror of git://git.code.sf.net/p/zsh/code synced 2026-01-06 09:41:07 +01:00

22408: support for multibyte characters in patterns

This commit is contained in:
Peter Stephenson 2006-04-09 21:47:21 +00:00
parent 82dc72e034
commit ef330a5dfd
8 changed files with 587 additions and 417 deletions

View file

@ -1,6 +1,11 @@
2006-04-09 Peter Stephenson <p.w.stephenson@ntlworld.com>
* 11407: Functions/MIME/zsh-mime-handler: and it needs NULL_GLOB,
* 22408: Doc/Zsh/expn.yo, Doc/Zsh/options.yo, Src/options.c,
Src/pattern.c, Src/utils.c, Src/zsh.h, Src/Zle/zle.h: add
MULTIBYTE option, (#u) and (#U) globbing flags, and support
for multibyte characters in patterns.
* 22407: Functions/MIME/zsh-mime-handler: and it needs NULL_GLOB,
too.
2006-04-06 Peter Stephenson <pws@csr.com>

View file

@ -1461,20 +1461,20 @@ tt(LPAR()#)var(X)tt(RPAR()) where var(X) may have one of the following
forms:
startitem()
item(i)(
item(tt(i))(
Case insensitive: upper or lower case characters in the pattern match
upper or lower case characters.
)
item(l)(
item(tt(l))(
Lower case characters in the pattern match upper or lower case
characters; upper case characters in the pattern still only match
upper case characters.
)
item(I)(
item(tt(I))(
Case sensitive: locally negates the effect of tt(i) or tt(l) from
that point on.
)
item(b)(
item(tt(b))(
Activate backreferences for parenthesised groups in the pattern;
this does not work in filename generation. When a pattern with a set of
active parentheses is matched, the strings matched by the groups are
@ -1525,11 +1525,11 @@ start and end indices are set to -1.
Pattern matching with backreferences is slightly slower than without.
)
item(B)(
item(tt(B))(
Deactivate backreferences, negating the effect of the tt(b) flag from that
point on.
)
item(m)(
item(tt(m))(
Set references to the match data for the entire string matched; this is
similar to backreferencing and does not work in filename generation. The
flag must be in effect at the end of the pattern, i.e. not local to a
@ -1550,7 +1550,7 @@ Unlike backreferences, there is no speed penalty for using match
references, other than the extra substitutions required for the
replacement strings in cases such as the example shown.
)
item(M)(
item(tt(M))(
Deactivate the tt(m) flag, hence no references to match data will be
created.
)
@ -1596,6 +1596,19 @@ the latter case the `tt((#b))' is useful for backreferences and the
`tt((#q.))' will be ignored. Note that colon modifiers in the glob
qualifiers are also not applied in ordinary pattern matching.
)
item(tt(u))(
Respect the current locale in determining the presence of multibyte
characters in a pattern, provided the shell was compiled with
tt(MULTIBYTE_SUPPORT). This overrides the tt(MULTIBYTE)
option; the default behaviour is taken from the option. Compare tt(U).
(Mnemonic: typically multibyte characters are from Unicode in the UTF-8
encoding, although any extension of ASCII supported by the system
library may be used.)
)
item(tt(U))(
All characters are considered to be a single byte long. The opposite
of tt(u). This overrides the tt(MULTIBYTE) option.
)
enditem()
For example, the test string tt(fooxx) can be matched by the pattern

View file

@ -411,6 +411,20 @@ item(tt(MARK_DIRS) (tt(-8), ksh: tt(-X)))(
Append a trailing `tt(/)' to all directory
names resulting from filename generation (globbing).
)
pindex(MULTIBYTE)
cindex(characters, multibyte, in expansion and globbing)
cindex(multibyte characters, in expansion and globbing)
item(tt(MULTIBYTE))(
Respect multibyte characters when found during pattern matching.
When this option is set, character strings are examined using the
system library to determine how many bytes form a character, depending
on the current locale. If the option is unset
(or the shell was not compiled with the configuration option
tt(MULTIBYTE_SUPPORT)) a single byte is always treated as a single
character. The option will eventually be extended to cover expansion.
Note, however, that it does not affect the shell's editor, which always
uses the locale to determine multibyte characters.
)
pindex(NOMATCH)
cindex(globbing, no matches)
item(tt(NOMATCH) (tt(PLUS()3)) <C> <Z>)(

View file

@ -33,20 +33,6 @@ typedef wchar_t *ZLE_STRING_T;
typedef wint_t ZLE_INT_T;
#define ZLE_CHAR_SIZE sizeof(wchar_t)
/*
* MB_CUR_MAX is the maximum number of bytes that a single wide
* character will convert into. We use it to keep strings
* sufficiently long. It should always be defined, but if it isn't
* just assume we are using Unicode which requires 6 characters.
* (Note that it's not necessarily defined to a constant.)
*/
#ifndef MB_CUR_MAX
#define MB_CUR_MAX 6
#endif
/* Convert character or string to wide character or string */
#define ZWC(c) L ## c
#define ZWS(s) L ## s
#define ZLEEOF WEOF
@ -96,10 +82,6 @@ typedef char *ZLE_STRING_T;
typedef int ZLE_INT_T;
#define ZLE_CHAR_SIZE sizeof(ZLE_CHAR_T)
/* Leave character or string as is. */
#define ZWC(c) c
#define ZWS(s) s
#define ZLEEOF EOF
/* Functions that operate on a ZLE_STRING_T. */

View file

@ -166,6 +166,7 @@ static struct optname optns[] = {
{{NULL, "markdirs", 0}, MARKDIRS},
{{NULL, "menucomplete", 0}, MENUCOMPLETE},
{{NULL, "monitor", OPT_SPECIAL}, MONITOR},
{{NULL, "multibyte", 0/*TBD*/}, MULTIBYTE},
{{NULL, "multios", OPT_EMULATE|OPT_ZSH}, MULTIOS},
{{NULL, "nomatch", OPT_EMULATE|OPT_NONBOURNE},NOMATCH},
{{NULL, "notify", OPT_ZSH}, NOTIFY},

File diff suppressed because it is too large Load diff

View file

@ -2702,7 +2702,7 @@ wcsiident(wchar_t c)
} else if (len == 1 && iascii(*outstr)) {
return iident(*outstr);
} else {
/* not currently allowed, see above */
/* TODO: not currently allowed, see above */
return 0;
}
}

View file

@ -1161,6 +1161,7 @@ struct patprog {
#define GF_IGNCASE 0x0200
#define GF_BACKREF 0x0400
#define GF_MATCHREF 0x0800
#define GF_MULTIBYTE 0x1000 /* Use multibyte if supported by build */
/* Dummy Patprog pointers. Used mainly in executable code, but the
* pattern code needs to know about it, too. */
@ -1596,6 +1597,7 @@ enum {
MARKDIRS,
MENUCOMPLETE,
MONITOR,
MULTIBYTE,
MULTIOS,
NOMATCH,
NOTIFY,
@ -1924,4 +1926,26 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
#define MB_INCOMPLETE ((size_t)-2)
#define MB_INVALID ((size_t)-1)
/*
* MB_CUR_MAX is the maximum number of bytes that a single wide
* character will convert into. We use it to keep strings
* sufficiently long. It should always be defined, but if it isn't
* just assume we are using Unicode which requires 6 characters.
* (Note that it's not necessarily defined to a constant.)
*/
#ifndef MB_CUR_MAX
#define MB_CUR_MAX 6
#endif
/* Convert character or string to wide character or string */
#define ZWC(c) L ## c
#define ZWS(s) L ## s
#else
/* Leave character or string as is. */
#define ZWC(c) c
#define ZWS(s) s
#endif