1
0
Fork 0
mirror of git://git.code.sf.net/p/zsh/code synced 2025-09-03 10:21:46 +02:00

Jon Strait: 26778, 26781: extra options for PCRE matching

This commit is contained in:
Peter Stephenson 2009-03-25 11:29:11 +00:00
parent aa3942d2d1
commit 418671fdb0
3 changed files with 87 additions and 12 deletions

View file

@ -1,5 +1,8 @@
2009-03-25 Peter Stephenson <pws@csr.com>
* Jon Strait: 26778, 26781: Doc/Zsh/mod_pcre.yo,
Src/Modules/pcre.c: a couple of extra options for PCRE matching.
* Michael Hwang: 26776: Src/builtin.c: improved column alignment
with print -c -P.
@ -11487,5 +11490,5 @@
*****************************************************
* This is used by the shell to define $ZSH_PATCHLEVEL
* $Revision: 1.4636 $
* $Revision: 1.4637 $
*****************************************************

View file

@ -6,7 +6,7 @@ The tt(zsh/pcre) module makes some commands available as builtins:
startitem()
findex(pcre_compile)
item(tt(pcre_compile) [ tt(-aimx) ] var(PCRE))(
item(tt(pcre_compile) [ tt(-aimxs) ] var(PCRE))(
Compiles a perl-compatible regular expression.
Option tt(-a) will force the pattern to be anchored.
@ -15,6 +15,8 @@ Option tt(-m) will compile a multi-line pattern; that is,
tt(^) and tt($) will match newlines within the pattern.
Option tt(-x) will compile an extended pattern, wherein
whitespace and tt(#) comments are ignored.
Option tt(-s) makes the dot metacharacter match all characters,
including those that indicate newline.
)
findex(pcre_study)
item(tt(pcre_study))(
@ -22,7 +24,8 @@ Studies the previously-compiled PCRE which may result in faster
matching.
)
findex(pcre_match)
item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] var(string))(
item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
[ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
Returns successfully if var(string) matches the previously-compiled
PCRE.
@ -35,6 +38,36 @@ var(MATCH) will be set to the entire matched portion of the
string, unless the tt(-v) option is given, in which case the variable
var(var) will be set.
No variables are altered if there is no successful match.
The tt(-n) option starts searching for a match from the byte position
var(offset) in var(string). If the tt(-b) option is given,
the variable var(ZPCRE_OP) will be set to an offset pair string,
representing the byte offset positions of the entire matched portion
within the var(string). For example, a var(ZPCRE_OP) set to "32 45" indicates
that the matched portion began on byte offset 32 and ended on byte offset 44.
Here, byte offset position 45 is the position directly after the matched
portion. Keep in mind that the byte position isn't necessarily the same
as the character position when UTF-8 characters are involved.
Consequently, the byte offset positions should only be relied on when
using them for subsequent searches on var(string), by passing an offset
position as the argument to the tt(-n) option. This is mostly
used to implement the "find all non-overlapping matches" functionality.
A simple example of "find all non-overlapping matches":
example(
string="The following zip codes: 78884 90210 99513"
pcre_compile -m "\d{5}"
accum=()
pcre_match -b -- $string
while [[ $? -eq 0 ]] do
b=($=ZPCRE_OP)
accum+=$MATCH
pcre_match -b -n $b[2] -- $string
done
print -l $accum
)
)
enditem()

View file

@ -82,6 +82,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
if (zpcre_utf8_enabled())
pcre_opts |= PCRE_UTF8;
@ -137,9 +138,11 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f
/**/
static int
zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar,
int want_offset_pair, int matchedinarr)
{
char **captures, *match_all, **matches;
char offset_all[50];
int capture_start = 1;
if (matchedinarr)
@ -148,9 +151,14 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
matchvar = "MATCH";
if (substravar == NULL)
substravar = "match";
/* captures[0] will be entire matched string, [1] first substring */
if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
/* Set to the offsets of the complete match */
if (want_offset_pair) {
sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
setsparam("ZPCRE_OP", ztrdup(offset_all));
}
match_all = ztrdup(captures[0]);
setsparam(matchvar, match_all);
matches = zarrdup(&captures[capture_start]);
@ -161,6 +169,22 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
return 0;
}
/**/
static int
getposint(char *instr, char *nam)
{
    /*
     * Convert instr to a non-negative decimal integer.
     *
     * instr: text to convert (an option argument supplied by the user).
     * nam:   name handed to zwarnnam() when the value is rejected.
     *
     * Returns the converted value on success.  Returns -1, after issuing
     * an "integer expected" warning, when instr is empty, when characters
     * remain after the number, or when the result is negative.
     */
    char *eptr;
    int ret;

    /*
     * NOTE(review): the cast narrows zstrtol()'s result to int, so a value
     * outside the range of int is presumably not detected here -- confirm
     * against zstrtol()'s return type and range-check before narrowing.
     */
    ret = (int)zstrtol(instr, &eptr, 10);

    /*
     * An empty string contains no digits; without the first test it would
     * be silently accepted as 0 because nothing is left over in *eptr.
     */
    if (!*instr || *eptr || ret < 0) {
        zwarnnam(nam, "integer expected: %s", instr);
        return -1;
    }

    return ret;
}
/**/
static int
bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
@ -169,6 +193,10 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
char *matched_portion = NULL;
char *receptacle = NULL;
int return_value = 1;
/* The subject length and offset start are both int values in pcre_exec */
int subject_len;
int offset_start = 0;
int want_offset_pair = 0;
if (pcre_pattern == NULL) {
zwarnnam(nam, "no pattern has been compiled");
@ -181,6 +209,12 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
if(OPT_HASARG(ops,c='v')) {
matched_portion = OPT_ARG(ops,c);
}
if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
offset_start = getposint(OPT_ARG(ops,c), nam);
}
/* For the entire match, 'Return' the offset byte positions instead of the matched string */
if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
if(!*args) {
zwarnnam(nam, "not enough arguments");
}
@ -194,12 +228,17 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
ovecsize = (capcount+1)*3;
ovec = zalloc(ovecsize*sizeof(int));
ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
subject_len = (int)strlen(*args);
if (offset_start < 0 || offset_start >= subject_len)
ret = PCRE_ERROR_NOMATCH;
else
ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
if (ret==0) return_value = 0;
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
else if (ret>0) {
zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
return_value = 0;
}
else {
@ -258,7 +297,7 @@ cond_pcre_match(char **a, int id)
break;
}
else if (r>0) {
zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
return_value = 1;
break;
}
@ -289,8 +328,8 @@ static struct conddef cotab[] = {
#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
static struct builtin bintab[] = {
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx", NULL),
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:", NULL),
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:n:b", NULL),
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
};