mirror of
git://git.code.sf.net/p/zsh/code
synced 2025-09-29 19:00:57 +02:00
When we went off the end of the array but measured the length implicitly, we got lucky before. After 41308 we were looking up lengths in stale memory. Rename some variables, clean up the logic, be easier to understand. Add tests.
532 lines
13 KiB
C
532 lines
13 KiB
C
/*
|
|
* pcre.c - interface to the PCRE library
|
|
*
|
|
* This file is part of zsh, the Z shell.
|
|
*
|
|
* Copyright (c) 2001, 2002, 2003, 2004, 2007 Clint Adams
|
|
* All rights reserved.
|
|
*
|
|
* Permission is hereby granted, without written agreement and without
|
|
* license or royalty fees, to use, copy, modify, and distribute this
|
|
* software and to distribute modified versions of this software for any
|
|
* purpose, provided that the above copyright notice and the following
|
|
* two paragraphs appear in all copies of this software.
|
|
*
|
|
* In no event shall Clint Adams or the Zsh Development Group be liable
|
|
* to any party for direct, indirect, special, incidental, or consequential
|
|
* damages arising out of the use of this software and its documentation,
|
|
* even if Andrew Main and the Zsh Development Group have been advised of
|
|
* the possibility of such damage.
|
|
*
|
|
* Clint Adams and the Zsh Development Group specifically disclaim any
|
|
* warranties, including, but not limited to, the implied warranties of
|
|
* merchantability and fitness for a particular purpose. The software
|
|
* provided hereunder is on an "as is" basis, and Andrew Main and the
|
|
* Zsh Development Group have no obligation to provide maintenance,
|
|
* support, updates, enhancements, or modifications.
|
|
*
|
|
*/
|
|
|
|
|
|
#include "pcre.mdh"
|
|
#include "pcre.pro"
|
|
|
|
#define CPCRE_PLAIN 0
|
|
|
|
/**/
|
|
#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
|
|
#include <pcre.h>
|
|
|
|
static pcre *pcre_pattern;
|
|
static pcre_extra *pcre_hints;
|
|
|
|
/**/
|
|
static int
|
|
zpcre_utf8_enabled(void)
|
|
{
|
|
#if defined(MULTIBYTE_SUPPORT) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
|
|
static int have_utf8_pcre = -1;
|
|
|
|
/* value can toggle based on MULTIBYTE, so don't
|
|
* be too eager with caching */
|
|
if (have_utf8_pcre < -1)
|
|
return 0;
|
|
|
|
if (!isset(MULTIBYTE))
|
|
return 0;
|
|
|
|
if ((have_utf8_pcre == -1) &&
|
|
(!strcmp(nl_langinfo(CODESET), "UTF-8"))) {
|
|
|
|
if (pcre_config(PCRE_CONFIG_UTF8, &have_utf8_pcre))
|
|
have_utf8_pcre = -2; /* erk, failed to ask */
|
|
}
|
|
|
|
if (have_utf8_pcre < 0)
|
|
return 0;
|
|
return have_utf8_pcre;
|
|
|
|
#else
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
/**/
|
|
static int
|
|
bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
|
|
{
|
|
int pcre_opts = 0, pcre_errptr, target_len;
|
|
const char *pcre_error;
|
|
char *target;
|
|
|
|
if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
|
|
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
|
|
if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
|
|
if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
|
|
if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
|
|
|
|
if (zpcre_utf8_enabled())
|
|
pcre_opts |= PCRE_UTF8;
|
|
|
|
pcre_hints = NULL; /* Is this necessary? */
|
|
|
|
if (pcre_pattern)
|
|
pcre_free(pcre_pattern);
|
|
|
|
target = ztrdup(*args);
|
|
unmetafy(target, &target_len);
|
|
|
|
if ((int)strlen(target) != target_len) {
|
|
zwarnnam(nam, "embedded NULs in PCRE pattern terminate pattern");
|
|
}
|
|
|
|
pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
|
|
|
|
free(target);
|
|
|
|
if (pcre_pattern == NULL)
|
|
{
|
|
zwarnnam(nam, "error in regex: %s", pcre_error);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**/
|
|
#ifdef HAVE_PCRE_STUDY
|
|
|
|
/**/
|
|
static int
|
|
bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int func))
|
|
{
|
|
const char *pcre_error;
|
|
|
|
if (pcre_pattern == NULL)
|
|
{
|
|
zwarnnam(nam, "no pattern has been compiled for study");
|
|
return 1;
|
|
}
|
|
|
|
pcre_hints = pcre_study(pcre_pattern, 0, &pcre_error);
|
|
if (pcre_error != NULL)
|
|
{
|
|
zwarnnam(nam, "error while studying regex: %s", pcre_error);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**/
|
|
#else /* !HAVE_PCRE_STUDY */
|
|
|
|
# define bin_pcre_study bin_notavail
|
|
|
|
/**/
|
|
#endif /* !HAVE_PCRE_STUDY */
|
|
|
|
/**/
|
|
static int
|
|
zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
|
|
char *substravar, int want_offset_pair, int matchedinarr,
|
|
int want_begin_end)
|
|
{
|
|
char **captures, *match_all, **matches;
|
|
char offset_all[50];
|
|
int capture_start = 1;
|
|
|
|
if (matchedinarr) {
|
|
/* bash-style captures[0] entire-matched string in the array */
|
|
capture_start = 0;
|
|
}
|
|
|
|
/* captures[0] will be entire matched string, [1] first substring */
|
|
if (!pcre_get_substring_list(arg, ovec, captured_count, (const char ***)&captures)) {
|
|
int nelem = arrlen(captures)-1;
|
|
/* Set to the offsets of the complete match */
|
|
if (want_offset_pair) {
|
|
sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
|
|
setsparam("ZPCRE_OP", ztrdup(offset_all));
|
|
}
|
|
/*
|
|
* Result strings can contain embedded NULs; the length of each is the
|
|
* difference between the two values in each paired entry in ovec.
|
|
* ovec is length 2*(1+capture_list_length)
|
|
*/
|
|
if (matchvar) {
|
|
match_all = metafy(captures[0], ovec[1] - ovec[0], META_DUP);
|
|
setsparam(matchvar, match_all);
|
|
}
|
|
/*
|
|
* If we're setting match, mbegin, mend we only do
|
|
* so if there were parenthesised matches, for consistency
|
|
* (c.f. regex.c). That's the next block after this one.
|
|
* Here we handle the simpler case where we don't worry about
|
|
* Unicode lengths, etc.
|
|
* Either !want_begin_end (ie, this is bash) or nelem; if bash
|
|
* then we're invoked always, even without nelem results, to
|
|
* set the array variable with one element in it, the complete match.
|
|
*/
|
|
if (substravar &&
|
|
(!want_begin_end || nelem)) {
|
|
char **x, **y;
|
|
int vec_off, i;
|
|
y = &captures[capture_start];
|
|
matches = x = (char **) zalloc(sizeof(char *) * (captured_count+1-capture_start));
|
|
for (i = capture_start; i < captured_count; i++, y++) {
|
|
vec_off = 2*i;
|
|
if (*y)
|
|
*x++ = metafy(*y, ovec[vec_off+1]-ovec[vec_off], META_DUP);
|
|
else
|
|
*x++ = NULL;
|
|
}
|
|
*x = NULL;
|
|
setaparam(substravar, matches);
|
|
}
|
|
|
|
if (want_begin_end) {
|
|
/*
|
|
* cond-infix rather than builtin; also not bash; so we set a bunch
|
|
* of variables and arrays to values which require handling Unicode
|
|
* lengths
|
|
*/
|
|
char *ptr = arg;
|
|
zlong offs = 0;
|
|
int clen, leftlen;
|
|
|
|
/* Count the characters before the match */
|
|
MB_CHARINIT();
|
|
leftlen = ovec[0];
|
|
while (leftlen) {
|
|
offs++;
|
|
clen = MB_CHARLEN(ptr, leftlen);
|
|
ptr += clen;
|
|
leftlen -= clen;
|
|
}
|
|
setiparam("MBEGIN", offs + !isset(KSHARRAYS));
|
|
/* Add on the characters in the match */
|
|
leftlen = ovec[1] - ovec[0];
|
|
while (leftlen) {
|
|
offs++;
|
|
clen = MB_CHARLEN(ptr, leftlen);
|
|
ptr += clen;
|
|
leftlen -= clen;
|
|
}
|
|
setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
|
|
if (nelem) {
|
|
char **mbegin, **mend, **bptr, **eptr;
|
|
int i, *ipair;
|
|
|
|
bptr = mbegin = zalloc(sizeof(char*)*(nelem+1));
|
|
eptr = mend = zalloc(sizeof(char*)*(nelem+1));
|
|
|
|
for (ipair = ovec + 2, i = 0;
|
|
i < nelem;
|
|
ipair += 2, i++, bptr++, eptr++)
|
|
{
|
|
char buf[DIGBUFSIZE];
|
|
ptr = arg;
|
|
offs = 0;
|
|
/* Find the start offset */
|
|
MB_CHARINIT();
|
|
leftlen = ipair[0];
|
|
while (leftlen > 0) {
|
|
offs++;
|
|
clen = MB_CHARLEN(ptr, leftlen);
|
|
ptr += clen;
|
|
leftlen -= clen;
|
|
}
|
|
convbase(buf, offs + !isset(KSHARRAYS), 10);
|
|
*bptr = ztrdup(buf);
|
|
/* Continue to the end offset */
|
|
leftlen = ipair[1] - ipair[0];
|
|
while (leftlen) {
|
|
offs++;
|
|
clen = MB_CHARLEN(ptr, leftlen);
|
|
ptr += clen;
|
|
leftlen -= clen;
|
|
}
|
|
convbase(buf, offs + !isset(KSHARRAYS) - 1, 10);
|
|
*eptr = ztrdup(buf);
|
|
}
|
|
*bptr = *eptr = NULL;
|
|
|
|
setaparam("mbegin", mbegin);
|
|
setaparam("mend", mend);
|
|
}
|
|
}
|
|
|
|
pcre_free_substring_list((const char **)captures);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**/
|
|
static int
|
|
getposint(char *instr, char *nam)
|
|
{
|
|
char *eptr;
|
|
int ret;
|
|
|
|
ret = (int)zstrtol(instr, &eptr, 10);
|
|
if (*eptr || ret < 0) {
|
|
zwarnnam(nam, "integer expected: %s", instr);
|
|
return -1;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**/
|
|
static int
|
|
bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|
{
|
|
int ret, capcount, *ovec, ovecsize, c;
|
|
char *matched_portion = NULL;
|
|
char *plaintext = NULL;
|
|
char *receptacle = NULL;
|
|
int return_value = 1;
|
|
/* The subject length and offset start are both int values in pcre_exec */
|
|
int subject_len;
|
|
int offset_start = 0;
|
|
int want_offset_pair = 0;
|
|
|
|
if (pcre_pattern == NULL) {
|
|
zwarnnam(nam, "no pattern has been compiled");
|
|
return 1;
|
|
}
|
|
|
|
matched_portion = "MATCH";
|
|
receptacle = "match";
|
|
if(OPT_HASARG(ops,c='a')) {
|
|
receptacle = OPT_ARG(ops,c);
|
|
}
|
|
if(OPT_HASARG(ops,c='v')) {
|
|
matched_portion = OPT_ARG(ops,c);
|
|
}
|
|
if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
|
|
if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
|
|
return 1;
|
|
}
|
|
/* For the entire match, 'Return' the offset byte positions instead of the matched string */
|
|
if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
|
|
|
|
if ((ret = pcre_fullinfo(pcre_pattern, pcre_hints, PCRE_INFO_CAPTURECOUNT, &capcount)))
|
|
{
|
|
zwarnnam(nam, "error %d in fullinfo", ret);
|
|
return 1;
|
|
}
|
|
|
|
ovecsize = (capcount+1)*3;
|
|
ovec = zalloc(ovecsize*sizeof(int));
|
|
|
|
plaintext = ztrdup(*args);
|
|
unmetafy(plaintext, &subject_len);
|
|
|
|
if (offset_start > 0 && offset_start >= subject_len)
|
|
ret = PCRE_ERROR_NOMATCH;
|
|
else
|
|
ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
|
|
|
|
if (ret==0) return_value = 0;
|
|
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
|
|
else if (ret>0) {
|
|
zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
|
|
want_offset_pair, 0, 0);
|
|
return_value = 0;
|
|
}
|
|
else {
|
|
zwarnnam(nam, "error in pcre_exec [%d]", ret);
|
|
}
|
|
|
|
if (ovec)
|
|
zfree(ovec, ovecsize*sizeof(int));
|
|
|
|
return return_value;
|
|
}
|
|
|
|
/**/
|
|
static int
|
|
cond_pcre_match(char **a, int id)
|
|
{
|
|
pcre *pcre_pat;
|
|
const char *pcre_err;
|
|
char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar, *svar;
|
|
int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
|
|
int lhstr_plain_len, rhre_plain_len;
|
|
int return_value = 0;
|
|
|
|
if (zpcre_utf8_enabled())
|
|
pcre_opts |= PCRE_UTF8;
|
|
if (isset(REMATCHPCRE) && !isset(CASEMATCH))
|
|
pcre_opts |= PCRE_CASELESS;
|
|
|
|
lhstr = cond_str(a,0,0);
|
|
rhre = cond_str(a,1,0);
|
|
lhstr_plain = ztrdup(lhstr);
|
|
rhre_plain = ztrdup(rhre);
|
|
unmetafy(lhstr_plain, &lhstr_plain_len);
|
|
unmetafy(rhre_plain, &rhre_plain_len);
|
|
pcre_pat = NULL;
|
|
ov = NULL;
|
|
ovsize = 0;
|
|
|
|
if (isset(BASHREMATCH)) {
|
|
svar = NULL;
|
|
avar = "BASH_REMATCH";
|
|
} else {
|
|
svar = "MATCH";
|
|
avar = "match";
|
|
}
|
|
|
|
switch(id) {
|
|
case CPCRE_PLAIN:
|
|
if ((int)strlen(rhre_plain) != rhre_plain_len) {
|
|
zwarn("embedded NULs in PCRE pattern terminate pattern");
|
|
}
|
|
pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
|
|
if (pcre_pat == NULL) {
|
|
zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
|
|
break;
|
|
}
|
|
pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
|
|
ovsize = (capcnt+1)*3;
|
|
ov = zalloc(ovsize*sizeof(int));
|
|
r = pcre_exec(pcre_pat, NULL, lhstr_plain, lhstr_plain_len, 0, 0, ov, ovsize);
|
|
/* r < 0 => error; r==0 match but not enough size in ov
|
|
* r > 0 => (r-1) substrings found; r==1 => no substrings
|
|
*/
|
|
if (r==0) {
|
|
zwarn("reportable zsh problem: pcre_exec() returned 0");
|
|
return_value = 1;
|
|
break;
|
|
}
|
|
else if (r==PCRE_ERROR_NOMATCH) {
|
|
return_value = 0; /* no match */
|
|
break;
|
|
}
|
|
else if (r<0) {
|
|
zwarn("pcre_exec() error [%d]", r);
|
|
break;
|
|
}
|
|
else if (r>0) {
|
|
zpcre_get_substrings(lhstr_plain, ov, r, svar, avar, 0,
|
|
isset(BASHREMATCH),
|
|
!isset(BASHREMATCH));
|
|
return_value = 1;
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
|
|
if (lhstr_plain)
|
|
free(lhstr_plain);
|
|
if(rhre_plain)
|
|
free(rhre_plain);
|
|
if (pcre_pat)
|
|
pcre_free(pcre_pat);
|
|
if (ov)
|
|
zfree(ov, ovsize*sizeof(int));
|
|
|
|
return return_value;
|
|
}
|
|
|
|
static struct conddef cotab[] = {
|
|
CONDDEF("pcre-match", CONDF_INFIX, cond_pcre_match, 0, 0, CPCRE_PLAIN)
|
|
/* CONDDEF can register =~ but it won't be found */
|
|
};
|
|
|
|
/**/
|
|
#else /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
|
|
|
|
# define bin_pcre_compile bin_notavail
|
|
# define bin_pcre_study bin_notavail
|
|
# define bin_pcre_match bin_notavail
|
|
|
|
/**/
|
|
#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
|
|
|
|
static struct builtin bintab[] = {
|
|
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
|
|
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:n:b", NULL),
|
|
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
|
|
};
|
|
|
|
|
|
static struct features module_features = {
|
|
bintab, sizeof(bintab)/sizeof(*bintab),
|
|
#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
|
|
cotab, sizeof(cotab)/sizeof(*cotab),
|
|
#else /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
|
|
NULL, 0,
|
|
#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
|
|
NULL, 0,
|
|
NULL, 0,
|
|
0
|
|
};
|
|
|
|
|
|
/**/
|
|
int
|
|
setup_(UNUSED(Module m))
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
/**/
|
|
int
|
|
features_(Module m, char ***features)
|
|
{
|
|
*features = featuresarray(m, &module_features);
|
|
return 0;
|
|
}
|
|
|
|
/**/
|
|
int
|
|
enables_(Module m, int **enables)
|
|
{
|
|
return handlefeatures(m, &module_features, enables);
|
|
}
|
|
|
|
/**/
|
|
int
|
|
boot_(UNUSED(Module m))
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
/**/
|
|
int
|
|
cleanup_(Module m)
|
|
{
|
|
return setfeatureenables(m, &module_features, NULL);
|
|
}
|
|
|
|
/**/
|
|
int
|
|
finish_(UNUSED(Module m))
|
|
{
|
|
return 0;
|
|
}
|