summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peter Stephenson <pws@users.sourceforge.net> 2009-03-25 11:29:11 +0000
committer Peter Stephenson <pws@users.sourceforge.net> 2009-03-25 11:29:11 +0000
commit 418671fdb06c1920414056f9b47245aa062f7b6f (patch)
tree ee816b81474fb296f014c1901a2e433f452e1314
parent aa3942d2d121ae3cab753d892a81eae53e03b870 (diff)
download zsh-418671fdb06c1920414056f9b47245aa062f7b6f.tar.gz
zsh-418671fdb06c1920414056f9b47245aa062f7b6f.zip
Jon Strait: 26778, 26781: extra options for PCRE matching
-rw-r--r-- ChangeLog 5
-rw-r--r-- Doc/Zsh/mod_pcre.yo 37
-rw-r--r-- Src/Modules/pcre.c 57
3 files changed, 87 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index 52e2d6dd7..31f4d6a45 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2009-03-25 Peter Stephenson <pws@csr.com>
+ * Jon Strait: 26778, 26781: Doc/Zsh/mod_pcre.yo,
+ Src/Modules/pcre.c: a couple of extra options for PCRE matching.
+
* Michael Hwang: 26776: Src/builtin.c: improved column alignment
with print -c -P.
@@ -11487,5 +11490,5 @@
*****************************************************
* This is used by the shell to define $ZSH_PATCHLEVEL
-* $Revision: 1.4636 $
+* $Revision: 1.4637 $
*****************************************************
diff --git a/Doc/Zsh/mod_pcre.yo b/Doc/Zsh/mod_pcre.yo
index 33b864478..9b8d9d6a7 100644
--- a/Doc/Zsh/mod_pcre.yo
+++ b/Doc/Zsh/mod_pcre.yo
@@ -6,7 +6,7 @@ The tt(zsh/pcre) module makes some commands available as builtins:
startitem()
findex(pcre_compile)
-item(tt(pcre_compile) [ tt(-aimx) ] var(PCRE))(
+item(tt(pcre_compile) [ tt(-aimxs) ] var(PCRE))(
Compiles a perl-compatible regular expression.
Option tt(-a) will force the pattern to be anchored.
@@ -15,6 +15,8 @@ Option tt(-m) will compile a multi-line pattern; that is,
tt(^) and tt($) will match newlines within the pattern.
Option tt(-x) will compile an extended pattern, wherein
whitespace and tt(#) comments are ignored.
+Option tt(-s) makes the dot metacharacter match all characters,
+including those that indicate newline.
)
findex(pcre_study)
item(tt(pcre_study))(
@@ -22,7 +24,8 @@ Studies the previously-compiled PCRE which may result in faster
matching.
)
findex(pcre_match)
-item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] var(string))(
+item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
+[ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
Returns successfully if tt(string) matches the previously-compiled
PCRE.
@@ -35,6 +38,36 @@ var(MATCH) will be set to the entire matched portion of the
string, unless the tt(-v) option is given, in which case the variable
var(var) will be set.
No variables are altered if there is no successful match.
+The tt(-n) option starts searching for a match from the
+byte var(offset) position in var(string). If the tt(-b) option is given,
+the variable var(ZPCRE_OP) will be set to an offset pair string,
+representing the byte offset positions of the entire matched portion
+within the var(string). For example, a var(ZPCRE_OP) set to "32 45" indicates
+that the matched portion began on byte offset 32 and ended on byte offset 44.
+Here, byte offset position 45 is the position directly after the matched
+portion. Keep in mind that the byte position isn't necessarily the same
+as the character position when UTF-8 characters are involved.
+Consequently, the byte offset positions are only to be relied on in the
+context of using them for subsequent searches on var(string), using an offset
+position as an argument to the tt(-n) option. This is mostly
+used to implement the "find all non-overlapping matches" functionality.
+
+A simple example of "find all non-overlapping matches":
+
+example(
+string="The following zip codes: 78884 90210 99513"
+pcre_compile -m "\d{5}"
+accum=()
+pcre_match -b -- $string
+while [[ $? -eq 0 ]] do
+ b=($=ZPCRE_OP)
+ accum+=$MATCH
+ pcre_match -b -n $b[2] -- $string
+done
+print -l $accum
+
+
+)
)
enditem()
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 4f8daff80..08205d144 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -82,6 +82,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
+ if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
if (zpcre_utf8_enabled())
pcre_opts |= PCRE_UTF8;
@@ -137,9 +138,11 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f
/**/
static int
-zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
+zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar,
+ int want_offset_pair, int matchedinarr)
{
char **captures, *match_all, **matches;
+ char offset_all[50];
int capture_start = 1;
if (matchedinarr)
@@ -148,9 +151,14 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
matchvar = "MATCH";
if (substravar == NULL)
substravar = "match";
-
+
/* captures[0] will be entire matched string, [1] first substring */
- if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+ if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+ /* Set to the offsets of the complete match */
+ if (want_offset_pair) {
+ sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+ setsparam("ZPCRE_OP", ztrdup(offset_all));
+ }
match_all = ztrdup(captures[0]);
setsparam(matchvar, match_all);
matches = zarrdup(&captures[capture_start]);
@@ -163,12 +171,32 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
/**/
static int
+getposint(char *instr, char *nam)
+{ /* Parse instr as a non-negative base-10 integer; on failure warn under nam and return -1. */
+ char *eptr; /* end pointer filled in by zstrtol; presumably the first unparsed character, as with strtol -- confirm */
+ int ret;
+
+ ret = (int)zstrtol(instr, &eptr, 10); /* NOTE(review): the cast to int may truncate a wider result -- confirm zstrtol's return type */
+ if (*eptr || ret < 0) { /* reject leftover text after the number, and negative values */
+ zwarnnam(nam, "integer expected: %s", instr);
+ return -1; /* -1 is the error sentinel; valid results are always >= 0 */
+ }
+
+ return ret;
+}
+
+/**/
+static int
bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
{
int ret, capcount, *ovec, ovecsize, c;
char *matched_portion = NULL;
char *receptacle = NULL;
int return_value = 1;
+ /* The subject length and offset start are both int values in pcre_exec */
+ int subject_len;
+ int offset_start = 0;
+ int want_offset_pair = 0;
if (pcre_pattern == NULL) {
zwarnnam(nam, "no pattern has been compiled");
@@ -181,6 +209,12 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
if(OPT_HASARG(ops,c='v')) {
matched_portion = OPT_ARG(ops,c);
}
+ if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
+ offset_start = getposint(OPT_ARG(ops,c), nam);
+ }
+ /* With -b, also report the byte offsets of the entire match (in ZPCRE_OP) alongside the matched string */
+ if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
+
if(!*args) {
zwarnnam(nam, "not enough arguments");
}
@@ -194,12 +228,17 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
ovecsize = (capcount+1)*3;
ovec = zalloc(ovecsize*sizeof(int));
- ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
-
+ subject_len = (int)strlen(*args);
+
+ if (offset_start < 0 || offset_start >= subject_len)
+ ret = PCRE_ERROR_NOMATCH;
+ else
+ ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
+
if (ret==0) return_value = 0;
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
else if (ret>0) {
- zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
+ zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
return_value = 0;
}
else {
@@ -258,7 +297,7 @@ cond_pcre_match(char **a, int id)
break;
}
else if (r>0) {
- zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
+ zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
return_value = 1;
break;
}
@@ -289,8 +328,8 @@ static struct conddef cotab[] = {
#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
static struct builtin bintab[] = {
- BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx", NULL),
- BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:", NULL),
+ BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
+ BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:n:b", NULL),
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
};