Add a regex crate to the Rust distribution.
Also adds a regex_macros crate, which provides natively compiled regular expressions with a syntax extension. Closes #3591. RFC: 0007-regexps
parent 66486518d5
commit b8b7484703
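In short, the two new crates are used together like this (taken from the crate
documentation added in src/libregex/lib.rs below; this is 2014-era Rust, so the
`phase` feature and the `regex!` syntax extension shown here reflect the APIs
of that time):

    #![feature(phase)]
    #[phase(syntax)]
    extern crate regex_macros;
    extern crate regex;

    fn main() {
        // Compiled to native Rust code at build time by the syntax extension.
        let re = regex!(r"^\d{4}-\d{2}-\d{2}$");
        assert!(re.is_match("2014-01-01"));
    }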
@@ -51,8 +51,8 @@
 TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
                  uuid serialize sync getopts collections num test time rand \
-                 workcache url log
-HOST_CRATES := syntax rustc rustdoc fourcc hexfloat
+                 workcache url log regex
+HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros
 CRATES := $(TARGET_CRATES) $(HOST_CRATES)
 TOOLS := compiletest rustdoc rustc

@@ -84,6 +84,8 @@ DEPS_rand := std
 DEPS_url := std collections
 DEPS_workcache := std serialize collections log
 DEPS_log := std sync
+DEPS_regex := std collections
+DEPS_regex_macros = syntax std regex

 TOOL_DEPS_compiletest := test green rustuv getopts
 TOOL_DEPS_rustdoc := rustdoc native
@@ -19,6 +19,7 @@ Source layout:
 | `libfourcc/`     | Data format identifier library |
 | `libgetopts/`    | Get command-line-options library |
 | `libglob/`       | Unix glob patterns library |
+| `libregex/`      | Regular expressions |
 | `libsemver/`     | Rust's semantic versioning library |
 | `libserialize/`  | Encode-Decode types library |
 | `libsync/`       | Concurrency mechanisms and primitives |
@@ -41,6 +41,7 @@ li {list-style-type: none; }
 * [The `native` 1:1 threading runtime](native/index.html)
 * [The `num` arbitrary precision numerics library](num/index.html)
 * [The `rand` library for random numbers and distributions](rand/index.html)
+* [The `regex` library for regular expressions](regex/index.html)
 * [The `rustc` compiler](rustc/index.html)
 * [The `rustuv` M:N I/O library](rustuv/index.html)
 * [The `semver` version collation library](semver/index.html)
src/etc/regex-match-tests.py (new executable file, 109 lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
import datetime
import os.path as path


def print_tests(tests):
    print('\n'.join([test_tostr(t) for t in tests]))


def read_tests(f):
    basename, _ = path.splitext(path.basename(f))
    tests = []
    for lineno, line in enumerate(open(f), 1):
        fields = filter(None, map(str.strip, line.split('\t')))
        if not (4 <= len(fields) <= 5) \
                or 'E' not in fields[0] or fields[0][0] == '#':
            continue

        opts, pat, text, sgroups = fields[0:4]
        groups = []  # groups as integer ranges
        if sgroups == 'NOMATCH':
            groups = [None]
        elif ',' in sgroups:
            noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
            for g in noparen:
                s, e = map(str.strip, g.split(','))
                if s == '?' and e == '?':
                    groups.append(None)
                else:
                    groups.append((int(s), int(e)))
        else:
            # This skips tests that should result in an error.
            # There aren't many, so I think we can just capture those
            # manually. Possibly fix this in future.
            continue

        if pat == 'SAME':
            pat = tests[-1][1]
        if '$' in opts:
            pat = pat.decode('string_escape')
            text = text.decode('string_escape')
        if 'i' in opts:
            pat = '(?i)%s' % pat

        name = '%s_%d' % (basename, lineno)
        tests.append((name, pat, text, groups))
    return tests


def test_tostr(t):
    lineno, pat, text, groups = t
    options = map(group_tostr, groups)
    return 'mat!(match_%s, r"%s", r"%s", %s)' \
           % (lineno, pat, '' if text == "NULL" else text, ', '.join(options))


def group_tostr(g):
    if g is None:
        return 'None'
    else:
        return 'Some((%d, %d))' % (g[0], g[1])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    aa = parser.add_argument
    aa('files', nargs='+',
       help='A list of dat AT&T POSIX test files. See src/libregexp/testdata')
    args = parser.parse_args()

    tests = []
    for f in args.files:
        tests += read_tests(f)

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// ignore-tidy-linelength

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests'
// on {date}.
'''
    print(tpl.format(date=str(datetime.datetime.now())))

    for f in args.files:
        print('// Tests from %s' % path.basename(f))
        print_tests(read_tests(f))
        print('')
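For example (an editorial sketch; the input line is made up but follows the
tab-separated AT&T format the script parses: options, pattern, text, group
spans), a line such as

    E    a(b)c    abc    (0,3)(1,2)

taken from a hypothetical basic.dat at line 3 would be emitted as this Rust
test-macro invocation:

    mat!(match_basic_3, r"a(b)c", r"abc", Some((0, 3)), Some((1, 2)))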
src/etc/regex-unicode-tables.py (new executable file, 183 lines)
@@ -0,0 +1,183 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2

BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}


def as_4byte_uni(n):
    s = hex(n)[2:]
    return '\\U%s%s' % ('0' * (8 - len(s)), s)


def expand_cat(c):
    return expanded_categories.get(c, []) + [c]


def is_valid_unicode(n):
    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF


def read_cats(f):
    assigned = defaultdict(list)
    for row in csv.reader(f, delimiter=';'):
        (hex, cats) = (int(row[0], 16), expand_cat(row[2]))
        if not is_valid_unicode(hex):
            continue
        for cat in cats:
            assigned[cat].append(hex)
    return assigned


def read_scripts(f):
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        hexes, name = map(str.strip, line.split(';'))[:2]
        name = name[:name.index('#')].strip()
        if '..' not in hexes:
            hex = int(hexes, 16)
            if is_valid_unicode(hex):
                assigned[name].append(hex)
        else:
            hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..'))
            for hex in xrange(hex1, hex2 + 1):
                if is_valid_unicode(hex):
                    assigned[name].append(hex)
    return assigned


def group(letters):
    letters = sorted(set(letters))
    grouped = []
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        assert letter > cur_end, \
            'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter))

        if letter == cur_end + 1:
            cur_end = letter
        else:
            grouped.append((cur_start, cur_end))
            cur_start, cur_end = letter, letter
    grouped.append((cur_start, cur_end))
    return grouped


def ranges_to_rust(rs):
    rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
    return ',\n '.join(rs)


def groups_to_rust(groups):
    rust_groups = []
    for group_name in sorted(groups):
        rust_groups.append('("%s", &[\n %s\n ]),'
                           % (group_name, ranges_to_rust(groups[group_name])))
    return '\n'.join(rust_groups)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    if args.local:
        cats = read_cats(open(DATA))
        scripts = read_scripts(open(SCRIPTS))
    else:
        cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
        scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))

    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    perld = range(ord('0'), ord('9') + 1)
    dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))

    perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
    sgroups = ranges_to_rust(group(perls + cats['Z'][:]))

    low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L'][:]))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.

use parse::{{Class, NamedClasses}};

pub static UNICODE_CLASSES: NamedClasses = &[

{groups}

];

pub static PERLD: Class = &[
{dgroups}
];

pub static PERLS: Class = &[
{sgroups}
];

pub static PERLW: Class = &[
{wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
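For reference, `group` collapses a sorted set of codepoints into inclusive
ranges (e.g. [0x41, 0x42, 0x43, 0x61] becomes [(0x41, 0x43), (0x61, 0x61)]),
and `ranges_to_rust`/`as_4byte_uni` render each range as a pair of 8-digit
`\U` character literals. A generated table therefore looks roughly like this
(an editorial sketch, not actual script output):

    pub static PERLD: Class = &[
        ('\U00000030', '\U00000039'),
        // ... further ranges drawn from \p{Nd} ...
    ];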
src/libregex/compile.rs (new file, 274 lines)
@@ -0,0 +1,274 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Enable this to squash warnings due to exporting pieces of the representation
// for use with the regex! macro. See lib.rs for explanation.
#![allow(visible_private_types)]

use std::cmp;
use std::iter;
use parse;
use parse::{
    Flags, FLAG_EMPTY,
    Nothing, Literal, Dot, Class, Begin, End, WordBoundary, Capture, Cat, Alt,
    Rep,
    ZeroOne, ZeroMore, OneMore,
};

type InstIdx = uint;

#[deriving(Show, Clone)]
pub enum Inst {
    // When a Match instruction is executed, the current thread is successful.
    Match,

    // The OneChar instruction matches a literal character.
    // The flags indicate whether to do a case insensitive match.
    OneChar(char, Flags),

    // The CharClass instruction tries to match one input character against
    // the range of characters given.
    // The flags indicate whether to do a case insensitive match and whether
    // the character class is negated or not.
    CharClass(Vec<(char, char)>, Flags),

    // Matches any character except new lines.
    // The flags indicate whether to include the '\n' character.
    Any(Flags),

    // Matches the beginning of the string, consumes no characters.
    // The flags indicate whether it matches if the preceding character
    // is a new line.
    EmptyBegin(Flags),

    // Matches the end of the string, consumes no characters.
    // The flags indicate whether it matches if the following character
    // is a new line.
    EmptyEnd(Flags),

    // Matches a word boundary (\w on one side and \W \A or \z on the other),
    // and consumes no character.
    // The flags indicate whether this matches a word boundary or something
    // that isn't a word boundary.
    EmptyWordBoundary(Flags),

    // Saves the current position in the input string to the Nth save slot.
    Save(uint),

    // Jumps to the instruction at the index given.
    Jump(InstIdx),

    // Jumps to the instruction at the first index given. If that leads to
    // a failing state, then the instruction at the second index given is
    // tried.
    Split(InstIdx, InstIdx),
}

/// Program represents a compiled regular expression. Once an expression is
/// compiled, its representation is immutable and will never change.
///
/// All of the data in a compiled expression is wrapped in "MaybeStatic" or
/// "MaybeOwned" types so that a `Program` can be represented as static data.
/// (This makes it convenient and efficient for use with the `regex!` macro.)
#[deriving(Clone)]
pub struct Program {
    /// A sequence of instructions.
    pub insts: Vec<Inst>,
    /// If the regular expression requires a literal prefix in order to have a
    /// match, that prefix is stored here. (It's used in the VM to implement
    /// an optimization.)
    pub prefix: ~str,
}

impl Program {
    /// Compiles a Regex given its AST.
    pub fn new(ast: ~parse::Ast) -> (Program, ~[Option<~str>]) {
        let mut c = Compiler {
            insts: Vec::with_capacity(100),
            names: Vec::with_capacity(10),
        };

        c.insts.push(Save(0));
        c.compile(ast);
        c.insts.push(Save(1));
        c.insts.push(Match);

        // Try to discover a literal string prefix.
        // This is a bit hacky since we have to skip over the initial
        // 'Save' instruction.
        let mut pre = StrBuf::with_capacity(5);
        for i in iter::range(1, c.insts.len()) {
            match *c.insts.get(i) {
                OneChar(c, FLAG_EMPTY) => pre.push_char(c),
                _ => break
            }
        }

        let names = c.names.as_slice().into_owned();
        let prog = Program {
            insts: c.insts,
            prefix: pre.into_owned(),
        };
        (prog, names)
    }

    /// Returns the total number of capture groups in the regular expression.
    /// This includes the zeroth capture.
    pub fn num_captures(&self) -> uint {
        let mut n = 0;
        for inst in self.insts.iter() {
            match *inst {
                Save(c) => n = cmp::max(n, c+1),
                _ => {}
            }
        }
        // There's exactly 2 Save slots for every capture.
        n / 2
    }
}

struct Compiler<'r> {
    insts: Vec<Inst>,
    names: Vec<Option<~str>>,
}

// The compiler implemented here is extremely simple. Most of the complexity
// in this crate is in the parser or the VM.
// The only tricky thing here is patching jump/split instructions to point to
// the right instruction.
impl<'r> Compiler<'r> {
    fn compile(&mut self, ast: ~parse::Ast) {
        match ast {
            ~Nothing => {},
            ~Literal(c, flags) => self.push(OneChar(c, flags)),
            ~Dot(nl) => self.push(Any(nl)),
            ~Class(ranges, flags) =>
                self.push(CharClass(ranges, flags)),
            ~Begin(flags) => self.push(EmptyBegin(flags)),
            ~End(flags) => self.push(EmptyEnd(flags)),
            ~WordBoundary(flags) => self.push(EmptyWordBoundary(flags)),
            ~Capture(cap, name, x) => {
                let len = self.names.len();
                if cap >= len {
                    self.names.grow(10 + cap - len, &None)
                }
                *self.names.get_mut(cap) = name;

                self.push(Save(2 * cap));
                self.compile(x);
                self.push(Save(2 * cap + 1));
            }
            ~Cat(xs) => {
                for x in xs.move_iter() {
                    self.compile(x)
                }
            }
            ~Alt(x, y) => {
                let split = self.empty_split(); // push: split 0, 0
                let j1 = self.insts.len();
                self.compile(x);                // push: insts for x
                let jmp = self.empty_jump();    // push: jmp 0
                let j2 = self.insts.len();
                self.compile(y);                // push: insts for y
                let j3 = self.insts.len();

                self.set_split(split, j1, j2);  // split 0, 0 -> split j1, j2
                self.set_jump(jmp, j3);         // jmp 0 -> jmp j3
            }
            ~Rep(x, ZeroOne, g) => {
                let split = self.empty_split();
                let j1 = self.insts.len();
                self.compile(x);
                let j2 = self.insts.len();

                if g.is_greedy() {
                    self.set_split(split, j1, j2);
                } else {
                    self.set_split(split, j2, j1);
                }
            }
            ~Rep(x, ZeroMore, g) => {
                let j1 = self.insts.len();
                let split = self.empty_split();
                let j2 = self.insts.len();
                self.compile(x);
                let jmp = self.empty_jump();
                let j3 = self.insts.len();

                self.set_jump(jmp, j1);
                if g.is_greedy() {
                    self.set_split(split, j2, j3);
                } else {
                    self.set_split(split, j3, j2);
                }
            }
            ~Rep(x, OneMore, g) => {
                let j1 = self.insts.len();
                self.compile(x);
                let split = self.empty_split();
                let j2 = self.insts.len();

                if g.is_greedy() {
                    self.set_split(split, j1, j2);
                } else {
                    self.set_split(split, j2, j1);
                }
            }
        }
    }

    /// Appends the given instruction to the program.
    #[inline]
    fn push(&mut self, x: Inst) {
        self.insts.push(x)
    }

    /// Appends an *empty* `Split` instruction to the program and returns
    /// the index of that instruction. (The index can then be used to "patch"
    /// in the actual locations of the split later.)
    #[inline]
    fn empty_split(&mut self) -> InstIdx {
        self.insts.push(Split(0, 0));
        self.insts.len() - 1
    }

    /// Sets the left and right locations of a `Split` instruction at index
    /// `i` to `pc1` and `pc2`, respectively.
    /// If the instruction at index `i` isn't a `Split` instruction, then
    /// `fail!` is called.
    #[inline]
    fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) {
        let split = self.insts.get_mut(i);
        match *split {
            Split(_, _) => *split = Split(pc1, pc2),
            _ => fail!("BUG: Invalid split index."),
        }
    }

    /// Appends an *empty* `Jump` instruction to the program and returns the
    /// index of that instruction.
    #[inline]
    fn empty_jump(&mut self) -> InstIdx {
        self.insts.push(Jump(0));
        self.insts.len() - 1
    }

    /// Sets the location of a `Jump` instruction at index `i` to `pc`.
    /// If the instruction at index `i` isn't a `Jump` instruction, then
    /// `fail!` is called.
    #[inline]
    fn set_jump(&mut self, i: InstIdx, pc: InstIdx) {
        let jmp = self.insts.get_mut(i);
        match *jmp {
            Jump(_) => *jmp = Jump(pc),
            _ => fail!("BUG: Invalid jump index."),
        }
    }
}
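To make the patching concrete, here is a hand-traced listing (an editorial
sketch, not compiler output) of what `Program::new` produces for the
expression `a|b`, following the `Alt` arm above:

    0: Save(0)                   // start of capture group 0 (the whole match)
    1: Split(2, 4)               // patched by set_split: try `a` first, then `b`
    2: OneChar('a', FLAG_EMPTY)
    3: Jump(5)                   // patched by set_jump: skip over the `b` branch
    4: OneChar('b', FLAG_EMPTY)
    5: Save(1)                   // end of capture group 0
    6: Match

With only the two implicit Save slots, `num_captures` reports 2 / 2 = 1
capture group, i.e. just the whole-match group.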
src/libregex/lib.rs (new file, 425 lines)
@@ -0,0 +1,425 @@
|
||||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! This crate provides a native implementation of regular expressions that is
|
||||
//! heavily based on RE2 both in syntax and in implementation. Notably,
|
||||
//! backreferences and arbitrary lookahead/lookbehind assertions are not
|
||||
//! provided. In return, regular expression searching provided by this package
|
||||
//! has excellent worst case performance. The specific syntax supported is
|
||||
//! documented further down.
|
||||
//!
|
||||
//! This crate's documentation provides some simple examples, describes Unicode
|
||||
//! support and exhaustively lists the supported syntax. For more specific
|
||||
//! details on the API, please see the documentation for the `Regex` type.
|
||||
//!
|
||||
//! # First example: find a date
|
||||
//!
|
||||
//! General use of regular expressions in this package involves compiling an
|
||||
//! expression and then using it to search, split or replace text. For example,
|
||||
//! to confirm that some text resembles a date:
|
||||
//!
|
||||
//! ```rust
|
||||
//! use regex::Regex;
|
||||
//! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") {
|
||||
//! Ok(re) => re,
|
||||
//! Err(err) => fail!("{}", err),
|
||||
//! };
|
||||
//! assert_eq!(re.is_match("2014-01-01"), true);
|
||||
//! ```
|
||||
//!
|
||||
//! Notice the use of the `^` and `$` anchors. In this crate, every expression
|
||||
//! is executed with an implicit `.*?` at the beginning and end, which allows
|
||||
//! it to match anywhere in the text. Anchors can be used to ensure that the
|
||||
//! full text matches an expression.
|
||||
//!
|
||||
//! This example also demonstrates the utility of raw strings in Rust, which
|
||||
//! are just like regular strings except they are prefixed with an `r` and do
|
||||
//! not process any escape sequences. For example, `"\\d"` is the same
|
||||
//! expression as `r"\d"`.
|
||||
//!
|
||||
//! # The `regex!` macro
|
||||
//!
|
||||
//! Rust's compile time meta-programming facilities provide a way to write a
|
||||
//! `regex!` macro which compiles regular expressions *when your program
|
||||
//! compiles*. Said differently, if you only use `regex!` to build regular
|
||||
//! expressions in your program, then your program cannot compile with an
|
||||
//! invalid regular expression. Moreover, the `regex!` macro compiles the
|
||||
//! given expression to native Rust code, which makes it much faster for
|
||||
//! searching text.
|
||||
//!
|
||||
//! Since `regex!` provides compiled regular expressions that are both safer
|
||||
//! and faster to use, you should use them whenever possible. The only
|
||||
//! requirement for using them is that you have a string literal corresponding
|
||||
//! to your expression. Otherwise, it is indistinguishable from an expression
|
||||
//! compiled at runtime with `Regex::new`.
|
||||
//!
|
||||
//! To use the `regex!` macro, you must enable the `phase` feature and import
|
||||
//! the `regex_macros` crate as a syntax extension:
|
||||
//!
|
||||
//! ```rust
|
||||
//! #![feature(phase)]
|
||||
//! #[phase(syntax)]
|
||||
//! extern crate regex_macros;
|
||||
//! extern crate regex;
|
||||
//!
|
||||
//! fn main() {
|
||||
//! let re = regex!(r"^\d{4}-\d{2}-\d{2}$");
|
||||
//! assert_eq!(re.is_match("2014-01-01"), true);
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! There are a few things worth mentioning about using the `regex!` macro.
|
||||
//! Firstly, the `regex!` macro *only* accepts string *literals*.
|
||||
//! Secondly, the `regex` crate *must* be linked with the name `regex` since
|
||||
//! the generated code depends on finding symbols in the `regex` crate.
|
||||
//!
|
||||
//! The only downside of using the `regex!` macro is that it can increase the
|
||||
//! size of your program's binary since it generates specialized Rust code.
|
||||
//! The extra size probably won't be significant for a small number of
|
||||
//! expressions, but 100+ calls to `regex!` will probably result in a
|
||||
//! noticeably bigger binary.
|
||||
//!
|
||||
//! # Example: iterating over capture groups
|
||||
//!
|
||||
//! This crate provides convenient iterators for matching an expression
|
||||
//! repeatedly against a search string to find successive non-overlapping
|
||||
//! matches. For example, to find all dates in a string and be able to access
|
||||
//! them by their component pieces:
|
||||
//!
|
||||
//! ```rust
|
||||
//! # #![feature(phase)]
|
||||
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
//! # fn main() {
|
||||
//! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})");
|
||||
//! let text = "2012-03-14, 2013-01-01 and 2014-07-05";
|
||||
//! for cap in re.captures_iter(text) {
|
||||
//! println!("Month: {} Day: {} Year: {}", cap.at(2), cap.at(3), cap.at(1));
|
||||
//! }
|
||||
//! // Output:
|
||||
//! // Month: 03 Day: 14 Year: 2012
|
||||
//! // Month: 01 Day: 01 Year: 2013
|
||||
//! // Month: 07 Day: 05 Year: 2014
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! Notice that the year is in the capture group indexed at `1`. This is
|
||||
//! because the *entire match* is stored in the capture group at index `0`.
|
||||
//!
|
||||
//! # Example: replacement with named capture groups
|
||||
//!
|
||||
//! Building on the previous example, perhaps we'd like to rearrange the date
|
||||
//! formats. This can be done with text replacement. But to make the code
|
||||
//! clearer, we can *name* our capture groups and use those names as variables
|
||||
//! in our replacement text:
|
||||
//!
|
||||
//! ```rust
|
||||
//! # #![feature(phase)]
|
||||
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
//! # fn main() {
|
||||
//! let re = regex!(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})");
|
||||
//! let before = "2012-03-14, 2013-01-01 and 2014-07-05";
|
||||
//! let after = re.replace_all(before, "$m/$d/$y");
|
||||
//! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014");
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! The `replace` methods are actually polymorphic in the replacement, which
|
||||
//! provides more flexibility than is seen here. (See the documentation for
|
||||
//! `Regex::replace` for more details.)
|
||||
//!
|
||||
//! # Pay for what you use
|
||||
//!
|
||||
//! With respect to searching text with a regular expression, there are three
|
||||
//! questions that can be asked:
|
||||
//!
|
||||
//! 1. Does the text match this expression?
|
||||
//! 2. If so, where does it match?
|
||||
//! 3. Where are the submatches?
|
||||
//!
|
||||
//! Generally speaking, this crate could provide a function to answer only #3,
|
||||
//! which would subsume #1 and #2 automatically. However, it can be
|
||||
//! significantly more expensive to compute the location of submatches, so it's
|
||||
//! best not to do it if you don't need to.
|
||||
//!
|
||||
//! Therefore, only use what you need. For example, don't use `find` if you
|
||||
//! only need to test if an expression matches a string. (Use `is_match`
|
||||
//! instead.)
|
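//!
//! As a quick sketch of that advice (using the `Regex` methods documented in
//! this crate; the search text here is arbitrary):
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"\d+");
//! assert!(re.is_match("abc 123"));              // cheapest: existence only
//! assert_eq!(re.find("abc 123"), Some((4, 7))); // adds the match location
//! assert_eq!(re.captures("abc 123").unwrap().at(0), "123"); // adds submatches
//! # }
//! ```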
||||
//!
|
||||
//! # Unicode
|
||||
//!
|
||||
//! This implementation executes regular expressions **only** on sequences of
|
||||
//! UTF8 codepoints while exposing match locations as byte indices.
|
||||
//!
|
||||
//! Currently, only naive case folding is supported. Namely, when matching
|
||||
//! case insensitively, the characters are first converted to their uppercase
|
||||
//! forms and then compared.
|
||||
//!
|
||||
//! Regular expressions themselves are also **only** interpreted as a sequence
|
||||
//! of UTF8 codepoints. This means you can embed Unicode characters directly
|
||||
//! into your expression:
|
||||
//!
|
||||
//! ```rust
|
||||
//! # #![feature(phase)]
|
||||
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
//! # fn main() {
|
||||
//! let re = regex!(r"(?i)Δ+");
|
||||
//! assert_eq!(re.find("ΔδΔ"), Some((0, 6)));
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! Finally, Unicode general categories and scripts are available as character
|
||||
//! classes. For example, you can match a sequence of numerals, Greek or
|
||||
//! Cherokee letters:
|
||||
//!
|
||||
//! ```rust
|
||||
//! # #![feature(phase)]
|
||||
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
//! # fn main() {
|
||||
//! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+");
|
||||
//! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23)));
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! # Syntax
|
||||
//!
|
||||
//! The syntax supported in this crate is almost in an exact correspondence
|
||||
//! with the syntax supported by RE2.
|
||||
//!
|
||||
//! ## Matching one character
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! . any character except new line (includes new line with s flag)
|
||||
//! [xyz] A character class matching either x, y or z.
|
||||
//! [^xyz] A character class matching any character except x, y and z.
|
||||
//! [a-z] A character class matching any character in range a-z.
|
||||
//! \d Perl character class ([0-9])
|
||||
//! \D Negated Perl character class ([^0-9])
|
||||
//! [:alpha:] ASCII character class ([A-Za-z])
|
||||
//! [:^alpha:] Negated ASCII character class ([^A-Za-z])
|
||||
//! \pN One letter name Unicode character class
|
||||
//! \p{Greek} Unicode character class (general category or script)
|
||||
//! \PN Negated one letter name Unicode character class
|
||||
//! \P{Greek} negated Unicode character class (general category or script)
|
||||
//! </pre>
|
||||
//!
|
||||
//! Any named character class may appear inside a bracketed `[...]` character
|
||||
//! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral
|
||||
//! character.
|
||||
//!
|
||||
//! ## Composites
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! xy concatenation (x followed by y)
|
||||
//! x|y alternation (x or y, prefer x)
|
||||
//! </pre>
|
||||
//!
|
||||
//! ## Repetitions
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! x* zero or more of x (greedy)
|
||||
//! x+ one or more of x (greedy)
|
||||
//! x? zero or one of x (greedy)
|
||||
//! x*? zero or more of x (ungreedy)
|
||||
//! x+? one or more of x (ungreedy)
|
||||
//! x?? zero or one of x (ungreedy)
|
||||
//! x{n,m} at least n x and at most m x (greedy)
|
||||
//! x{n,} at least n x (greedy)
|
||||
//! x{n} exactly n x
|
||||
//! x{n,m}? at least n x and at most m x (ungreedy)
|
||||
//! x{n,}? at least n x (ungreedy)
|
||||
//! x{n}? exactly n x
|
||||
//! </pre>
|
||||
//!
|
||||
//! ## Empty matches
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! ^ the beginning of text (or start-of-line with multi-line mode)
|
||||
//! $ the end of text (or end-of-line with multi-line mode)
|
||||
//! \A only the beginning of text (even with multi-line mode enabled)
|
||||
//! \z only the end of text (even with multi-line mode enabled)
|
||||
//! \b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
|
||||
//! \B not a Unicode word boundary
|
||||
//! </pre>
|
||||
//!
|
||||
//! ## Grouping and flags
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! (exp) numbered capture group (indexed by opening parenthesis)
|
||||
//! (?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
|
||||
//! (?:exp) non-capturing group
|
||||
//! (?flags) set flags within current group
|
||||
//! (?flags:exp) set flags for exp (non-capturing)
|
||||
//! </pre>
|
||||
//!
|
||||
//! Flags are each a single character. For example, `(?x)` sets the flag `x`
|
||||
//! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
|
||||
//! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
|
||||
//! the `x` flag and clears the `y` flag.
|
||||
//!
|
||||
//! All flags are by default disabled. They are:
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! i case insensitive
|
||||
//! m multi-line mode: ^ and $ match begin/end of line
|
||||
//! s allow . to match \n
|
||||
//! U swap the meaning of x* and x*?
|
||||
//! </pre>
|
||||
//!
|
||||
//! Here's an example that matches case insensitively for only part of the
|
||||
//! expression:
|
||||
//!
|
||||
//! ```rust
|
||||
//! # #![feature(phase)]
|
||||
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
//! # fn main() {
|
||||
//! let re = regex!(r"(?i)a+(?-i)b+");
|
||||
//! let cap = re.captures("AaAaAbbBBBb").unwrap();
|
||||
//! assert_eq!(cap.at(0), "AaAaAbb");
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
|
||||
//! `b`.
|
||||
//!
|
||||
//! ## Escape sequences
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! \* literal *, works for any punctuation character: \.+*?()|[]{}^$
|
||||
//! \a bell (\x07)
|
||||
//! \f form feed (\x0C)
|
||||
//! \t horizontal tab
|
||||
//! \n new line
|
||||
//! \r carriage return
|
||||
//! \v vertical tab (\x0B)
|
||||
//! \123 octal character code (up to three digits)
|
||||
//! \x7F hex character code (exactly two digits)
|
||||
//! \x{10FFFF} any hex character code corresponding to a valid UTF8 codepoint
|
||||
//! </pre>
|
||||
//!
|
||||
//! ## Perl character classes (Unicode friendly)
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! \d digit ([0-9] + \p{Nd})
|
||||
//! \D not digit
|
||||
//! \s whitespace ([\t\n\f\r ] + \p{Z})
|
||||
//! \S not whitespace
|
||||
//! \w word character ([0-9A-Za-z_] + \p{L})
|
||||
//! \W not word character
|
||||
//! </pre>
|
||||
//!
|
||||
//! ## ASCII character classes
|
||||
//!
|
||||
//! <pre class="rust">
|
||||
//! [:alnum:] alphanumeric ([0-9A-Za-z])
|
||||
//! [:alpha:] alphabetic ([A-Za-z])
|
||||
//! [:ascii:] ASCII ([\x00-\x7F])
|
||||
//! [:blank:] blank ([\t ])
|
||||
//! [:cntrl:] control ([\x00-\x1F\x7F])
|
||||
//! [:digit:] digits ([0-9])
|
||||
//! [:graph:] graphical ([!-~])
|
||||
//! [:lower:] lower case ([a-z])
|
||||
//! [:print:] printable ([ -~])
|
||||
//! [:punct:] punctuation ([!-/:-@[-`{-~])
|
||||
//! [:space:] whitespace ([\t\n\v\f\r ])
|
||||
//! [:upper:] upper case ([A-Z])
|
||||
//! [:word:] word characters ([0-9A-Za-z_])
|
||||
//! [:xdigit:] hex digit ([0-9A-Fa-f])
|
||||
//! </pre>
|
||||
//!
|
||||
//! # Untrusted input
|
||||
//!
|
||||
//! There are two factors to consider here: untrusted regular expressions and
|
||||
//! untrusted search text.
|
||||
//!
|
||||
//! Currently, there are no counter-measures in place to prevent a malicious
|
||||
//! user from writing an expression that may use a lot of resources. One such
|
||||
//! example is to repeat counted repetitions: `((a{100}){100}){100}` will try
|
||||
//! to repeat the `a` instruction `100^3` times. Essentially, this means it's
|
||||
//! very easy for an attacker to exhaust your system's memory if they are
|
||||
//! allowed to execute arbitrary regular expressions. A possible solution to
|
||||
//! this is to impose a hard limit on the size of a compiled expression, but it
|
||||
//! does not yet exist.
|
||||
//!
|
||||
//! The story is a bit better with untrusted search text, since this crate's
|
||||
//! implementation provides `O(nm)` search where `n` is the number of
|
||||
//! characters in the search text and `m` is the number of instructions in a
|
||||
//! compiled expression.
|
||||
|
||||
#![crate_id = "regex#0.11-pre"]
|
||||
#![crate_type = "rlib"]
|
||||
#![crate_type = "dylib"]
|
||||
#![experimental]
|
||||
#![license = "MIT/ASL2"]
|
||||
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
|
||||
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
|
||||
html_root_url = "http://static.rust-lang.org/doc/master")]
|
||||
|
||||
#![feature(macro_rules, phase)]
|
||||
#![deny(missing_doc)]
|
||||
|
||||
extern crate collections;
|
||||
#[cfg(test)]
|
||||
extern crate stdtest = "test";
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
|
||||
// During tests, this links with the `regex` crate so that the `regex!` macro
|
||||
// can be tested.
|
||||
#[cfg(test)]
|
||||
extern crate regex;
|
||||
|
||||
pub use parse::Error;
|
||||
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
|
||||
pub use re::{FindCaptures, FindMatches};
|
||||
pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN};
|
||||
pub use re::{quote, is_match};
|
||||
|
||||
mod compile;
|
||||
mod parse;
|
||||
mod re;
|
||||
mod vm;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
|
||||
/// The `program` module exists to support the `regex!` macro. Do not use.
|
||||
#[doc(hidden)]
|
||||
pub mod native {
|
||||
// Exporting this stuff is bad form, but it's necessary for two reasons.
|
||||
// Firstly, the `regex!` syntax extension is in a different crate and
|
||||
// requires access to the representation of a regex (particularly the
|
||||
// instruction set) in order to compile to native Rust. This could be
|
||||
// mitigated if `regex!` was defined in the same crate, but this has
|
||||
// undesirable consequences (such as requiring a dependency on
|
||||
// `libsyntax`).
|
||||
//
|
||||
// Secondly, the code generated by `regex!` must *also* be able
|
||||
// to access various functions in this crate to reduce code duplication
|
||||
// and to provide a value with precisely the same `Regex` type in this
|
||||
// crate. This, AFAIK, is impossible to mitigate.
|
||||
//
|
||||
// On the bright side, `rustdoc` lets us hide this from the public API
|
||||
// documentation.
|
||||
pub use compile::{
|
||||
Program,
|
||||
OneChar, CharClass, Any, Save, Jump, Split,
|
||||
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
|
||||
};
|
||||
pub use parse::{
|
||||
FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL,
|
||||
FLAG_SWAP_GREED, FLAG_NEGATED,
|
||||
};
|
||||
pub use re::{Dynamic, Native};
|
||||
pub use vm::{
|
||||
MatchKind, Exists, Location, Submatches,
|
||||
StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
|
||||
CharReader, find_prefix,
|
||||
};
|
||||
}
|
src/libregex/parse.rs (new file, 1028 lines)
(File diff suppressed because it is too large.)

src/libregex/re.rs (new file, 870 lines)
@@ -0,0 +1,870 @@
|
||||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::from_str::from_str;
|
||||
use std::str::{MaybeOwned, Owned, Slice};
|
||||
|
||||
use compile::Program;
|
||||
use parse;
|
||||
use vm;
|
||||
use vm::{CaptureLocs, MatchKind, Exists, Location, Submatches};
|
||||
|
||||
/// Escapes all regular expression meta characters in `text` so that it may be
|
||||
/// safely used in a regular expression as a literal string.
|
||||
pub fn quote(text: &str) -> ~str {
|
||||
let mut quoted = StrBuf::with_capacity(text.len());
|
||||
for c in text.chars() {
|
||||
if parse::is_punct(c) {
|
||||
quoted.push_char('\\')
|
||||
}
|
||||
quoted.push_char(c);
|
||||
}
|
||||
quoted.into_owned()
|
||||
}
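// A quick illustration of `quote` (an editorial sketch; the input string is
// arbitrary, and `.` is one of the punctuation characters recognized by
// `parse::is_punct`):
//
//     quote("1.2.3")  // => "1\.2\.3"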
|
||||
|
||||
/// Tests if the given regular expression matches somewhere in the text given.
|
||||
///
|
||||
/// If there was a problem compiling the regular expression, an error is
|
||||
/// returned.
|
||||
///
|
||||
/// To find submatches, split or replace text, you'll need to compile an
|
||||
/// expression first.
|
||||
///
|
||||
/// Note that you should prefer the `regex!` macro when possible. For example,
|
||||
/// `regex!("...").is_match("...")`.
|
||||
pub fn is_match(regex: &str, text: &str) -> Result<bool, parse::Error> {
|
||||
Regex::new(regex).map(|r| r.is_match(text))
|
||||
}
|
||||
|
||||
/// Regex is a compiled regular expression, represented as either a sequence
|
||||
/// of bytecode instructions (dynamic) or as a specialized Rust function
|
||||
/// (native). It can be used to search, split
|
||||
/// or replace text. All searching is done with an implicit `.*?` at the
|
||||
/// beginning and end of an expression. To force an expression to match the
|
||||
/// whole string (or a prefix or a suffix), you must use an anchor like `^` or
|
||||
/// `$` (or `\A` and `\z`).
|
||||
///
|
||||
/// While this crate will handle Unicode strings (whether in the regular
|
||||
/// expression or in the search text), all positions returned are **byte
|
||||
/// indices**. Every byte index is guaranteed to be at a UTF8 codepoint
|
||||
/// boundary.
|
||||
///
|
||||
/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
|
||||
/// compiled regular expression and text to search, respectively.
|
||||
///
|
||||
/// The only methods that allocate new strings are the string replacement
|
||||
/// methods. All other methods (searching and splitting) return borrowed
|
||||
/// pointers into the string given.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Find the location of a US phone number:
|
||||
///
|
||||
/// ```rust
|
||||
/// # use regex::Regex;
|
||||
/// let re = match Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}") {
|
||||
/// Ok(re) => re,
|
||||
/// Err(err) => fail!("{}", err),
|
||||
/// };
|
||||
/// assert_eq!(re.find("phone: 111-222-3333"), Some((7, 19)));
|
||||
/// ```
|
||||
///
|
||||
/// You can also use the `regex!` macro to compile a regular expression when
|
||||
/// you compile your program:
|
||||
///
|
||||
/// ```rust
|
||||
/// #![feature(phase)]
|
||||
/// extern crate regex;
|
||||
/// #[phase(syntax)] extern crate regex_macros;
|
||||
///
|
||||
/// fn main() {
|
||||
/// let re = regex!(r"\d+");
|
||||
/// assert_eq!(re.find("123 abc"), Some((0, 3)));
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// Given an incorrect regular expression, `regex!` will cause the Rust
|
||||
/// compiler to produce a compile time error.
|
||||
/// Note that `regex!` will compile the expression to native Rust code, which
|
||||
/// makes it much faster when searching text.
|
||||
/// More details about the `regex!` macro can be found in the `regex` crate
|
||||
/// documentation.
|
||||
#[deriving(Clone)]
|
||||
#[allow(visible_private_types)]
|
||||
pub struct Regex {
|
||||
/// The representation of `Regex` is exported to support the `regex!`
|
||||
/// syntax extension. Do not rely on it.
|
||||
///
|
||||
/// See the comments for the `program` module in `lib.rs` for a more
|
||||
/// detailed explanation for what `regex!` requires.
|
||||
#[doc(hidden)]
|
||||
pub original: ~str,
|
||||
#[doc(hidden)]
|
||||
pub names: ~[Option<~str>],
|
||||
#[doc(hidden)]
|
||||
pub p: MaybeNative,
|
||||
}
|
||||
|
||||
impl fmt::Show for Regex {
|
||||
/// Shows the original regular expression.
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f.buf, "{}", self.original)
|
||||
}
|
||||
}
|
||||
|
||||
pub enum MaybeNative {
|
||||
Dynamic(Program),
|
||||
Native(fn(MatchKind, &str, uint, uint) -> Vec<Option<uint>>),
|
||||
}
|
||||
|
||||
impl Clone for MaybeNative {
|
||||
fn clone(&self) -> MaybeNative {
|
||||
match *self {
|
||||
Dynamic(ref p) => Dynamic(p.clone()),
|
||||
Native(fp) => Native(fp),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Regex {
|
||||
/// Compiles a dynamic regular expression. Once compiled, it can be
|
||||
/// used repeatedly to search, split or replace text in a string.
|
||||
///
|
||||
/// When possible, you should prefer the `regex!` macro since it is
|
||||
/// safer and always faster.
|
||||
///
|
||||
/// If an invalid expression is given, then an error is returned.
|
||||
pub fn new(re: &str) -> Result<Regex, parse::Error> {
|
||||
let ast = try!(parse::parse(re));
|
||||
let (prog, names) = Program::new(ast);
|
||||
Ok(Regex { original: re.to_owned(), names: names, p: Dynamic(prog) })
|
||||
}
|
||||
|
||||
/// Returns true if and only if the regex matches the string given.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Test if some text contains at least one word with exactly 13
|
||||
/// characters:
|
||||
///
|
||||
/// ```rust
|
||||
/// # #![feature(phase)]
|
||||
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
/// # fn main() {
|
||||
/// let text = "I categorically deny having triskaidekaphobia.";
|
||||
/// let matched = regex!(r"\b\w{13}\b").is_match(text);
|
||||
/// assert!(matched);
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn is_match(&self, text: &str) -> bool {
|
||||
has_match(&exec(self, Exists, text))
|
||||
}
|
||||
|
||||
/// Returns the start and end byte range of the leftmost-first match in
|
||||
/// `text`. If no match exists, then `None` is returned.
|
||||
///
|
||||
/// Note that this should only be used if you want to discover the position
|
||||
/// of the match. Testing the existence of a match is faster if you use
|
||||
/// `is_match`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Find the start and end location of every word with exactly 13
|
||||
/// characters:
|
||||
///
|
||||
/// ```rust
|
||||
/// # #![feature(phase)]
|
||||
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
/// # fn main() {
|
||||
/// let text = "I categorically deny having triskaidekaphobia.";
|
||||
/// let pos = regex!(r"\b\w{13}\b").find(text);
|
||||
/// assert_eq!(pos, Some((2, 15)));
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn find(&self, text: &str) -> Option<(uint, uint)> {
|
||||
let caps = exec(self, Location, text);
|
||||
if has_match(&caps) {
|
||||
Some((caps.get(0).unwrap(), caps.get(1).unwrap()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator for each successive non-overlapping match in
|
||||
/// `text`, returning the start and end byte indices with respect to
|
||||
/// `text`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Find the start and end location of the first word with exactly 13
|
||||
/// characters:
|
||||
///
|
||||
/// ```rust
|
||||
/// # #![feature(phase)]
|
||||
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
/// # fn main() {
|
||||
/// let text = "Retroactively relinquishing remunerations is reprehensible.";
|
||||
/// for pos in regex!(r"\b\w{13}\b").find_iter(text) {
|
||||
/// println!("{}", pos);
|
||||
/// }
|
||||
/// // Output:
|
||||
/// // (0, 13)
|
||||
/// // (14, 27)
|
||||
/// // (28, 41)
|
||||
/// // (45, 58)
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
search: text,
|
||||
last_end: 0,
|
||||
last_match: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the capture groups corresponding to the leftmost-first
|
||||
/// match in `text`. Capture group `0` always corresponds to the entire
|
||||
/// match. If no match is found, then `None` is returned.
|
||||
///
|
||||
/// You should only use `captures` if you need access to submatches.
|
||||
/// Otherwise, `find` is faster for discovering the location of the overall
|
||||
/// match.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Say you have some text with movie names and their release years,
|
||||
/// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
|
||||
/// looking like that, while also extracting the movie name and its release
|
||||
/// year separately.
|
||||
///
|
||||
/// ```rust
|
||||
/// # #![feature(phase)]
|
||||
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
/// # fn main() {
|
||||
/// let re = regex!(r"'([^']+)'\s+\((\d{4})\)");
|
||||
/// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
|
||||
/// let caps = re.captures(text).unwrap();
|
||||
/// assert_eq!(caps.at(1), "Citizen Kane");
|
||||
/// assert_eq!(caps.at(2), "1941");
|
||||
/// assert_eq!(caps.at(0), "'Citizen Kane' (1941)");
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Note that the full match is at capture group `0`. Each subsequent
|
||||
/// capture group is indexed by the order of its opening `(`.
|
||||
///
|
||||
/// We can make this example a bit clearer by using *named* capture groups:
|
||||
///
|
||||
/// ```rust
|
||||
/// # #![feature(phase)]
|
||||
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
/// # fn main() {
|
||||
/// let re = regex!(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
|
||||
/// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
|
||||
/// let caps = re.captures(text).unwrap();
|
||||
/// assert_eq!(caps.name("title"), "Citizen Kane");
|
||||
/// assert_eq!(caps.name("year"), "1941");
|
||||
/// assert_eq!(caps.at(0), "'Citizen Kane' (1941)");
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// Here we name the capture groups, which we can access with the `name`
|
||||
/// method. Note that the named capture groups are still accessible with
|
||||
/// `at`.
|
||||
///
|
||||
/// The `0`th capture group is always unnamed, so it must always be
|
||||
/// accessed with `at(0)`.
|
||||
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
|
||||
let caps = exec(self, Submatches, text);
|
||||
Captures::new(self, text, caps)
|
||||
}
|
||||
|
||||
/// Returns an iterator over all the non-overlapping capture groups matched
|
||||
/// in `text`. This is operationally the same as `find_iter` (except it
|
||||
/// yields information about submatches).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// We can use this to find all movie titles and their release years in
|
||||
/// some text, where the movie is formatted like "'Title' (xxxx)":
|
||||
///
|
||||
/// ```rust
|
||||
/// # #![feature(phase)]
|
||||
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
|
||||
/// # fn main() {
|
||||
/// let re = regex!(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
|
||||
/// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
|
||||
/// for caps in re.captures_iter(text) {
|
||||
/// println!("Movie: {}, Released: {}", caps.name("title"), caps.name("year"));
|
||||
/// }
|
||||
/// // Output:
|
||||
/// // Movie: Citizen Kane, Released: 1941
|
||||
/// // Movie: The Wizard of Oz, Released: 1939
|
||||
/// // Movie: M, Released: 1931
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn captures_iter<'r, 't>(&'r self, text: &'t str)
|
||||
-> FindCaptures<'r, 't> {
|
||||
        FindCaptures {
            re: self,
            search: text,
            last_match: None,
            last_end: 0,
        }
    }

    /// Returns an iterator of substrings of `text` delimited by a match
    /// of the regular expression.
    /// Namely, each element of the iterator corresponds to text that *isn't*
    /// matched by the regular expression.
    ///
    /// This method will *not* copy the text given.
    ///
    /// # Example
    ///
    /// To split a string delimited by arbitrary amounts of spaces or tabs:
    ///
    /// ```rust
    /// # #![feature(phase)]
    /// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
    /// # fn main() {
    /// let re = regex!(r"[ \t]+");
    /// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
    /// assert_eq!(fields, vec!("a", "b", "c", "d", "e"));
    /// # }
    /// ```
    pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> {
        RegexSplits {
            finder: self.find_iter(text),
            last: 0,
        }
    }

    /// Returns an iterator of at most `limit` substrings of `text` delimited
    /// by a match of the regular expression. (A `limit` of `0` will return no
    /// substrings.)
    /// Namely, each element of the iterator corresponds to text that *isn't*
    /// matched by the regular expression.
    /// The remainder of the string that is not split will be the last element
    /// in the iterator.
    ///
    /// This method will *not* copy the text given.
    ///
    /// # Example
    ///
    /// Get the first two words in some text:
    ///
    /// ```rust
    /// # #![feature(phase)]
    /// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
    /// # fn main() {
    /// let re = regex!(r"\W+");
    /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
    /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
    /// # }
    /// ```
    pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: uint)
                         -> RegexSplitsN<'r, 't> {
        RegexSplitsN {
            splits: self.split(text),
            cur: 0,
            limit: limit,
        }
    }

    /// Replaces the leftmost-first match with the replacement provided.
    /// The replacement can be a regular string (where `$N` and `$name` are
    /// expanded to match capture groups) or a function that takes the matches'
    /// `Captures` and returns the replaced string.
    ///
    /// If no match is found, then a copy of the string is returned unchanged.
    ///
    /// # Examples
    ///
    /// Note that this function is polymorphic with respect to the replacement.
    /// In typical usage, this can just be a normal string:
    ///
    /// ```rust
    /// # #![feature(phase)]
    /// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
    /// # fn main() {
    /// let re = regex!("[^01]+");
    /// assert_eq!(re.replace("1078910", "").as_slice(), "1010");
    /// # }
    /// ```
    ///
    /// But anything satisfying the `Replacer` trait will work. For example,
    /// a closure of type `|&Captures| -> ~str` provides direct access to the
    /// captures corresponding to a match. This allows one to access
    /// submatches easily:
    ///
    /// ```rust
    /// # #![feature(phase)]
    /// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
    /// # use regex::Captures; fn main() {
    /// let re = regex!(r"([^,\s]+),\s+(\S+)");
    /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
    ///     format!("{} {}", caps.at(2), caps.at(1))
    /// });
    /// assert_eq!(result.as_slice(), "Bruce Springsteen");
    /// # }
    /// ```
    ///
    /// But this is a bit cumbersome to use all the time. Instead, a simple
    /// syntax is supported that expands `$name` into the corresponding capture
    /// group. Here's the last example, but using this expansion technique
    /// with named capture groups:
    ///
    /// ```rust
    /// # #![feature(phase)]
    /// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
    /// # fn main() {
    /// let re = regex!(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)");
    /// let result = re.replace("Springsteen, Bruce", "$first $last");
    /// assert_eq!(result.as_slice(), "Bruce Springsteen");
    /// # }
    /// ```
    ///
    /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
    /// would produce the same result. To write a literal `$` use `$$`.
    ///
    /// Finally, sometimes you just want to replace a literal string with no
    /// submatch expansion. This can be done by wrapping a string with
    /// `NoExpand`:
    ///
    /// ```rust
    /// # #![feature(phase)]
    /// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
    /// # fn main() {
    /// use regex::NoExpand;
    ///
    /// let re = regex!(r"(?P<last>[^,\s]+),\s+(\S+)");
    /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
    /// assert_eq!(result.as_slice(), "$2 $last");
    /// # }
    /// ```
    pub fn replace<R: Replacer>(&self, text: &str, rep: R) -> StrBuf {
        self.replacen(text, 1, rep)
    }

    /// Replaces all non-overlapping matches in `text` with the
    /// replacement provided. This is the same as calling `replacen` with
    /// `limit` set to `0`.
    ///
    /// See the documentation for `replace` for details on how to access
    /// submatches in the replacement string.
    pub fn replace_all<R: Replacer>(&self, text: &str, rep: R) -> StrBuf {
        self.replacen(text, 0, rep)
    }

    /// Replaces at most `limit` non-overlapping matches in `text` with the
    /// replacement provided. If `limit` is 0, then all non-overlapping matches
    /// are replaced.
    ///
    /// See the documentation for `replace` for details on how to access
    /// submatches in the replacement string.
    pub fn replacen<R: Replacer>
                   (&self, text: &str, limit: uint, mut rep: R) -> StrBuf {
        let mut new = StrBuf::with_capacity(text.len());
        let mut last_match = 0u;
        let mut i = 0;
        for cap in self.captures_iter(text) {
            // It'd be nicer to use the 'take' iterator instead, but it seemed
            // awkward given that '0' => no limit.
            if limit > 0 && i >= limit {
                break
            }
            i += 1;

            let (s, e) = cap.pos(0).unwrap(); // captures only reports matches
            new.push_str(text.slice(last_match, s));
            new.push_str(rep.reg_replace(&cap).as_slice());
            last_match = e;
        }
        new.append(text.slice(last_match, text.len()))
    }
}
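
// An illustrative sketch, not part of the original patch: `replace_all` and
// `replacen` are documented above without doc-tests, so this hypothetical
// test shows the intended difference (replace every match vs. stop after
// `limit` matches). It uses the dynamic `Regex::new` constructor from this
// module rather than the `regex!` macro.
#[test]
fn _sketch_replace_all_and_replacen() {
    let re = Regex::new(r"\d+").unwrap();
    // `replace_all` rewrites every non-overlapping match.
    assert_eq!(re.replace_all("a1b22c333", "#").as_slice(), "a#b#c#");
    // `replacen` stops after `limit` replacements (here, the first two).
    assert_eq!(re.replacen("a1b22c333", 2, "#").as_slice(), "a#b#c333");
}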

/// NoExpand indicates literal string replacement.
///
/// It can be used with `replace` and `replace_all` to do a literal
/// string replacement without expanding `$name` to the corresponding
/// capture groups.
///
/// `'t` is the lifetime of the literal text.
pub struct NoExpand<'t>(pub &'t str);

/// Replacer describes types that can be used to replace matches in a string.
pub trait Replacer {
    /// Returns a possibly owned string that is used to replace the match
    /// corresponding to the `caps` capture group.
    ///
    /// The `'a` lifetime refers to the lifetime of a borrowed string when
    /// a new owned string isn't needed (e.g., for `NoExpand`).
    fn reg_replace<'a>(&'a mut self, caps: &Captures) -> MaybeOwned<'a>;
}

impl<'t> Replacer for NoExpand<'t> {
    fn reg_replace<'a>(&'a mut self, _: &Captures) -> MaybeOwned<'a> {
        let NoExpand(s) = *self;
        Slice(s)
    }
}

impl<'t> Replacer for &'t str {
    fn reg_replace<'a>(&'a mut self, caps: &Captures) -> MaybeOwned<'a> {
        Owned(caps.expand(*self).into_owned())
    }
}

impl<'a> Replacer for |&Captures|: 'a -> ~str {
    fn reg_replace<'r>(&'r mut self, caps: &Captures) -> MaybeOwned<'r> {
        Owned((*self)(caps).into_owned())
    }
}
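
// An illustrative sketch, not part of the original patch: any type can plug
// into `replace`/`replace_all` by implementing `Replacer` itself. The
// hypothetical `Censor` type below ignores the captures and substitutes a
// fixed marker, much like `NoExpand`, reusing the `Slice` variant already in
// scope in this module.
struct Censor;

impl Replacer for Censor {
    fn reg_replace<'a>(&'a mut self, _: &Captures) -> MaybeOwned<'a> {
        // A borrowed 'static string is fine here; no allocation is needed.
        Slice("[censored]")
    }
}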

/// Yields all substrings delimited by a regular expression match.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the string being split.
pub struct RegexSplits<'r, 't> {
    finder: FindMatches<'r, 't>,
    last: uint,
}

impl<'r, 't> Iterator<&'t str> for RegexSplits<'r, 't> {
    fn next(&mut self) -> Option<&'t str> {
        let text = self.finder.search;
        match self.finder.next() {
            None => {
                if self.last >= text.len() {
                    None
                } else {
                    let s = text.slice(self.last, text.len());
                    self.last = text.len();
                    Some(s)
                }
            }
            Some((s, e)) => {
                let matched = text.slice(self.last, s);
                self.last = e;
                Some(matched)
            }
        }
    }
}

/// Yields at most `N` substrings delimited by a regular expression match.
///
/// The last substring will be whatever remains after splitting.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the string being split.
pub struct RegexSplitsN<'r, 't> {
    splits: RegexSplits<'r, 't>,
    cur: uint,
    limit: uint,
}

impl<'r, 't> Iterator<&'t str> for RegexSplitsN<'r, 't> {
    fn next(&mut self) -> Option<&'t str> {
        let text = self.splits.finder.search;
        if self.cur >= self.limit {
            None
        } else {
            self.cur += 1;
            if self.cur >= self.limit {
                Some(text.slice(self.splits.last, text.len()))
            } else {
                self.splits.next()
            }
        }
    }
}
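
// An illustrative sketch, not part of the original patch: with the iterator
// above, a `limit` of `1` never consumes a match at all, so the single
// yielded element is the entire, unsplit input. A hypothetical check:
#[test]
fn _sketch_splitn_limit_one() {
    let re = Regex::new(r",").unwrap();
    let fields: Vec<&str> = re.splitn("a,b,c", 1).collect();
    assert_eq!(fields, vec!("a,b,c"));
}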

/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex.
/// If a capture group is named, then the matched string is *also* available
/// via the `name` method. (Note that the 0th capture is always unnamed and so
/// must be accessed with the `at` method.)
///
/// Positions returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
    text: &'t str,
    locs: CaptureLocs,
    named: Option<HashMap<~str, uint>>,
}

impl<'t> Captures<'t> {
    fn new(re: &Regex, search: &'t str, locs: CaptureLocs)
          -> Option<Captures<'t>> {
        if !has_match(&locs) {
            return None
        }

        let named =
            if re.names.len() == 0 {
                None
            } else {
                let mut named = HashMap::new();
                for (i, name) in re.names.iter().enumerate() {
                    match name {
                        &None => {},
                        &Some(ref name) => {
                            named.insert(name.to_owned(), i);
                        }
                    }
                }
                Some(named)
            };
        Some(Captures {
            text: search,
            locs: locs,
            named: named,
        })
    }

    /// Returns the start and end positions of the Nth capture group.
    /// Returns `None` if `i` is not a valid capture group or if the capture
    /// group did not match anything.
    /// The positions returned are *always* byte indices with respect to the
    /// original string matched.
    pub fn pos(&self, i: uint) -> Option<(uint, uint)> {
        let (s, e) = (i * 2, i * 2 + 1);
        if e >= self.locs.len() || self.locs.get(s).is_none() {
            // VM guarantees that each pair of locations are both Some or None.
            return None
        }
        Some((self.locs.get(s).unwrap(), self.locs.get(e).unwrap()))
    }

    /// Returns the matched string for the capture group `i`.
    /// If `i` isn't a valid capture group or didn't match anything, then the
    /// empty string is returned.
    pub fn at(&self, i: uint) -> &'t str {
        match self.pos(i) {
            None => "",
            Some((s, e)) => {
                self.text.slice(s, e)
            }
        }
    }

    /// Returns the matched string for the capture group named `name`.
    /// If `name` isn't a valid capture group or didn't match anything, then
    /// the empty string is returned.
    pub fn name(&self, name: &str) -> &'t str {
        match self.named {
            None => "",
            Some(ref h) => {
                match h.find_equiv(&name) {
                    None => "",
                    Some(i) => self.at(*i),
                }
            }
        }
    }

    /// Creates an iterator of all the capture groups in order of appearance
    /// in the regular expression.
    pub fn iter(&'t self) -> SubCaptures<'t> {
        SubCaptures { idx: 0, caps: self, }
    }

    /// Creates an iterator of all the capture group positions in order of
    /// appearance in the regular expression. Positions are byte indices
    /// in terms of the original string matched.
    pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
        SubCapturesPos { idx: 0, caps: self, }
    }

    /// Expands all instances of `$name` in `text` to the corresponding capture
    /// group `name`.
    ///
    /// `name` may be an integer corresponding to the index of the
    /// capture group (counted by order of opening parenthesis where `0` is the
    /// entire match) or it can be a name (consisting of letters, digits or
    /// underscores) corresponding to a named capture group.
    ///
    /// If `name` isn't a valid capture group (whether the name doesn't exist or
    /// isn't a valid index), then it is replaced with the empty string.
    ///
    /// To write a literal `$` use `$$`.
    pub fn expand(&self, text: &str) -> StrBuf {
        // How evil can you get?
        // FIXME: Don't use regexes for this. It's completely unnecessary.
        let re = Regex::new(r"(^|[^$]|\b)\$(\w+)").unwrap();
        let text = re.replace_all(text, |refs: &Captures| -> ~str {
            let (pre, name) = (refs.at(1), refs.at(2));
            pre + match from_str::<uint>(name) {
                None => self.name(name).to_owned(),
                Some(i) => self.at(i).to_owned(),
            }
        });
        let re = Regex::new(r"\$\$").unwrap();
        re.replace_all(text.as_slice(), NoExpand("$"))
    }
}
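
// An illustrative sketch, not part of the original patch: exercising the
// accessors documented above. Group 0 is the whole match, `at`/`name` return
// `""` for groups that did not participate, and `expand` performs the same
// `$name` substitution used by `replace`. The pattern and text here are
// hypothetical.
#[test]
fn _sketch_captures_accessors() {
    let re = Regex::new(r"(?P<key>\w+)=(?P<val>\w+)").unwrap();
    let caps = re.captures("lang=rust").unwrap();
    assert_eq!(caps.at(0), "lang=rust");
    assert_eq!(caps.at(1), "lang");
    assert_eq!(caps.name("val"), "rust");
    assert_eq!(caps.pos(2), Some((5u, 9u)));
    assert_eq!(caps.expand("$val ($key)").as_slice(), "rust (lang)");
}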

impl<'t> Container for Captures<'t> {
    /// Returns the number of captured groups.
    #[inline]
    fn len(&self) -> uint {
        self.locs.len() / 2
    }
}
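
// An illustrative sketch, not part of the original patch, assuming the
// location table stores one (start, end) pair per group *including* group 0
// (the whole match): a pattern with two explicit groups then reports a
// length of 3.
#[test]
fn _sketch_captures_len() {
    let re = Regex::new(r"(a)(b)").unwrap();
    let caps = re.captures("ab").unwrap();
    assert_eq!(caps.len(), 3);
}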

/// An iterator over capture groups for a particular match of a regular
/// expression.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCaptures<'t> {
    idx: uint,
    caps: &'t Captures<'t>,
}

impl<'t> Iterator<&'t str> for SubCaptures<'t> {
    fn next(&mut self) -> Option<&'t str> {
        if self.idx < self.caps.len() {
            self.idx += 1;
            Some(self.caps.at(self.idx - 1))
        } else {
            None
        }
    }
}

/// An iterator over capture group positions for a particular match of a
/// regular expression.
///
/// Positions are byte indices in terms of the original string matched.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCapturesPos<'t> {
    idx: uint,
    caps: &'t Captures<'t>,
}

impl<'t> Iterator<Option<(uint, uint)>> for SubCapturesPos<'t> {
    fn next(&mut self) -> Option<Option<(uint, uint)>> {
        if self.idx < self.caps.len() {
            self.idx += 1;
            Some(self.caps.pos(self.idx - 1))
        } else {
            None
        }
    }
}
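
// An illustrative sketch, not part of the original patch: `iter` walks the
// matched text of every group (group 0 first), while `iter_pos` yields the
// corresponding byte ranges (`None` for groups that did not participate).
// The pattern and text below are hypothetical.
#[test]
fn _sketch_sub_captures_iters() {
    let re = Regex::new(r"(\w+)@(\w+)").unwrap();
    let caps = re.captures("user@host").unwrap();
    let strs: Vec<&str> = caps.iter().collect();
    assert_eq!(strs, vec!("user@host", "user", "host"));
    let pos: Vec<Option<(uint, uint)>> = caps.iter_pos().collect();
    assert_eq!(pos, vec!(Some((0u, 9u)), Some((0u, 4u)), Some((5u, 9u))));
}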

/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression. The iterator stops when no more matches can
/// be found.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the matched string.
pub struct FindCaptures<'r, 't> {
    re: &'r Regex,
    search: &'t str,
    last_match: Option<uint>,
    last_end: uint,
}

impl<'r, 't> Iterator<Captures<'t>> for FindCaptures<'r, 't> {
    fn next(&mut self) -> Option<Captures<'t>> {
        if self.last_end > self.search.len() {
            return None
        }

        let caps = exec_slice(self.re, Submatches, self.search,
                              self.last_end, self.search.len());
        let (s, e) =
            if !has_match(&caps) {
                return None
            } else {
                (caps.get(0).unwrap(), caps.get(1).unwrap())
            };

        // Don't accept empty matches immediately following a match.
        // i.e., no infinite loops please.
        if e - s == 0 && Some(self.last_end) == self.last_match {
            self.last_end += 1;
            return self.next()
        }
        self.last_end = e;
        self.last_match = Some(self.last_end);
        Captures::new(self.re, self.search, caps)
    }
}
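
// An illustrative sketch, not part of the original patch: `captures_iter`
// (which returns the `FindCaptures` iterator above) yields one `Captures`
// value per non-overlapping match, so submatches can be pulled out of each.
// The pattern and text below are hypothetical.
#[test]
fn _sketch_captures_iter() {
    let re = Regex::new(r"(\w+)=(\d+)").unwrap();
    let pairs: Vec<(&str, &str)> = re.captures_iter("a=1, b=22")
                                     .map(|c| (c.at(1), c.at(2)))
                                     .collect();
    assert_eq!(pairs, vec!(("a", "1"), ("b", "22")));
}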

/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a tuple of integers corresponding to the start and end
/// of the match. The indices are byte offsets. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the matched string.
pub struct FindMatches<'r, 't> {
    re: &'r Regex,
    search: &'t str,
    last_match: Option<uint>,
    last_end: uint,
}

impl<'r, 't> Iterator<(uint, uint)> for FindMatches<'r, 't> {
    fn next(&mut self) -> Option<(uint, uint)> {
        if self.last_end > self.search.len() {
            return None
        }

        let caps = exec_slice(self.re, Location, self.search,
                              self.last_end, self.search.len());
        let (s, e) =
            if !has_match(&caps) {
                return None
            } else {
                (caps.get(0).unwrap(), caps.get(1).unwrap())
            };

        // Don't accept empty matches immediately following a match.
        // i.e., no infinite loops please.
        if e - s == 0 && Some(self.last_end) == self.last_match {
            self.last_end += 1;
            return self.next()
        }
        self.last_end = e;
        self.last_match = Some(self.last_end);
        Some((s, e))
    }
}
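
// An illustrative sketch, not part of the original patch: `find_iter` (which
// returns the `FindMatches` iterator above) yields the (start, end) byte
// offsets of every non-overlapping match. The pattern and text below are
// hypothetical.
#[test]
fn _sketch_find_iter() {
    let re = Regex::new(r"\d+").unwrap();
    let spans: Vec<(uint, uint)> = re.find_iter("a1b22c333").collect();
    assert_eq!(spans, vec!((1u, 2u), (3u, 5u), (6u, 9u)));
}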

fn exec(re: &Regex, which: MatchKind, input: &str) -> CaptureLocs {
    exec_slice(re, which, input, 0, input.len())
}

fn exec_slice(re: &Regex, which: MatchKind,
              input: &str, s: uint, e: uint) -> CaptureLocs {
    match re.p {
        Dynamic(ref prog) => vm::run(which, prog, input, s, e),
        Native(exec) => exec(which, input, s, e),
    }
}

#[inline]
fn has_match(caps: &CaptureLocs) -> bool {
    caps.len() >= 2 && caps.get(0).is_some() && caps.get(1).is_some()
}
179
src/libregex/test/bench.rs
Normal file
179
src/libregex/test/bench.rs
Normal file
@ -0,0 +1,179 @@
|
||||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use rand::{Rng, task_rng};
|
||||
use stdtest::Bencher;
|
||||
use std::str;
|
||||
use regex::{Regex, NoExpand};
|
||||
|
||||
fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) {
|
||||
b.iter(|| if !re.is_match(text) { fail!("no match") });
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn no_exponential(b: &mut Bencher) {
|
||||
let n = 100;
|
||||
let re = Regex::new("a?".repeat(n) + "a".repeat(n)).unwrap();
|
||||
let text = "a".repeat(n);
|
||||
bench_assert_match(b, re, text);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn literal(b: &mut Bencher) {
|
||||
let re = regex!("y");
|
||||
let text = "x".repeat(50) + "y";
|
||||
bench_assert_match(b, re, text);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn not_literal(b: &mut Bencher) {
|
||||
let re = regex!(".y");
|
||||
let text = "x".repeat(50) + "y";
|
||||
bench_assert_match(b, re, text);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn match_class(b: &mut Bencher) {
|
||||
let re = regex!("[abcdw]");
|
||||
let text = "xxxx".repeat(20) + "w";
|
||||
bench_assert_match(b, re, text);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn match_class_in_range(b: &mut Bencher) {
|
||||
// 'b' is between 'a' and 'c', so the class range checking doesn't help.
|
||||
let re = regex!("[ac]");
|
||||
let text = "bbbb".repeat(20) + "c";
|
||||
bench_assert_match(b, re, text);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn replace_all(b: &mut Bencher) {
|
||||
let re = regex!("[cjrw]");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz";
|
||||
// FIXME: This isn't using the $name expand stuff.
|
||||
// It's possible RE2/Go is using it, but currently, the expand in this
|
||||
// crate is actually compiling a regex, so it's incredibly slow.
|
||||
b.iter(|| re.replace_all(text, NoExpand("")));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn anchored_literal_short_non_match(b: &mut Bencher) {
|
||||
let re = regex!("^zbc(d|e)");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn anchored_literal_long_non_match(b: &mut Bencher) {
|
||||
let re = regex!("^zbc(d|e)");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz".repeat(15);
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn anchored_literal_short_match(b: &mut Bencher) {
|
||||
let re = regex!("^.bc(d|e)");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn anchored_literal_long_match(b: &mut Bencher) {
|
||||
let re = regex!("^.bc(d|e)");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz".repeat(15);
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn one_pass_short_a(b: &mut Bencher) {
|
||||
let re = regex!("^.bc(d|e)*$");
|
||||
let text = "abcddddddeeeededd";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn one_pass_short_a_not(b: &mut Bencher) {
|
||||
let re = regex!(".bc(d|e)*$");
|
||||
let text = "abcddddddeeeededd";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn one_pass_short_b(b: &mut Bencher) {
|
||||
let re = regex!("^.bc(?:d|e)*$");
|
||||
let text = "abcddddddeeeededd";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn one_pass_short_b_not(b: &mut Bencher) {
|
||||
let re = regex!(".bc(?:d|e)*$");
|
||||
let text = "abcddddddeeeededd";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn one_pass_long_prefix(b: &mut Bencher) {
|
||||
let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn one_pass_long_prefix_not(b: &mut Bencher) {
|
||||
let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$");
|
||||
let text = "abcdefghijklmnopqrstuvwxyz";
|
||||
b.iter(|| re.is_match(text));
|
||||
}
|
||||
|
||||
macro_rules! throughput(
|
||||
($name:ident, $regex:expr, $size:expr) => (
|
||||
#[bench]
|
||||
fn $name(b: &mut Bencher) {
|
||||
let text = gen_text($size);
|
||||
b.bytes = $size;
|
||||
b.iter(|| if $regex.is_match(text) { fail!("match") });
|
||||
}
|
||||
);
|
||||
)
|
||||
|
||||
fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
|
||||
fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") }
|
||||
fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
|
||||
fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
|
||||
|
||||
fn gen_text(n: uint) -> ~str {
|
||||
let mut rng = task_rng();
|
||||
let mut bytes = rng.gen_ascii_str(n).into_bytes();
|
||||
for (i, b) in bytes.mut_iter().enumerate() {
|
||||
if i % 20 == 0 {
|
||||
*b = '\n' as u8
|
||||
}
|
||||
}
|
||||
str::from_utf8(bytes).unwrap().to_owned()
|
||||
}
|
||||
|
||||
throughput!(easy0_32, easy0(), 32)
|
||||
throughput!(easy0_1K, easy0(), 1<<10)
|
||||
throughput!(easy0_32K, easy0(), 32<<10)
|
||||
|
||||
throughput!(easy1_32, easy1(), 32)
|
||||
throughput!(easy1_1K, easy1(), 1<<10)
|
||||
throughput!(easy1_32K, easy1(), 32<<10)
|
||||
|
||||
throughput!(medium_32, medium(), 32)
|
||||
throughput!(medium_1K, medium(), 1<<10)
|
||||
throughput!(medium_32K,medium(), 32<<10)
|
||||
|
||||
throughput!(hard_32, hard(), 32)
|
||||
throughput!(hard_1K, hard(), 1<<10)
|
||||
throughput!(hard_32K,hard(), 32<<10)
|
||||
|
373
src/libregex/test/matches.rs
Normal file
373
src/libregex/test/matches.rs
Normal file
@ -0,0 +1,373 @@
|
||||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// ignore-tidy-linelength
|
||||
|
||||
// DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests'
|
||||
// on 2014-04-23 01:33:36.539280.
|
||||
|
||||
// Tests from basic.dat
|
||||
mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))
|
||||
mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7)))
|
||||
mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8)))
|
||||
mat!(match_basic_6, r"\)", r"()", Some((1, 2)))
|
||||
mat!(match_basic_7, r"a]", r"a]a", Some((0, 2)))
|
||||
mat!(match_basic_9, r"\}", r"}", Some((0, 1)))
|
||||
mat!(match_basic_10, r"\]", r"]", Some((0, 1)))
|
||||
mat!(match_basic_12, r"]", r"]", Some((0, 1)))
|
||||
mat!(match_basic_15, r"^a", r"ax", Some((0, 1)))
|
||||
mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3)))
|
||||
mat!(match_basic_17, r"a\^", r"a^", Some((0, 2)))
|
||||
mat!(match_basic_18, r"a$", r"aa", Some((1, 2)))
|
||||
mat!(match_basic_19, r"a\$", r"a$", Some((0, 2)))
|
||||
mat!(match_basic_20, r"^$", r"", Some((0, 0)))
|
||||
mat!(match_basic_21, r"$^", r"", Some((0, 0)))
|
||||
mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2)))
|
||||
mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0)))
|
||||
mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4)))
|
||||
mat!(match_basic_26, r"(ab|a)(bc|c)", r"abc", Some((0, 3)), Some((0, 2)), Some((2, 3)))
|
||||
mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2)))
|
||||
mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2)))
|
||||
mat!(match_basic_29, r"(a*)(b?)(b+)b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7)))
|
||||
mat!(match_basic_30, r"(a*)(b{0,1})(b{1,})b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7)))
|
||||
mat!(match_basic_32, r"((a|a)|a)", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_33, r"(a*)(a|aa)", r"aaaa", Some((0, 4)), Some((0, 3)), Some((3, 4)))
|
||||
mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4)))
|
||||
mat!(match_basic_35, r"a(b)|c(d)|a(e)f", r"aef", Some((0, 3)), None, None, Some((1, 2)))
|
||||
mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1)))
|
||||
mat!(match_basic_38, r"(a|b)c|a(b|c)", r"ab", Some((0, 2)), None, Some((1, 2)))
|
||||
mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2)))
|
||||
mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2)))
|
||||
mat!(match_basic_41, r"(.a|.b).*|.*(.a|.b)", r"xa", Some((0, 2)), Some((0, 2)))
|
||||
mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2)))
|
||||
mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2)))
|
||||
mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2)))
|
||||
mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8)))
|
||||
mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9)))
|
||||
mat!(match_basic_47, r"(aa|aaa)*|(a|aaaaa)", r"aa", Some((0, 2)), Some((0, 2)))
|
||||
mat!(match_basic_48, r"(a.|.a.)*|(a|.a...)", r"aa", Some((0, 2)), Some((0, 2)))
|
||||
mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3)))
|
||||
mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4)))
|
||||
mat!(match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4)))
|
||||
mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3)))
|
||||
mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3)))
|
||||
mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4)))
|
||||
mat!(match_basic_55, r":::1:::0:|:::1:1:0:", r":::0:::1:::1:::0:", Some((8, 17)))
|
||||
mat!(match_basic_56, r":::1:::0:|:::1:1:1:", r":::0:::1:::1:::0:", Some((8, 17)))
|
||||
mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1)))
|
||||
mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3)))
|
||||
mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3)))
|
||||
mat!(match_basic_65, r"
|
||||
", r"
|
||||
", Some((0, 1)))
|
||||
mat!(match_basic_66, r"
|
||||
", r"
|
||||
", Some((0, 1)))
|
||||
mat!(match_basic_67, r"[^a]", r"
|
||||
", Some((0, 1)))
|
||||
mat!(match_basic_68, r"
|
||||
a", r"
|
||||
a", Some((0, 2)))
|
||||
mat!(match_basic_69, r"(a)(b)(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((2, 3)))
|
||||
mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3)))
|
||||
mat!(match_basic_71, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 6,", Some((0, 6)))
|
||||
mat!(match_basic_72, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"2/7", Some((0, 3)))
|
||||
mat!(match_basic_73, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 1,Feb 6", Some((5, 11)))
|
||||
mat!(match_basic_74, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", r"x", Some((0, 1)), Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_75, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", r"xx", Some((0, 2)), Some((1, 2)), Some((1, 2)))
|
||||
mat!(match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81)))
|
||||
mat!(match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabbbbaa", Some((18, 25)))
|
||||
mat!(match_basic_78, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabaa", Some((18, 22)))
|
||||
mat!(match_basic_79, r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", r"baaabbbabac", Some((7, 11)))
|
||||
mat!(match_basic_80, r".*", r"", Some((0, 2)))
|
||||
mat!(match_basic_81, r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", Some((53, 57)))
|
||||
mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10)))
|
||||
mat!(match_basic_84, r"^", r"", Some((0, 0)))
|
||||
mat!(match_basic_85, r"$", r"", Some((0, 0)))
|
||||
mat!(match_basic_86, r"^$", r"", Some((0, 0)))
|
||||
mat!(match_basic_87, r"^a$", r"a", Some((0, 1)))
|
||||
mat!(match_basic_88, r"abc", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4)))
|
||||
mat!(match_basic_90, r"abc", r"ababc", Some((2, 5)))
|
||||
mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4)))
|
||||
mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6)))
|
||||
mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4)))
|
||||
mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6)))
|
||||
mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4)))
|
||||
mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3)))
|
||||
mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4)))
|
||||
mat!(match_basic_103, r"^", r"abc", Some((0, 0)))
|
||||
mat!(match_basic_104, r"$", r"abc", Some((3, 3)))
|
||||
mat!(match_basic_105, r"a.c", r"abc", Some((0, 3)))
|
||||
mat!(match_basic_106, r"a.c", r"axc", Some((0, 3)))
|
||||
mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5)))
|
||||
mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3)))
|
||||
mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3)))
|
||||
mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 3)))
|
||||
mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2)))
|
||||
mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2)))
|
||||
mat!(match_basic_113, r"a]", r"a]", Some((0, 2)))
|
||||
mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3)))
|
||||
mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3)))
|
||||
mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3)))
|
||||
mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3)))
|
||||
mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2)))
|
||||
mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2)))
|
||||
mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3)))
|
||||
mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2)))
|
||||
mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4)))
|
||||
mat!(match_basic_123, r"((a))", r"abc", Some((0, 1)), Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_124, r"(a)b(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((2, 3)))
|
||||
mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7)))
|
||||
mat!(match_basic_126, r"a*", r"aaa", Some((0, 3)))
|
||||
mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None)
|
||||
mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0)))
|
||||
mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None)
|
||||
mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2)))
|
||||
mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2)))
|
||||
mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3)))
|
||||
mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None)
|
||||
mat!(match_basic_138, r"a*", r"", Some((0, 0)))
|
||||
mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5)))
|
||||
mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1)))
|
||||
mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1)))
|
||||
mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1)))
|
||||
mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None)
|
||||
mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7)))
|
||||
mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3)))
|
||||
mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2)))
|
||||
mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4)))
|
||||
mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3)))
|
||||
mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2)))
|
||||
mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1)))
|
||||
mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3)))
|
||||
mat!(match_basic_153, r"a([bc]*)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4)))
|
||||
mat!(match_basic_154, r"a([bc]+)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4)))
|
||||
mat!(match_basic_155, r"a([bc]*)(c+d)", r"abcd", Some((0, 4)), Some((1, 2)), Some((2, 4)))
|
||||
mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7)))
|
||||
mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2)))
|
||||
mat!(match_basic_158, r"((a)(b)c)(d)", r"abcd", Some((0, 4)), Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((3, 4)))
|
||||
mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5)))
|
||||
mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3)))
|
||||
mat!(match_basic_161, r"(bc+d$|ef*g.|h?i(j|k))", r"effgz", Some((0, 5)), Some((0, 5)))
|
||||
mat!(match_basic_162, r"(bc+d$|ef*g.|h?i(j|k))", r"ij", Some((0, 2)), Some((0, 2)), Some((1, 2)))
|
||||
mat!(match_basic_163, r"(bc+d$|ef*g.|h?i(j|k))", r"reffgz", Some((1, 6)), Some((1, 6)))
|
||||
mat!(match_basic_164, r"(((((((((a)))))))))", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_basic_165, r"multiple words", r"multiple words yeah", Some((0, 14)))
|
||||
mat!(match_basic_166, r"(.*)c(.*)", r"abcde", Some((0, 5)), Some((0, 2)), Some((3, 5)))
|
||||
mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4)))
|
||||
mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3)))
|
||||
mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3)))
|
||||
mat!(match_basic_170, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qaddafi", Some((0, 15)), None, Some((10, 12)))
|
||||
mat!(match_basic_171, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mo'ammar Gadhafi", Some((0, 16)), None, Some((11, 13)))
|
||||
mat!(match_basic_172, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Kaddafi", Some((0, 15)), None, Some((10, 12)))
|
||||
mat!(match_basic_173, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qadhafi", Some((0, 15)), None, Some((10, 12)))
|
||||
mat!(match_basic_174, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gadafi", Some((0, 14)), None, Some((10, 11)))
|
||||
mat!(match_basic_175, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadafi", Some((0, 15)), None, Some((11, 12)))
|
||||
mat!(match_basic_176, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moamar Gaddafi", Some((0, 14)), None, Some((9, 11)))
|
||||
mat!(match_basic_177, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadhdhafi", Some((0, 18)), None, Some((13, 15)))
|
||||
mat!(match_basic_178, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Khaddafi", Some((0, 16)), None, Some((11, 13)))
|
||||
mat!(match_basic_179, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafy", Some((0, 16)), None, Some((11, 13)))
|
||||
mat!(match_basic_180, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghadafi", Some((0, 15)), None, Some((11, 12)))
|
||||
mat!(match_basic_181, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafi", Some((0, 16)), None, Some((11, 13)))
|
||||
mat!(match_basic_182, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muamar Kaddafi", Some((0, 14)), None, Some((9, 11)))
|
||||
mat!(match_basic_183, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Quathafi", Some((0, 16)), None, Some((11, 13)))
|
||||
mat!(match_basic_184, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gheddafi", Some((0, 16)), None, Some((11, 13)))
|
||||
mat!(match_basic_185, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Khadafy", Some((0, 15)), None, Some((11, 12)))
|
||||
mat!(match_basic_186, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Qudhafi", Some((0, 15)), None, Some((10, 12)))
|
||||
mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4)))
|
||||
mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4)))
|
||||
mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4)))
|
||||
mat!(match_basic_190, r"^([^!.]+).att.com!(.+)$", r"gryphon.att.com!eby", Some((0, 19)), Some((0, 7)), Some((16, 19)))
|
||||
mat!(match_basic_191, r"^([^!]+!)?([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3)))
|
||||
mat!(match_basic_192, r"^([^!]+!)?([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
|
||||
mat!(match_basic_193, r"^([^!]+!)?([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
|
||||
mat!(match_basic_194, r"^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), Some((4, 8)), Some((8, 11)))
|
||||
mat!(match_basic_195, r"((foo)|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), None, Some((0, 3)))
|
||||
mat!(match_basic_196, r"((foo)|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), None, Some((4, 7)))
|
||||
mat!(match_basic_197, r"((foo)|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3)))
|
||||
mat!(match_basic_198, r"((foo)|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3)))
|
||||
mat!(match_basic_199, r"((foo)|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)))
|
||||
mat!(match_basic_200, r"((foo)|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3)))
|
||||
mat!(match_basic_201, r"(foo|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), Some((0, 3)))
|
||||
mat!(match_basic_202, r"(foo|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), Some((4, 7)))
|
||||
mat!(match_basic_203, r"(foo|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)))
|
||||
mat!(match_basic_204, r"(foo|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3)))
|
||||
mat!(match_basic_205, r"(foo|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)))
|
||||
mat!(match_basic_206, r"(foo|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)))
|
||||
mat!(match_basic_207, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11)))
|
||||
mat!(match_basic_208, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3)))
|
||||
mat!(match_basic_209, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
|
||||
mat!(match_basic_210, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), None, None, Some((4, 8)), Some((8, 11)))
|
||||
mat!(match_basic_211, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
|
||||
mat!(match_basic_212, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bas", Some((0, 3)), Some((0, 3)), None, Some((0, 3)))
|
||||
mat!(match_basic_213, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bar!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7)))
|
||||
mat!(match_basic_214, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11)))
|
||||
mat!(match_basic_215, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7)))
|
||||
mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4)))
|
||||
mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4)))
|
||||
mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4)))
|
||||
mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4)))
|
||||
mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4)))
|
||||
mat!(match_basic_221, r"\\000", r"\000", Some((0, 4)))
|
||||
|
||||
// Tests from nullsubexpr.dat
|
||||
mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None)
|
||||
mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0)))
|
||||
mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0)))
|
||||
mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_17, r"(a+)+", r"x", None)
|
||||
mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None)
|
||||
mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0)))
|
||||
mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None)
|
||||
mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_34, r"([^b]*)*", r"aaaaaab", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_41, r"([ab]*)*", r"aaaabcde", Some((0, 5)), Some((0, 5)))
|
||||
mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None)
|
||||
mat!(match_nullsubexpr_46, r"([^ab]*)*", r"ccccxx", Some((0, 6)), Some((0, 6)))
|
||||
mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None)
|
||||
mat!(match_nullsubexpr_50, r"((z)+|a)*", r"zabcde", Some((0, 2)), Some((1, 2)))
|
||||
mat!(match_nullsubexpr_69, r"(a*)*(x)", r"x", Some((0, 1)), None, Some((0, 1)))
|
||||
mat!(match_nullsubexpr_70, r"(a*)*(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2)))
|
||||
mat!(match_nullsubexpr_71, r"(a*)*(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2)))
|
||||
mat!(match_nullsubexpr_73, r"(a*)+(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_74, r"(a*)+(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2)))
|
||||
mat!(match_nullsubexpr_75, r"(a*)+(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2)))
|
||||
mat!(match_nullsubexpr_77, r"(a*){2}(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1)))
|
||||
mat!(match_nullsubexpr_78, r"(a*){2}(x)", r"ax", Some((0, 2)), Some((1, 1)), Some((1, 2)))
|
||||
mat!(match_nullsubexpr_79, r"(a*){2}(x)", r"axa", Some((0, 2)), Some((1, 1)), Some((1, 2)))
|
||||
|
||||
// Tests from repetition.dat
|
||||
mat!(match_repetition_10, r"((..)|(.))", r"", None)
|
||||
mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None)
|
||||
mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None)
|
||||
mat!(match_repetition_14, r"((..)|(.)){1}", r"", None)
|
||||
mat!(match_repetition_15, r"((..)|(.)){2}", r"", None)
|
||||
mat!(match_repetition_16, r"((..)|(.)){3}", r"", None)
|
||||
mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0)))
|
||||
mat!(match_repetition_20, r"((..)|(.))", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1)))
|
||||
mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None)
|
||||
mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None)
|
||||
mat!(match_repetition_24, r"((..)|(.)){1}", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1)))
|
||||
mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None)
|
||||
mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None)
|
||||
mat!(match_repetition_28, r"((..)|(.))*", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1)))
|
||||
mat!(match_repetition_30, r"((..)|(.))", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_31, r"((..)|(.))((..)|(.))", r"aa", Some((0, 2)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)))
|
||||
mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None)
|
||||
mat!(match_repetition_34, r"((..)|(.)){1}", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_35, r"((..)|(.)){2}", r"aa", Some((0, 2)), Some((1, 2)), None, Some((1, 2)))
|
||||
mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None)
|
||||
mat!(match_repetition_38, r"((..)|(.))*", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_40, r"((..)|(.))", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_41, r"((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)))
|
||||
mat!(match_repetition_42, r"((..)|(.))((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)), Some((2, 3)), None, Some((2, 3)))
|
||||
mat!(match_repetition_44, r"((..)|(.)){1}", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_46, r"((..)|(.)){2}", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3)))
|
||||
mat!(match_repetition_47, r"((..)|(.)){3}", r"aaa", Some((0, 3)), Some((2, 3)), None, Some((2, 3)))
|
||||
mat!(match_repetition_50, r"((..)|(.))*", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3)))
|
||||
mat!(match_repetition_52, r"((..)|(.))", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_53, r"((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_54, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)), Some((3, 4)), None, Some((3, 4)))
|
||||
mat!(match_repetition_56, r"((..)|(.)){1}", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_57, r"((..)|(.)){2}", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_59, r"((..)|(.)){3}", r"aaaa", Some((0, 4)), Some((3, 4)), Some((0, 2)), Some((3, 4)))
|
||||
mat!(match_repetition_61, r"((..)|(.))*", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_63, r"((..)|(.))", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_64, r"((..)|(.))((..)|(.))", r"aaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_65, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaa", Some((0, 5)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 5)), None, Some((4, 5)))
|
||||
mat!(match_repetition_67, r"((..)|(.)){1}", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_68, r"((..)|(.)){2}", r"aaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_70, r"((..)|(.)){3}", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5)))
|
||||
mat!(match_repetition_73, r"((..)|(.))*", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5)))
|
||||
mat!(match_repetition_75, r"((..)|(.))", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_76, r"((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_77, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 6)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 6)), Some((4, 6)), None)
|
||||
mat!(match_repetition_79, r"((..)|(.)){1}", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
|
||||
mat!(match_repetition_80, r"((..)|(.)){2}", r"aaaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
|
||||
mat!(match_repetition_81, r"((..)|(.)){3}", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None)
|
||||
mat!(match_repetition_83, r"((..)|(.))*", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None)
|
||||
mat!(match_repetition_90, r"X(.?){0,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_91, r"X(.?){1,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_92, r"X(.?){2,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_93, r"X(.?){3,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_94, r"X(.?){4,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_95, r"X(.?){5,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_96, r"X(.?){6,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_97, r"X(.?){7,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
|
||||
mat!(match_repetition_98, r"X(.?){8,}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_100, r"X(.?){0,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_102, r"X(.?){1,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_104, r"X(.?){2,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_106, r"X(.?){3,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_108, r"X(.?){4,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_110, r"X(.?){5,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_112, r"X(.?){6,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_114, r"X(.?){7,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_115, r"X(.?){8,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
|
||||
mat!(match_repetition_126, r"(a|ab|c|bcd){0,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
|
||||
mat!(match_repetition_127, r"(a|ab|c|bcd){1,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
|
||||
mat!(match_repetition_128, r"(a|ab|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
|
||||
mat!(match_repetition_129, r"(a|ab|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
|
||||
mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None)
|
||||
mat!(match_repetition_131, r"(a|ab|c|bcd){0,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
|
||||
mat!(match_repetition_132, r"(a|ab|c|bcd){1,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
|
||||
mat!(match_repetition_133, r"(a|ab|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
|
||||
mat!(match_repetition_134, r"(a|ab|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
|
||||
mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None)
|
||||
mat!(match_repetition_136, r"(a|ab|c|bcd)*(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
|
||||
mat!(match_repetition_137, r"(a|ab|c|bcd)+(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
|
||||
mat!(match_repetition_143, r"(ab|a|c|bcd){0,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_145, r"(ab|a|c|bcd){1,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_149, r"(ab|a|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None)
|
||||
mat!(match_repetition_152, r"(ab|a|c|bcd){0,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_154, r"(ab|a|c|bcd){1,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_156, r"(ab|a|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_158, r"(ab|a|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None)
|
||||
mat!(match_repetition_161, r"(ab|a|c|bcd)*(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
mat!(match_repetition_163, r"(ab|a|c|bcd)+(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
|
||||
|
29
src/libregex/test/mod.rs
Normal file
29
src/libregex/test/mod.rs
Normal file
@ -0,0 +1,29 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#[cfg(not(stage1))]
#[phase(syntax)]
extern crate regex_macros;

// Dirty hack: During stage1, test dynamic regexes. For stage2, we test
// native regexes.
#[cfg(stage1)]
macro_rules! regex(
    ($re:expr) => (
        match ::regex::Regex::new($re) {
            Ok(re) => re,
            Err(err) => fail!("{}", err),
        }
    );
)

mod bench;
mod tests;
199
src/libregex/test/tests.rs
Normal file
199
src/libregex/test/tests.rs
Normal file
@ -0,0 +1,199 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// ignore-tidy-linelength

use regex::{Regex, NoExpand};

#[test]
fn splitn() {
    let re = regex!(r"\d+");
    let text = "cauchy123plato456tyler789binx";
    let subs: Vec<&str> = re.splitn(text, 2).collect();
    assert_eq!(subs, vec!("cauchy", "plato456tyler789binx"));
}

#[test]
fn split() {
    let re = regex!(r"\d+");
    let text = "cauchy123plato456tyler789binx";
    let subs: Vec<&str> = re.split(text).collect();
    assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx"));
}

macro_rules! replace(
    ($name:ident, $which:ident, $re:expr,
     $search:expr, $replace:expr, $result:expr) => (
        #[test]
        fn $name() {
            let re = regex!($re);
            assert_eq!(re.$which($search, $replace), StrBuf::from_str($result));
        }
    );
)

replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6")
replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z")
replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ")
replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1")
replace!(rep_double_dollar, replace,
         r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1")
replace!(rep_no_expand, replace,
         r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1")
replace!(rep_named, replace_all,
         r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
         "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3")
replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
         "", "trim me")

macro_rules! noparse(
    ($name:ident, $re:expr) => (
        #[test]
        fn $name() {
            let re = $re;
            match Regex::new(re) {
                Err(_) => {},
                Ok(_) => fail!("Regex '{}' should cause a parse error.", re),
            }
        }
    );
)

noparse!(fail_double_repeat, "a**")
noparse!(fail_no_repeat_arg, "*")
noparse!(fail_no_repeat_arg_begin, "^*")
noparse!(fail_incomplete_escape, "\\")
noparse!(fail_class_incomplete, "[A-")
noparse!(fail_class_not_closed, "[A")
noparse!(fail_class_no_begin, r"[\A]")
noparse!(fail_class_no_end, r"[\z]")
noparse!(fail_class_no_boundary, r"[\b]")
noparse!(fail_open_paren, "(")
noparse!(fail_close_paren, ")")
noparse!(fail_invalid_range, "[a-Z]")
noparse!(fail_empty_capture_name, "(?P<>a)")
noparse!(fail_empty_capture_exp, "(?P<name>)")
noparse!(fail_bad_capture_name, "(?P<na-me>)")
noparse!(fail_bad_flag, "(?a)a")
noparse!(fail_empty_alt_before, "|a")
noparse!(fail_empty_alt_after, "a|")
noparse!(fail_counted_big_exact, "a{1001}")
noparse!(fail_counted_big_min, "a{1001,}")
noparse!(fail_counted_no_close, "a{1001")
noparse!(fail_unfinished_cap, "(?")
noparse!(fail_unfinished_escape, "\\")
noparse!(fail_octal_digit, r"\8")
noparse!(fail_hex_digit, r"\xG0")
noparse!(fail_hex_short, r"\xF")
noparse!(fail_hex_long_digits, r"\x{fffg}")
noparse!(fail_flag_bad, "(?a)")
noparse!(fail_flag_empty, "(?)")
noparse!(fail_double_neg, "(?-i-i)")
noparse!(fail_neg_empty, "(?i-)")
noparse!(fail_empty_group, "()")
noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)")

macro_rules! mat(
    ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
        #[test]
        fn $name() {
            let text = $text;
            let expected: Vec<Option<(uint, uint)>> = vec!($($loc)+);
            let r = regex!($re);
            let got = match r.captures(text) {
                Some(c) => c.iter_pos().collect::<Vec<Option<(uint, uint)>>>(),
                None => vec!(None),
            };
            // The test set sometimes leaves out capture groups, so truncate
            // actual capture groups to match test set.
            let (sexpect, mut sgot) = (expected.as_slice(), got.as_slice());
            if sgot.len() > sexpect.len() {
                sgot = sgot.slice(0, sexpect.len())
            }
            if sexpect != sgot {
                fail!("For RE '{}' against '{}', expected '{}' but got '{}'",
                      $re, text, sexpect, sgot);
            }
        }
    );
)

// Some crazy expressions from regular-expressions.info.
mat!(match_ranges,
     r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
     "num: 255", Some((5, 8)))
mat!(match_ranges_not,
     r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
     "num: 256", None)
mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3)))
mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3)))
mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4)))
mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None)
mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
     "mine is jam.slam@gmail.com ", Some((8, 26)))
mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
     "mine is jam.slam@gmail ", None)
mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
     "mine is jam.slam@gmail.com ", Some((8, 26)))
mat!(match_date1,
     r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
     "1900-01-01", Some((0, 10)))
mat!(match_date2,
     r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
     "1900-00-01", None)
mat!(match_date3,
     r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
     "1900-13-01", None)

// Exercise the flags.
mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3)))
mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3)))
mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None)
mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2)))
mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4)))
mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None)
mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2)))
mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11)))
mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1)))
mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2)))
mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2)))

// Some Unicode tests.
mat!(uni_literal, r"Ⅰ", "Ⅰ", Some((0, 3)))
mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)))
mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)))
mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)))
mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)))
mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)))
mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)))
mat!(uni_case_not, r"Δ", "δ", None)
mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)))
mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)))
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)))
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)))

// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)))
mat!(uni_perl_w_not, r"\w+", "Ⅱ", None)
mat!(uni_perl_w_neg, r"\W+", "Ⅱ", Some((0, 3)))
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)))
mat!(uni_perl_d_not, r"\d+", "Ⅱ", None)
mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)))
mat!(uni_perl_s, r"\s+", " ", Some((0, 3)))
mat!(uni_perl_s_not, r"\s+", "☃", None)
mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)))

// And do the same for word boundaries.
mat!(uni_boundary_none, r"\d\b", "6δ", None)
mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)))

// A whole mess of tests from Glenn Fowler's regex test suite.
// Generated by the 'src/etc/regex-match-tests' program.
mod matches;
19 src/libregex/testdata/LICENSE vendored Normal file
@ -0,0 +1,19 @@
The following license covers testregex.c and all associated test data.

Permission is hereby granted, free of charge, to any person obtaining a
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do
so, subject to the following disclaimer:

THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17 src/libregex/testdata/README vendored Normal file
@ -0,0 +1,17 @@
Test data was taken from the Go distribution, which was in turn taken from the
testregex test suite:

  http://www2.research.att.com/~astopen/testregex/testregex.html

The LICENSE in this directory corresponds to the LICENSE that the data was
released under.

The tests themselves were modified for RE2/Go. A couple were modified further
by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
have been a bad idea, but I think being consistent with an established Regex
library is worth something.

Note that these files are read by 'src/etc/regexp-match-tests' and turned into
Rust tests found in 'src/libregexp/tests/matches.rs'.
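To make the mapping from test data to Rust tests concrete: a row of repetition.dat such as

:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go

comes out of the generator as the corresponding matches.rs entry seen near the top of this diff:

mat!(match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))

Each Some((start, end)) is one expected capture span from the data row, and None stands in for NOMATCH; both lines above are taken verbatim from elsewhere in this commit.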
221 src/libregex/testdata/basic.dat vendored Normal file
@ -0,0 +1,221 @@
|
||||
NOTE all standard compliant implementations should pass these : 2002-05-31
|
||||
|
||||
BE abracadabra$ abracadabracadabra (7,18)
|
||||
BE a...b abababbb (2,7)
|
||||
BE XXXXXX ..XXXXXX (2,8)
|
||||
E \) () (1,2)
|
||||
BE a] a]a (0,2)
|
||||
B } } (0,1)
|
||||
E \} } (0,1)
|
||||
BE \] ] (0,1)
|
||||
B ] ] (0,1)
|
||||
E ] ] (0,1)
|
||||
B { { (0,1)
|
||||
B } } (0,1)
|
||||
BE ^a ax (0,1)
|
||||
BE \^a a^a (1,3)
|
||||
BE a\^ a^ (0,2)
|
||||
BE a$ aa (1,2)
|
||||
BE a\$ a$ (0,2)
|
||||
BE ^$ NULL (0,0)
|
||||
E $^ NULL (0,0)
|
||||
E a($) aa (1,2)(2,2)
|
||||
E a*(^a) aa (0,1)(0,1)
|
||||
E (..)*(...)* a (0,0)
|
||||
E (..)*(...)* abcd (0,4)(2,4)
|
||||
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
|
||||
E (ab)c|abc abc (0,3)(0,2)
|
||||
E a{0}b ab (1,2)
|
||||
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E a{9876543210} NULL BADBR
|
||||
E ((a|a)|a) a (0,1)(0,1)(0,1)
|
||||
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
|
||||
E a*(a.|aa) aaaa (0,4)(2,4)
|
||||
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
|
||||
E (a|b)?.* b (0,1)(0,1)
|
||||
E (a|b)c|a(b|c) ac (0,2)(0,1)
|
||||
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
|
||||
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
|
||||
E (a|b)*c|(a|ab)*c xc (1,2)
|
||||
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
|
||||
E a?(ab|ba)ab abab (0,4)(0,2)
|
||||
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
|
||||
E ab|abab abbabab (0,2)
|
||||
E aba|bab|bba baaabbbaba (5,8)
|
||||
E aba|bab baaabbbaba (6,9)
|
||||
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
|
||||
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
|
||||
E ab|a xabc (1,3)
|
||||
E ab|a xxabc (2,4)
|
||||
Ei (Ab|cD)* aBcD (0,4)(2,4)
|
||||
BE [^-] --a (2,3)
|
||||
BE [a-]* --a (0,3)
|
||||
BE [a-m-]* --amoma-- (0,4)
|
||||
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
|
||||
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
|
||||
{E [[:upper:]] A (0,1) [[<element>]] not supported
|
||||
E [[:lower:]]+ `az{ (1,3)
|
||||
E [[:upper:]]+ @AZ[ (1,3)
|
||||
# No collation in Go
|
||||
#BE [[-]] [[-]] (2,4)
|
||||
#BE [[.NIL.]] NULL ECOLLATE
|
||||
#BE [[=aleph=]] NULL ECOLLATE
|
||||
}
|
||||
BE$ \n \n (0,1)
|
||||
BEn$ \n \n (0,1)
|
||||
BE$ [^a] \n (0,1)
|
||||
BE$ \na \na (0,2)
|
||||
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
|
||||
BE xxx xxx (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
|
||||
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
|
||||
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
|
||||
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
|
||||
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
|
||||
BE$ .* \x01\x7f (0,2)
|
||||
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
|
||||
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
|
||||
E a*a*a*a*a*b aaaaaaaaab (0,10)
|
||||
BE ^ NULL (0,0)
|
||||
BE $ NULL (0,0)
|
||||
BE ^$ NULL (0,0)
|
||||
BE ^a$ a (0,1)
|
||||
BE abc abc (0,3)
|
||||
BE abc xabcy (1,4)
|
||||
BE abc ababc (2,5)
|
||||
BE ab*c abc (0,3)
|
||||
BE ab*bc abc (0,3)
|
||||
BE ab*bc abbc (0,4)
|
||||
BE ab*bc abbbbc (0,6)
|
||||
E ab+bc abbc (0,4)
|
||||
E ab+bc abbbbc (0,6)
|
||||
E ab?bc abbc (0,4)
|
||||
E ab?bc abc (0,3)
|
||||
E ab?c abc (0,3)
|
||||
BE ^abc$ abc (0,3)
|
||||
BE ^abc abcc (0,3)
|
||||
BE abc$ aabc (1,4)
|
||||
BE ^ abc (0,0)
|
||||
BE $ abc (3,3)
|
||||
BE a.c abc (0,3)
|
||||
BE a.c axc (0,3)
|
||||
BE a.*c axyzc (0,5)
|
||||
BE a[bc]d abd (0,3)
|
||||
BE a[b-d]e ace (0,3)
|
||||
BE a[b-d] aac (1,3)
|
||||
BE a[-b] a- (0,2)
|
||||
BE a[b-] a- (0,2)
|
||||
BE a] a] (0,2)
|
||||
BE a[]]b a]b (0,3)
|
||||
BE a[^bc]d aed (0,3)
|
||||
BE a[^-b]c adc (0,3)
|
||||
BE a[^]b]c adc (0,3)
|
||||
E ab|cd abc (0,2)
|
||||
E ab|cd abcd (0,2)
|
||||
E a\(b a(b (0,3)
|
||||
E a\(*b ab (0,2)
|
||||
E a\(*b a((b (0,4)
|
||||
E ((a)) abc (0,1)(0,1)(0,1)
|
||||
E (a)b(c) abc (0,3)(0,1)(2,3)
|
||||
E a+b+c aabbabc (4,7)
|
||||
E a* aaa (0,3)
|
||||
#E (a*)* - (0,0)(0,0)
|
||||
E (a*)* - (0,0)(?,?) RE2/Go
|
||||
E (a*)+ - (0,0)(0,0)
|
||||
#E (a*|b)* - (0,0)(0,0)
|
||||
E (a*|b)* - (0,0)(?,?) RE2/Go
|
||||
E (a+|b)* ab (0,2)(1,2)
|
||||
E (a+|b)+ ab (0,2)(1,2)
|
||||
E (a+|b)? ab (0,1)(0,1)
|
||||
BE [^ab]* cde (0,3)
|
||||
#E (^)* - (0,0)(0,0)
|
||||
E (^)* - (0,0)(?,?) RE2/Go
|
||||
BE a* NULL (0,0)
|
||||
E ([abc])*d abbbcd (0,6)(4,5)
|
||||
E ([abc])*bcd abcd (0,4)(0,1)
|
||||
E a|b|c|d|e e (0,1)
|
||||
E (a|b|c|d|e)f ef (0,2)(0,1)
|
||||
#E ((a*|b))* - (0,0)(0,0)(0,0)
|
||||
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
|
||||
BE abcd*efg abcdefg (0,7)
|
||||
BE ab* xabyabbbz (1,3)
|
||||
BE ab* xayabbbz (1,2)
|
||||
E (ab|cd)e abcde (2,5)(2,4)
|
||||
BE [abhgefdc]ij hij (0,3)
|
||||
E (a|b)c*d abcd (1,4)(1,2)
|
||||
E (ab|ab*)bc abc (0,3)(0,1)
|
||||
E a([bc]*)c* abc (0,3)(1,3)
|
||||
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
|
||||
E a[bcd]*dcdcde adcdcde (0,7)
|
||||
E (ab|a)b*c abc (0,3)(0,2)
|
||||
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
|
||||
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
|
||||
E ^a(bc+|b[eh])g|.h$ abh (1,3)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
|
||||
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
|
||||
BE multiple words multiple words yeah (0,14)
|
||||
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
|
||||
BE abcd abcd (0,4)
|
||||
E a(bc)d abcd (0,4)(1,3)
|
||||
E a[-]?c ac (0,3)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
|
||||
E a+(b|c)*d+ aabcdd (0,6)(3,4)
|
||||
E ^.+$ vivi (0,4)
|
||||
E ^(.+)$ vivi (0,4)(0,4)
|
||||
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
|
||||
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
|
||||
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
|
||||
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
|
||||
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
|
||||
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
|
||||
E (foo|(bar))!bas foo!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas bar!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E (foo|bar)!bas foo!bas (0,7)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E .*(/XXX).* /XXX (0,4)(0,4)
|
||||
E .*(\\XXX).* \XXX (0,4)(0,4)
|
||||
E \\XXX \XXX (0,4)
|
||||
E .*(/000).* /000 (0,4)(0,4)
|
||||
E .*(\\000).* \000 (0,4)(0,4)
|
||||
E \\000 \000 (0,4)
|
79 src/libregex/testdata/nullsubexpr.dat vendored Normal file
@ -0,0 +1,79 @@
|
||||
NOTE null subexpression matches : 2002-06-06
|
||||
|
||||
E (a*)* a (0,1)(0,1)
|
||||
#E SAME x (0,0)(0,0)
|
||||
E SAME x (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)* a (0,1)(0,1)
|
||||
E SAME x (0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)+ a (0,1)(0,1)
|
||||
E SAME x NOMATCH
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
|
||||
E ([a]*)* a (0,1)(0,1)
|
||||
#E SAME x (0,0)(0,0)
|
||||
E SAME x (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([a]*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([^b]*)* a (0,1)(0,1)
|
||||
#E SAME b (0,0)(0,0)
|
||||
E SAME b (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaab (0,6)(0,6)
|
||||
E ([ab]*)* a (0,1)(0,1)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME ababab (0,6)(0,6)
|
||||
E SAME bababa (0,6)(0,6)
|
||||
E SAME b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
E SAME aaaabcde (0,5)(0,5)
|
||||
E ([^a]*)* b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
#E SAME aaaaaa (0,0)(0,0)
|
||||
E SAME aaaaaa (0,0)(?,?) RE2/Go
|
||||
E ([^ab]*)* ccccxx (0,6)(0,6)
|
||||
#E SAME ababab (0,0)(0,0)
|
||||
E SAME ababab (0,0)(?,?) RE2/Go
|
||||
|
||||
E ((z)+|a)* zabcde (0,2)(1,2)
|
||||
|
||||
#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
|
||||
#E (a) aaa (0,1)(0,1)
|
||||
#E (a*?) aaa (0,0)(0,0)
|
||||
#E (a)*? aaa (0,0)
|
||||
#E (a*?)*? aaa (0,0)
|
||||
#}
|
||||
|
||||
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
|
||||
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
|
||||
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
|
||||
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
|
||||
|
||||
#E (a*)*(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
|
||||
E (a*)*(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)*(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*)+(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)+(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)+(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*){2}(x) x (0,1)(0,0)(0,1)
|
||||
E (a*){2}(x) ax (0,2)(1,1)(1,2)
|
||||
E (a*){2}(x) axa (0,2)(1,1)(1,2)
|
163 src/libregex/testdata/repetition.dat vendored Normal file
@ -0,0 +1,163 @@
|
||||
NOTE implicit vs. explicit repetitions : 2009-02-02
|
||||
|
||||
# Glenn Fowler <gsf@research.att.com>
|
||||
# conforming matches (column 4) must match one of the following BREs
|
||||
# NOMATCH
|
||||
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
|
||||
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
|
||||
# i.e., each 3-tuple has two identical elements and one (?,?)
|
||||
|
||||
E ((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
|
||||
E ((..)|(.)){1} NULL NOMATCH
|
||||
E ((..)|(.)){2} NULL NOMATCH
|
||||
E ((..)|(.)){3} NULL NOMATCH
|
||||
|
||||
E ((..)|(.))* NULL (0,0)
|
||||
|
||||
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.))((..)|(.)) a NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
|
||||
|
||||
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.)){2} a NOMATCH
|
||||
E ((..)|(.)){3} a NOMATCH
|
||||
|
||||
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
|
||||
|
||||
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
|
||||
|
||||
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.)){3} aa NOMATCH
|
||||
|
||||
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
|
||||
|
||||
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
|
||||
|
||||
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
|
||||
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
|
||||
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
|
||||
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
|
||||
|
||||
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
|
||||
|
||||
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
|
||||
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
|
||||
|
||||
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
|
||||
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
|
||||
|
||||
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
|
||||
|
||||
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
|
||||
|
||||
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
|
||||
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
|
||||
|
||||
# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
|
||||
# Linux/GLIBC gets the {8,} and {8,8} wrong.
|
||||
|
||||
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
|
||||
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
|
||||
|
||||
# These test a fixed bug in my regex-tdfa that did not keep the expanded
|
||||
# form properly grouped, so right association did the wrong thing with
|
||||
# these ambiguous patterns (crafted just to test my code when I became
|
||||
# suspicious of my implementation). The first subexpression should use
|
||||
# "ab" then "a" then "bcd".
|
||||
|
||||
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
|
||||
# results like (0,6)(4,5)(6,6).
|
||||
|
||||
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
|
||||
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
|
||||
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
|
||||
|
||||
# The above worked on Linux/GLIBC but the following often fail.
|
||||
# They also trip up OS X / FreeBSD / NetBSD:
|
||||
|
||||
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
|
||||
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
|
||||
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
5537 src/libregex/unicode.rs Normal file
File diff suppressed because it is too large
587 src/libregex/vm.rs Normal file
@ -0,0 +1,587 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// FIXME: Currently, the VM simulates an NFA. It would be nice to have another
// VM that simulates a DFA.
//
// According to Russ Cox[1], a DFA performs better than an NFA, principally
// because it reuses states previously computed by the machine *and* doesn't
// keep track of capture groups. The drawback of a DFA (aside from its
// complexity) is that it can't accurately return the locations of submatches.
// The NFA *can* do that. (This is my understanding anyway.)
//
// Cox suggests that a DFA ought to be used to answer "does this match" and
// "where does it match" questions. (In the latter, the starting position of
// the match is computed by executing the regex backwards.) Cox also suggests
// that a DFA should be run when asking "where are the submatches", which can
// 1) quickly answer "no" if there's no match and 2) discover the substring
// that matches, which means running the NFA on smaller input.
//
// Currently, the NFA simulation implemented below does some dirty tricks to
// avoid tracking capture groups when they aren't needed (which only works
// for 'is_match', not 'find'). This is a half-measure, but does provide some
// perf improvement.
//
// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
//
// [1] - http://swtch.com/~rsc/regex/regex3.html

use std::cmp;
use std::mem;
use std::slice::MutableVector;
use compile::{
    Program,
    Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary,
    Save, Jump, Split,
};
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
use parse::unicode::PERLW;
|
||||
pub type CaptureLocs = Vec<Option<uint>>;
|
||||
|
||||
/// Indicates the type of match to be performed by the VM.
|
||||
pub enum MatchKind {
|
||||
/// Only checks if a match exists or not. Does not return location.
|
||||
Exists,
|
||||
/// Returns the start and end indices of the entire match in the input
|
||||
/// given.
|
||||
Location,
|
||||
/// Returns the start and end indices of each submatch in the input given.
|
||||
Submatches,
|
||||
}
|
||||
|
||||
/// Runs an NFA simulation on the compiled expression given on the search text
|
||||
/// `input`. The search begins at byte index `start` and ends at byte index
|
||||
/// `end`. (The range is specified here so that zero-width assertions will work
|
||||
/// correctly when searching for successive non-overlapping matches.)
|
||||
///
|
||||
/// The `which` parameter indicates what kind of capture information the caller
|
||||
/// wants. There are three choices: match existence only, the location of the
|
||||
/// entire match or the locations of the entire match in addition to the
|
||||
/// locations of each submatch.
|
||||
pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str,
|
||||
start: uint, end: uint) -> CaptureLocs {
|
||||
Nfa {
|
||||
which: which,
|
||||
prog: prog,
|
||||
input: input,
|
||||
start: start,
|
||||
end: end,
|
||||
ic: 0,
|
||||
chars: CharReader::new(input),
|
||||
}.run()
|
||||
}
|
||||
|
||||
struct Nfa<'r, 't> {
|
||||
which: MatchKind,
|
||||
prog: &'r Program,
|
||||
input: &'t str,
|
||||
start: uint,
|
||||
end: uint,
|
||||
ic: uint,
|
||||
chars: CharReader<'t>,
|
||||
}
|
||||
|
||||
/// Indicates the next action to take after a single non-empty instruction
|
||||
/// is processed.
|
||||
pub enum StepState {
|
||||
/// This is returned if and only if a Match instruction is reached and
|
||||
/// we only care about the existence of a match. It instructs the VM to
|
||||
/// quit early.
|
||||
StepMatchEarlyReturn,
|
||||
/// Indicates that a match was found. Thus, the rest of the states in the
|
||||
/// *current* queue should be dropped (i.e., leftmost-first semantics).
|
||||
/// States in the "next" queue can still be processed.
|
||||
StepMatch,
|
||||
/// No match was found. Continue with the next state in the queue.
|
||||
StepContinue,
|
||||
}
|
||||
|
||||
impl<'r, 't> Nfa<'r, 't> {
|
||||
fn run(&mut self) -> CaptureLocs {
|
||||
let ncaps = match self.which {
|
||||
Exists => 0,
|
||||
Location => 1,
|
||||
Submatches => self.prog.num_captures(),
|
||||
};
|
||||
let mut matched = false;
|
||||
let ninsts = self.prog.insts.len();
|
||||
let mut clist = &mut Threads::new(self.which, ninsts, ncaps);
|
||||
let mut nlist = &mut Threads::new(self.which, ninsts, ncaps);
|
||||
|
||||
let mut groups = Vec::from_elem(ncaps * 2, None);
|
||||
|
||||
// Determine if the expression starts with a '^' so we can avoid
|
||||
// simulating .*?
|
||||
// Make sure multi-line mode isn't enabled for it, otherwise we can't
|
||||
// drop the initial .*?
|
||||
let prefix_anchor =
|
||||
match *self.prog.insts.get(1) {
|
||||
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
self.ic = self.start;
|
||||
let mut next_ic = self.chars.set(self.start);
|
||||
while self.ic <= self.end {
|
||||
if clist.size == 0 {
|
||||
// We have a match and we're done exploring alternatives.
|
||||
// Time to quit.
|
||||
if matched {
|
||||
break
|
||||
}
|
||||
|
||||
// If there are no threads to try, then we'll have to start
|
||||
// over at the beginning of the regex.
|
||||
// BUT, if there's a literal prefix for the program, try to
|
||||
// jump ahead quickly. If it can't be found, then we can bail
|
||||
// out early.
|
||||
if self.prog.prefix.len() > 0 && clist.size == 0 {
|
||||
let needle = self.prog.prefix.as_slice().as_bytes();
|
||||
let haystack = self.input.as_bytes().slice_from(self.ic);
|
||||
match find_prefix(needle, haystack) {
|
||||
None => break,
|
||||
Some(i) => {
|
||||
self.ic += i;
|
||||
next_ic = self.chars.set(self.ic);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This simulates a preceding '.*?' for every regex by adding
|
||||
// a state starting at the current position in the input for the
|
||||
// beginning of the program only if we don't already have a match.
|
||||
if clist.size == 0 || (!prefix_anchor && !matched) {
|
||||
self.add(clist, 0, groups.as_mut_slice())
|
||||
}
|
||||
|
||||
// Now we try to read the next character.
|
||||
// As a result, the 'step' method will look at the previous
|
||||
// character.
|
||||
self.ic = next_ic;
|
||||
next_ic = self.chars.advance();
|
||||
|
||||
let mut i = 0;
|
||||
while i < clist.size {
|
||||
let pc = clist.pc(i);
|
||||
let step_state = self.step(groups.as_mut_slice(), nlist,
|
||||
clist.groups(i), pc);
|
||||
match step_state {
|
||||
StepMatchEarlyReturn => return vec![Some(0), Some(0)],
|
||||
StepMatch => { matched = true; clist.empty() },
|
||||
StepContinue => {},
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
mem::swap(&mut clist, &mut nlist);
|
||||
nlist.empty();
|
||||
}
|
||||
match self.which {
|
||||
Exists if matched => vec![Some(0), Some(0)],
|
||||
Exists => vec![None, None],
|
||||
Location | Submatches => groups,
|
||||
}
|
||||
}
|
||||
|
||||
fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads,
|
||||
caps: &mut [Option<uint>], pc: uint)
|
||||
-> StepState {
|
||||
match *self.prog.insts.get(pc) {
|
||||
Match => {
|
||||
match self.which {
|
||||
Exists => {
|
||||
return StepMatchEarlyReturn
|
||||
}
|
||||
Location => {
|
||||
groups[0] = caps[0];
|
||||
groups[1] = caps[1];
|
||||
return StepMatch
|
||||
}
|
||||
Submatches => {
|
||||
for (slot, val) in groups.mut_iter().zip(caps.iter()) {
|
||||
*slot = *val;
|
||||
}
|
||||
return StepMatch
|
||||
}
|
||||
}
|
||||
}
|
||||
OneChar(c, flags) => {
|
||||
if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) {
|
||||
self.add(nlist, pc+1, caps);
|
||||
}
|
||||
}
|
||||
CharClass(ref ranges, flags) => {
|
||||
if self.chars.prev.is_some() {
|
||||
let c = self.chars.prev.unwrap();
|
||||
let negate = flags & FLAG_NEGATED > 0;
|
||||
let casei = flags & FLAG_NOCASE > 0;
|
||||
let found = ranges.as_slice();
|
||||
let found = found.bsearch(|&rc| class_cmp(casei, c, rc));
|
||||
let found = found.is_some();
|
||||
if (found && !negate) || (!found && negate) {
|
||||
self.add(nlist, pc+1, caps);
|
||||
}
|
||||
}
|
||||
}
|
||||
Any(flags) => {
|
||||
if flags & FLAG_DOTNL > 0
|
||||
|| !self.char_eq(false, self.chars.prev, '\n') {
|
||||
self.add(nlist, pc+1, caps)
|
||||
}
|
||||
}
|
||||
EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_)
|
||||
| Save(_) | Jump(_) | Split(_, _) => {},
|
||||
}
|
||||
StepContinue
|
||||
}
|
||||
|
||||
fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) {
|
||||
if nlist.contains(pc) {
|
||||
return
|
||||
}
|
||||
// We have to add states to the threads list even if they're empty.
|
||||
// TL;DR - It prevents cycles.
|
||||
// If we didn't care about cycles, we'd *only* add threads that
|
||||
// correspond to non-jumping instructions (OneChar, Any, Match, etc.).
|
||||
// But, it's possible for valid regexs (like '(a*)*') to result in
|
||||
// a cycle in the instruction list. e.g., We'll keep chasing the Split
|
||||
// instructions forever.
|
||||
// So we add these instructions to our thread queue, but in the main
|
||||
// VM loop, we look for them but simply ignore them.
|
||||
// Adding them to the queue prevents them from being revisited so we
|
||||
// can avoid cycles (and the inevitable stack overflow).
|
||||
//
|
||||
// We make a minor optimization by indicating that the state is "empty"
|
||||
// so that its capture groups are not filled in.
|
||||
match *self.prog.insts.get(pc) {
|
||||
EmptyBegin(flags) => {
|
||||
let multi = flags & FLAG_MULTI > 0;
|
||||
nlist.add(pc, groups, true);
|
||||
if self.chars.is_begin()
|
||||
|| (multi && self.char_is(self.chars.prev, '\n')) {
|
||||
self.add(nlist, pc + 1, groups)
|
||||
}
|
||||
}
|
||||
EmptyEnd(flags) => {
|
||||
let multi = flags & FLAG_MULTI > 0;
|
||||
nlist.add(pc, groups, true);
|
||||
if self.chars.is_end()
|
||||
|| (multi && self.char_is(self.chars.cur, '\n')) {
|
||||
self.add(nlist, pc + 1, groups)
|
||||
}
|
||||
}
|
||||
EmptyWordBoundary(flags) => {
|
||||
nlist.add(pc, groups, true);
|
||||
if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) {
|
||||
self.add(nlist, pc + 1, groups)
|
||||
}
|
||||
}
|
||||
Save(slot) => {
|
||||
nlist.add(pc, groups, true);
|
||||
match self.which {
|
||||
Location if slot <= 1 => {
|
||||
let old = groups[slot];
|
||||
groups[slot] = Some(self.ic);
|
||||
self.add(nlist, pc + 1, groups);
|
||||
groups[slot] = old;
|
||||
}
|
||||
Submatches => {
|
||||
let old = groups[slot];
|
||||
groups[slot] = Some(self.ic);
|
||||
self.add(nlist, pc + 1, groups);
|
||||
groups[slot] = old;
|
||||
}
|
||||
Exists | Location => self.add(nlist, pc + 1, groups),
|
||||
}
|
||||
}
|
||||
Jump(to) => {
|
||||
nlist.add(pc, groups, true);
|
||||
self.add(nlist, to, groups)
|
||||
}
|
||||
Split(x, y) => {
|
||||
nlist.add(pc, groups, true);
|
||||
self.add(nlist, x, groups);
|
||||
self.add(nlist, y, groups);
|
||||
}
|
||||
Match | OneChar(_, _) | CharClass(_, _) | Any(_) => {
|
||||
nlist.add(pc, groups, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: For case insensitive comparisons, it uses the uppercase
|
||||
// character and tests for equality. IIUC, this does not generalize to
|
||||
// all of Unicode. I believe we need to check the entire fold for each
|
||||
// character. This will be easy to add if and when it gets added to Rust's
|
||||
// standard library.
|
||||
#[inline]
|
||||
fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool {
|
||||
match textc {
|
||||
None => false,
|
||||
Some(textc) => {
|
||||
regc == textc
|
||||
|| (casei && regc.to_uppercase() == textc.to_uppercase())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn char_is(&self, textc: Option<char>, regc: char) -> bool {
|
||||
textc == Some(regc)
|
||||
}
|
||||
}
|
||||
|
||||
/// CharReader is responsible for maintaining a "previous" and a "current"
|
||||
/// character. This one-character lookahead is necessary for assertions that
|
||||
/// look one character before or after the current position.
|
||||
pub struct CharReader<'t> {
|
||||
/// The previous character read. It is None only when processing the first
|
||||
/// character of the input.
|
||||
pub prev: Option<char>,
|
||||
/// The current character.
|
||||
pub cur: Option<char>,
|
||||
input: &'t str,
|
||||
next: uint,
|
||||
}
|
||||
|
||||
impl<'t> CharReader<'t> {
|
||||
/// Returns a new CharReader that advances through the input given.
|
||||
/// Note that a CharReader has no knowledge of the range in which to search
|
||||
/// the input.
|
||||
pub fn new(input: &'t str) -> CharReader<'t> {
|
||||
CharReader {
|
||||
prev: None,
|
||||
cur: None,
|
||||
input: input,
|
||||
next: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the previous and current character given any arbitrary byte
|
||||
/// index (at a unicode codepoint boundary).
|
||||
#[inline]
|
||||
pub fn set(&mut self, ic: uint) -> uint {
|
||||
self.prev = None;
|
||||
self.cur = None;
|
||||
self.next = 0;
|
||||
|
||||
if self.input.len() == 0 {
|
||||
return 1
|
||||
}
|
||||
if ic > 0 {
|
||||
let i = cmp::min(ic, self.input.len());
|
||||
let prev = self.input.char_range_at_reverse(i);
|
||||
self.prev = Some(prev.ch);
|
||||
}
|
||||
if ic < self.input.len() {
|
||||
let cur = self.input.char_range_at(ic);
|
||||
self.cur = Some(cur.ch);
|
||||
self.next = cur.next;
|
||||
self.next
|
||||
} else {
|
||||
self.input.len() + 1
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the same as `set`, except it always advances to the next
|
||||
/// character in the input (and therefore does half as many UTF8 decodings).
|
||||
#[inline]
|
||||
pub fn advance(&mut self) -> uint {
|
||||
self.prev = self.cur;
|
||||
if self.next < self.input.len() {
|
||||
let cur = self.input.char_range_at(self.next);
|
||||
self.cur = Some(cur.ch);
|
||||
self.next = cur.next;
|
||||
} else {
|
||||
self.cur = None;
|
||||
self.next = self.input.len() + 1;
|
||||
}
|
||||
self.next
|
||||
}
|
||||
|
||||
/// Returns true if and only if this is the beginning of the input
|
||||
/// (ignoring the range of the input to search).
|
||||
#[inline]
|
||||
pub fn is_begin(&self) -> bool { self.prev.is_none() }
|
||||
|
||||
/// Returns true if and only if this is the end of the input
|
||||
/// (ignoring the range of the input to search).
|
||||
#[inline]
|
||||
pub fn is_end(&self) -> bool { self.cur.is_none() }
|
||||
|
||||
/// Returns true if and only if the current position is a word boundary.
|
||||
/// (Ignoring the range of the input to search.)
|
||||
pub fn is_word_boundary(&self) -> bool {
|
||||
if self.is_begin() {
|
||||
return is_word(self.cur)
|
||||
}
|
||||
if self.is_end() {
|
||||
return is_word(self.prev)
|
||||
}
|
||||
(is_word(self.cur) && !is_word(self.prev))
|
||||
|| (is_word(self.prev) && !is_word(self.cur))
|
||||
}
|
||||
}
|
||||
|
||||
struct Thread {
|
||||
pc: uint,
|
||||
groups: Vec<Option<uint>>,
|
||||
}
|
||||
|
||||
struct Threads {
|
||||
which: MatchKind,
|
||||
queue: Vec<Thread>,
|
||||
sparse: Vec<uint>,
|
||||
size: uint,
|
||||
}
|
||||
|
||||
impl Threads {
|
||||
// This is using a wicked neat trick to provide constant time lookup
|
||||
// for threads in the queue using a sparse set. A queue of threads is
|
||||
// allocated once with maximal size when the VM initializes and is reused
|
||||
// throughout execution. That is, there should be zero allocation during
|
||||
// the execution of a VM.
|
||||
//
|
||||
// See http://research.swtch.com/sparse for the deets.
|
||||
fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads {
|
||||
Threads {
|
||||
which: which,
|
||||
queue: Vec::from_fn(num_insts, |_| {
|
||||
Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) }
|
||||
}),
|
||||
sparse: Vec::from_elem(num_insts, 0u),
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) {
|
||||
let t = self.queue.get_mut(self.size);
|
||||
t.pc = pc;
|
||||
match (empty, self.which) {
|
||||
(_, Exists) | (true, _) => {},
|
||||
(false, Location) => {
|
||||
*t.groups.get_mut(0) = groups[0];
|
||||
*t.groups.get_mut(1) = groups[1];
|
||||
}
|
||||
(false, Submatches) => {
|
||||
for (slot, val) in t.groups.mut_iter().zip(groups.iter()) {
|
||||
*slot = *val;
|
||||
}
|
||||
}
|
||||
}
|
||||
*self.sparse.get_mut(pc) = self.size;
|
||||
self.size += 1;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn contains(&self, pc: uint) -> bool {
|
||||
let s = *self.sparse.get(pc);
|
||||
s < self.size && self.queue.get(s).pc == pc
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn empty(&mut self) {
|
||||
self.size = 0;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn pc(&self, i: uint) -> uint {
|
||||
self.queue.get(i).pc
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] {
|
||||
self.queue.get_mut(i).groups.as_mut_slice()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the character is a word character, according to the
|
||||
/// (Unicode friendly) Perl character class '\w'.
|
||||
/// Note that this is only used for testing word boundaries. The actual '\w'
|
||||
/// is encoded as a CharClass instruction.
|
||||
pub fn is_word(c: Option<char>) -> bool {
|
||||
let c = match c {
|
||||
None => return false,
|
||||
Some(c) => c,
|
||||
};
|
||||
// Try the common ASCII case before invoking binary search.
|
||||
match c {
|
||||
'_' | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' => true,
|
||||
_ => PERLW.bsearch(|&(start, end)| {
|
||||
if c >= start && c <= end {
|
||||
Equal
|
||||
} else if start > c {
|
||||
Greater
|
||||
} else {
|
||||
Less
|
||||
}
|
||||
}).is_some()
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a character and a single character class range, return an ordering
|
||||
/// indicating whether the character is less than the start of the range,
|
||||
/// in the range (inclusive) or greater than the end of the range.
|
||||
///
|
||||
/// If `casei` is `true`, then this ordering is computed case insensitively.
|
||||
///
|
||||
/// This function is meant to be used with a binary search.
|
||||
#[inline]
|
||||
fn class_cmp(casei: bool, mut textc: char,
|
||||
(mut start, mut end): (char, char)) -> Ordering {
|
||||
if casei {
|
||||
// FIXME: This is pretty ridiculous. All of this case conversion
|
||||
// can be moved outside this function:
|
||||
// 1) textc should be uppercased outside the bsearch.
|
||||
// 2) the character class itself should be uppercased either in the
|
||||
// parser or the compiler.
|
||||
// FIXME: This is too simplistic for correct Unicode support.
|
||||
// See also: char_eq
|
||||
textc = textc.to_uppercase();
|
||||
start = start.to_uppercase();
|
||||
end = end.to_uppercase();
|
||||
}
|
||||
if textc >= start && textc <= end {
|
||||
Equal
|
||||
} else if start > textc {
|
||||
Greater
|
||||
} else {
|
||||
Less
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the starting location of `needle` in `haystack`.
|
||||
/// If `needle` is not in `haystack`, then `None` is returned.
|
||||
///
|
||||
/// Note that this is using a naive substring algorithm.
|
||||
#[inline]
|
||||
pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> {
|
||||
let (hlen, nlen) = (haystack.len(), needle.len());
|
||||
if nlen > hlen || nlen == 0 {
|
||||
return None
|
||||
}
|
||||
let mut hayi = 0u;
|
||||
'HAYSTACK: loop {
|
||||
if hayi > hlen - nlen {
|
||||
break
|
||||
}
|
||||
let mut nedi = 0;
|
||||
while nedi < nlen {
|
||||
if haystack[hayi+nedi] != needle[nedi] {
|
||||
hayi += 1;
|
||||
continue 'HAYSTACK
|
||||
}
|
||||
nedi += 1;
|
||||
}
|
||||
return Some(hayi)
|
||||
}
|
||||
None
|
||||
}
|
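The constant-time bookkeeping that `Threads` does in vm.rs above (and that the generated code in regex_macros repeats below) is the sparse-set trick the comments cite from research.swtch.com/sparse: membership checks and clearing are O(1), and the backing arrays never need to be re-zeroed between input positions. A rough, self-contained sketch of just that trick in present-day Rust; the type and `main` here are illustrative only and not part of this crate:

struct SparseSet {
    dense: Vec<usize>,  // inserted values, packed in insertion order
    sparse: Vec<usize>, // sparse[v] = position of v in `dense`, if present
    size: usize,        // how many slots of `dense` are live
}

impl SparseSet {
    fn new(capacity: usize) -> SparseSet {
        SparseSet { dense: vec![0; capacity], sparse: vec![0; capacity], size: 0 }
    }

    // A stale `sparse[v]` entry is harmless: it either points past `size`
    // or at a `dense` slot that holds some other value.
    fn contains(&self, v: usize) -> bool {
        let s = self.sparse[v];
        s < self.size && self.dense[s] == v
    }

    fn insert(&mut self, v: usize) {
        if !self.contains(v) {
            self.dense[self.size] = v;
            self.sparse[v] = self.size;
            self.size += 1;
        }
    }

    // O(1) clear: old entries are simply ignored by `contains`.
    fn clear(&mut self) {
        self.size = 0;
    }
}

fn main() {
    let mut seen = SparseSet::new(8); // capacity = number of instructions
    seen.insert(3);
    assert!(seen.contains(3) && !seen.contains(5));
    seen.clear();
    assert!(!seen.contains(3));
}

This is what lets the VM allocate its two thread queues once, up front, and reuse them across the whole search with no per-character allocation or clearing cost.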
684 src/libregex_macros/lib.rs Normal file
@ -0,0 +1,684 @@
|
||||
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! This crate provides the `regex!` macro. Its use is documented in the
|
||||
//! `regex` crate.
|
||||
|
||||
#![crate_id = "regex_macros#0.11-pre"]
|
||||
#![crate_type = "dylib"]
|
||||
#![experimental]
|
||||
#![license = "MIT/ASL2"]
|
||||
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
|
||||
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
|
||||
html_root_url = "http://static.rust-lang.org/doc/master")]
|
||||
|
||||
#![feature(macro_registrar, managed_boxes, quote)]
|
||||
|
||||
extern crate regex;
|
||||
extern crate syntax;
|
||||
|
||||
use syntax::ast;
|
||||
use syntax::codemap;
|
||||
use syntax::ext::base::{
|
||||
SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult,
|
||||
NormalTT, BasicMacroExpander,
|
||||
};
|
||||
use syntax::parse;
|
||||
use syntax::parse::token;
|
||||
use syntax::print::pprust;
|
||||
|
||||
use regex::Regex;
|
||||
use regex::native::{
|
||||
OneChar, CharClass, Any, Save, Jump, Split,
|
||||
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
|
||||
Program, Dynamic, Native,
|
||||
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
|
||||
};
|
||||
|
||||
/// For the `regex!` syntax extension. Do not use.
|
||||
#[macro_registrar]
|
||||
#[doc(hidden)]
|
||||
pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) {
|
||||
let expander = ~BasicMacroExpander { expander: native, span: None };
|
||||
register(token::intern("regex"), NormalTT(expander, None))
|
||||
}
|
||||
|
||||
/// Generates specialized code for the Pike VM for a particular regular
|
||||
/// expression.
|
||||
///
|
||||
/// There are two primary differences between the code generated here and the
|
||||
/// general code in vm.rs.
|
||||
///
|
||||
/// 1. All heap allocation is removed. Sized vector types are used instead.
|
||||
/// Care must be taken to make sure that these vectors are not copied
|
||||
/// gratuitously. (If you're not sure, run the benchmarks. They will yell
|
||||
/// at you if you do.)
|
||||
/// 2. The main `match instruction { ... }` expressions are replaced with more
|
||||
/// direct `match pc { ... }`. The generators can be found in
|
||||
/// `step_insts` and `add_insts`.
|
||||
///
|
||||
/// Other more minor changes include eliding code when possible (although this
|
||||
/// isn't completely thorough at the moment), and translating character class
|
||||
/// matching from using a binary search to a simple `match` expression (see
|
||||
/// `match_class`).
|
||||
///
|
||||
/// It is strongly recommended to read the dynamic implementation in vm.rs
|
||||
/// first before trying to understand the code generator. The implementation
|
||||
/// strategy is identical and vm.rs has comments and will be easier to follow.
|
||||
fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
|
||||
-> ~MacResult {
|
||||
let regex = match parse(cx, tts) {
|
||||
Some(r) => r,
|
||||
// error is logged in 'parse' with cx.span_err
|
||||
None => return DummyResult::any(sp),
|
||||
};
|
||||
let re = match Regex::new(regex.to_owned()) {
|
||||
Ok(re) => re,
|
||||
Err(err) => {
|
||||
cx.span_err(sp, err.to_str());
|
||||
return DummyResult::any(sp)
|
||||
}
|
||||
};
|
||||
let prog = match re.p {
|
||||
Dynamic(ref prog) => prog.clone(),
|
||||
Native(_) => unreachable!(),
|
||||
};
|
||||
|
||||
let mut gen = NfaGen {
|
||||
cx: &*cx, sp: sp, prog: prog,
|
||||
names: re.names.clone(), original: re.original.clone(),
|
||||
};
|
||||
MacExpr::new(gen.code())
|
||||
}
|
||||
|
||||
struct NfaGen<'a> {
|
||||
cx: &'a ExtCtxt<'a>,
|
||||
sp: codemap::Span,
|
||||
prog: Program,
|
||||
names: ~[Option<~str>],
|
||||
original: ~str,
|
||||
}
|
||||
|
||||
impl<'a> NfaGen<'a> {
|
||||
fn code(&mut self) -> @ast::Expr {
|
||||
// Most or all of the following things are used in the quasiquoted
|
||||
// expression returned.
|
||||
let num_cap_locs = 2 * self.prog.num_captures();
|
||||
let num_insts = self.prog.insts.len();
|
||||
let cap_names = self.vec_expr(self.names,
|
||||
|cx, name| match name {
|
||||
&Some(ref name) => {
|
||||
let name = name.as_slice();
|
||||
quote_expr!(cx, Some(~$name))
|
||||
}
|
||||
&None => quote_expr!(cx, None),
|
||||
}
|
||||
);
|
||||
let prefix_anchor =
|
||||
match self.prog.insts.as_slice()[1] {
|
||||
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
|
||||
_ => false,
|
||||
};
|
||||
let init_groups = self.vec_from_fn(num_cap_locs,
|
||||
|cx| quote_expr!(cx, None));
|
||||
let prefix_bytes = self.vec_expr(self.prog.prefix.as_slice().as_bytes(),
|
||||
|cx, b| quote_expr!(cx, $b));
|
||||
let check_prefix = self.check_prefix();
|
||||
let step_insts = self.step_insts();
|
||||
let add_insts = self.add_insts();
|
||||
let regex = self.original.as_slice();
|
||||
|
||||
        quote_expr!(self.cx, {
            fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                        start: uint, end: uint) -> Vec<Option<uint>> {
                #![allow(unused_imports)]
                use regex::native::{
                    MatchKind, Exists, Location, Submatches,
                    StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
                    CharReader, find_prefix,
                };

                return Nfa {
                    which: which,
                    input: input,
                    ic: 0,
                    chars: CharReader::new(input),
                }.run(start, end);

                type Captures = [Option<uint>, ..$num_cap_locs];

                struct Nfa<'t> {
                    which: MatchKind,
                    input: &'t str,
                    ic: uint,
                    chars: CharReader<'t>,
                }

                impl<'t> Nfa<'t> {
                    #[allow(unused_variable)]
                    fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> {
                        let mut matched = false;
                        let prefix_bytes: &[u8] = &$prefix_bytes;
                        let mut clist = &mut Threads::new(self.which);
                        let mut nlist = &mut Threads::new(self.which);

                        let mut groups = $init_groups;

                        self.ic = start;
                        let mut next_ic = self.chars.set(start);
                        while self.ic <= end {
                            if clist.size == 0 {
                                if matched {
                                    break
                                }
                                $check_prefix
                            }
                            if clist.size == 0 || (!$prefix_anchor && !matched) {
                                self.add(clist, 0, &mut groups)
                            }

                            self.ic = next_ic;
                            next_ic = self.chars.advance();

                            let mut i = 0;
                            while i < clist.size {
                                let pc = clist.pc(i);
                                let step_state = self.step(&mut groups, nlist,
                                                           clist.groups(i), pc);
                                match step_state {
                                    StepMatchEarlyReturn =>
                                        return vec![Some(0u), Some(0u)],
                                    StepMatch => { matched = true; clist.empty() },
                                    StepContinue => {},
                                }
                                i += 1;
                            }
                            ::std::mem::swap(&mut clist, &mut nlist);
                            nlist.empty();
                        }
                        match self.which {
                            Exists if matched => vec![Some(0u), Some(0u)],
                            Exists => vec![None, None],
                            Location | Submatches => groups.iter().map(|x| *x).collect(),
                        }
                    }

                    // Sometimes `nlist` is never used (for empty regexes).
                    #[allow(unused_variable)]
                    #[inline]
                    fn step(&self, groups: &mut Captures, nlist: &mut Threads,
                            caps: &mut Captures, pc: uint) -> StepState {
                        $step_insts
                        StepContinue
                    }

                    fn add(&self, nlist: &mut Threads, pc: uint,
                           groups: &mut Captures) {
                        if nlist.contains(pc) {
                            return
                        }
                        $add_insts
                    }
                }

                struct Thread {
                    pc: uint,
                    groups: Captures,
                }

                struct Threads {
                    which: MatchKind,
                    queue: [Thread, ..$num_insts],
                    sparse: [uint, ..$num_insts],
                    size: uint,
                }

                impl Threads {
                    fn new(which: MatchKind) -> Threads {
                        Threads {
                            which: which,
                            // These unsafe blocks are used for performance reasons, as
                            // they give us a zero-cost initialization of a sparse set.
                            // The trick is described in more detail here:
                            // http://research.swtch.com/sparse
                            // The idea here is to avoid initializing threads that never
                            // need to be initialized, particularly for larger regexes
                            // with a lot of instructions.
                            // (A simplified sketch of the trick follows `code` below.)
                            queue: unsafe { ::std::mem::uninit() },
                            sparse: unsafe { ::std::mem::uninit() },
                            size: 0,
                        }
                    }

                    #[inline]
                    fn add(&mut self, pc: uint, groups: &Captures) {
                        let t = &mut self.queue[self.size];
                        t.pc = pc;
                        match self.which {
                            Exists => {},
                            Location => {
                                t.groups[0] = groups[0];
                                t.groups[1] = groups[1];
                            }
                            Submatches => {
                                for (slot, val) in t.groups.mut_iter().zip(groups.iter()) {
                                    *slot = *val;
                                }
                            }
                        }
                        self.sparse[pc] = self.size;
                        self.size += 1;
                    }

                    #[inline]
                    fn add_empty(&mut self, pc: uint) {
                        self.queue[self.size].pc = pc;
                        self.sparse[pc] = self.size;
                        self.size += 1;
                    }

                    #[inline]
                    fn contains(&self, pc: uint) -> bool {
                        let s = self.sparse[pc];
                        s < self.size && self.queue[s].pc == pc
                    }

                    #[inline]
                    fn empty(&mut self) {
                        self.size = 0;
                    }

                    #[inline]
                    fn pc(&self, i: uint) -> uint {
                        self.queue[i].pc
                    }

                    #[inline]
                    fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures {
                        &'r mut self.queue[i].groups
                    }
                }
            }

            ::regex::Regex {
                original: ~$regex,
                names: ~$cap_names,
                p: ::regex::native::Native(exec),
            }
        })
    }

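    // A hedged aside on the sparse-set trick used by the generated `Threads`
    // type above (http://research.swtch.com/sparse): membership can be
    // answered without ever initializing the backing arrays, because
    // `sparse[pc]` is only trusted when it points at a slot below `size`
    // that points back at `pc`. A simplified, safe illustration (names and
    // sizes are made up):
    //
    //     let mut queue = [0u, ..8];   // dense list of inserted pcs
    //     let mut sparse = [0u, ..8];  // pc -> index into `queue`
    //     let mut size = 0u;
    //
    //     // insert pc = 5
    //     queue[size] = 5; sparse[5] = size; size += 1;
    //
    //     // contains(pc): only a mutually consistent pair counts as present
    //     let pc = 5u;
    //     let s = sparse[pc];
    //     assert!(s < size && queue[s] == pc);
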
    // Generates code for the `add` method, which is responsible for adding
    // zero-width states to the next queue of states to visit.
    fn add_insts(&self) -> @ast::Expr {
        let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
            let nextpc = pc + 1;
            let body = match *inst {
                EmptyBegin(flags) => {
                    let nl = '\n';
                    let cond =
                        if flags & FLAG_MULTI > 0 {
                            quote_expr!(self.cx,
                                self.chars.is_begin()
                                || self.chars.prev == Some($nl)
                            )
                        } else {
                            quote_expr!(self.cx, self.chars.is_begin())
                        };
                    quote_expr!(self.cx, {
                        nlist.add_empty($pc);
                        if $cond { self.add(nlist, $nextpc, &mut *groups) }
                    })
                }
                EmptyEnd(flags) => {
                    let nl = '\n';
                    let cond =
                        if flags & FLAG_MULTI > 0 {
                            quote_expr!(self.cx,
                                self.chars.is_end()
                                || self.chars.cur == Some($nl)
                            )
                        } else {
                            quote_expr!(self.cx, self.chars.is_end())
                        };
                    quote_expr!(self.cx, {
                        nlist.add_empty($pc);
                        if $cond { self.add(nlist, $nextpc, &mut *groups) }
                    })
                }
                EmptyWordBoundary(flags) => {
                    let cond =
                        if flags & FLAG_NEGATED > 0 {
                            quote_expr!(self.cx, !self.chars.is_word_boundary())
                        } else {
                            quote_expr!(self.cx, self.chars.is_word_boundary())
                        };
                    quote_expr!(self.cx, {
                        nlist.add_empty($pc);
                        if $cond { self.add(nlist, $nextpc, &mut *groups) }
                    })
                }
                Save(slot) => {
                    let save = quote_expr!(self.cx, {
                        let old = groups[$slot];
                        groups[$slot] = Some(self.ic);
                        self.add(nlist, $nextpc, &mut *groups);
                        groups[$slot] = old;
                    });
                    let add = quote_expr!(self.cx, {
                        self.add(nlist, $nextpc, &mut *groups);
                    });
                    // If this is saving a submatch location but we request
                    // existence or only full match location, then we can skip
                    // right over it every time.
                    if slot > 1 {
                        quote_expr!(self.cx, {
                            nlist.add_empty($pc);
                            match self.which {
                                Submatches => $save,
                                Exists | Location => $add,
                            }
                        })
                    } else {
                        quote_expr!(self.cx, {
                            nlist.add_empty($pc);
                            match self.which {
                                Submatches | Location => $save,
                                Exists => $add,
                            }
                        })
                    }
                }
                Jump(to) => {
                    quote_expr!(self.cx, {
                        nlist.add_empty($pc);
                        self.add(nlist, $to, &mut *groups);
                    })
                }
                Split(x, y) => {
                    quote_expr!(self.cx, {
                        nlist.add_empty($pc);
                        self.add(nlist, $x, &mut *groups);
                        self.add(nlist, $y, &mut *groups);
                    })
                }
                // For Match, OneChar, CharClass, Any
                _ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
            };
            self.arm_inst(pc, body)
        }).collect::<Vec<ast::Arm>>();

        self.match_insts(arms)
    }

    // Generates the code for the `step` method, which processes all states
    // in the current queue that consume a single character.
    fn step_insts(&self) -> @ast::Expr {
        let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
            let nextpc = pc + 1;
            let body = match *inst {
                Match => {
                    quote_expr!(self.cx, {
                        match self.which {
                            Exists => {
                                return StepMatchEarlyReturn
                            }
                            Location => {
                                groups[0] = caps[0];
                                groups[1] = caps[1];
                                return StepMatch
                            }
                            Submatches => {
                                for (slot, val) in groups.mut_iter().zip(caps.iter()) {
                                    *slot = *val;
                                }
                                return StepMatch
                            }
                        }
                    })
                }
                OneChar(c, flags) => {
                    if flags & FLAG_NOCASE > 0 {
                        let upc = c.to_uppercase();
                        quote_expr!(self.cx, {
                            let upc = self.chars.prev.map(|c| c.to_uppercase());
                            if upc == Some($upc) {
                                self.add(nlist, $nextpc, caps);
                            }
                        })
                    } else {
                        quote_expr!(self.cx, {
                            if self.chars.prev == Some($c) {
                                self.add(nlist, $nextpc, caps);
                            }
                        })
                    }
                }
                CharClass(ref ranges, flags) => {
                    let negate = flags & FLAG_NEGATED > 0;
                    let casei = flags & FLAG_NOCASE > 0;
                    let get_char =
                        if casei {
                            quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase())
                        } else {
                            quote_expr!(self.cx, self.chars.prev.unwrap())
                        };
                    let negcond =
                        if negate {
                            quote_expr!(self.cx, !found)
                        } else {
                            quote_expr!(self.cx, found)
                        };
                    let mranges = self.match_class(casei, ranges.as_slice());
                    quote_expr!(self.cx, {
                        if self.chars.prev.is_some() {
                            let c = $get_char;
                            let found = $mranges;
                            if $negcond {
                                self.add(nlist, $nextpc, caps);
                            }
                        }
                    })
                }
                Any(flags) => {
                    if flags & FLAG_DOTNL > 0 {
                        quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
                    } else {
                        let nl = '\n'; // no char lits allowed? wtf?
                        quote_expr!(self.cx, {
                            if self.chars.prev != Some($nl) {
                                self.add(nlist, $nextpc, caps)
                            }
                        })
                    }
                }
                // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
                _ => quote_expr!(self.cx, {}),
            };
            self.arm_inst(pc, body)
        }).collect::<Vec<ast::Arm>>();

        self.match_insts(arms)
    }

    // Translates a character class into a match expression.
    // This avoids a binary search (and is hopefully replaced by a jump
    // table).
    fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr {
        let mut arms = ranges.iter().map(|&(mut start, mut end)| {
            if casei {
                start = start.to_uppercase();
                end = end.to_uppercase();
            }
            ast::Arm {
                attrs: vec!(),
                pats: vec!(@ast::Pat{
                    id: ast::DUMMY_NODE_ID,
                    span: self.sp,
                    node: ast::PatRange(quote_expr!(self.cx, $start),
                                        quote_expr!(self.cx, $end)),
                }),
                guard: None,
                body: quote_expr!(self.cx, true),
            }
        }).collect::<Vec<ast::Arm>>();

        arms.push(self.wild_arm_expr(quote_expr!(self.cx, false)));

        let match_on = quote_expr!(self.cx, c);
        self.dummy_expr(ast::ExprMatch(match_on, arms))
    }

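    // To make the shape of `match_class` output concrete: for a hypothetical
    // class like `[a-cx]` (ranges ('a','c') and ('x','x')), the generated
    // expression would look roughly like the following, with the wild-card
    // arm supplying the `false` case:
    //
    //     match c {
    //         'a'..'c' => true,
    //         'x'..'x' => true,
    //         _ => false,
    //     }
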
    // Generates code for checking a literal prefix of the search string.
    // The code is only generated if the regex *has* a literal prefix.
    // Otherwise, a no-op is returned.
    fn check_prefix(&self) -> @ast::Expr {
        if self.prog.prefix.len() == 0 {
            quote_expr!(self.cx, {})
        } else {
            quote_expr!(self.cx,
                if clist.size == 0 {
                    let haystack = self.input.as_bytes().slice_from(self.ic);
                    match find_prefix(prefix_bytes, haystack) {
                        None => break,
                        Some(i) => {
                            self.ic += i;
                            next_ic = self.chars.set(self.ic);
                        }
                    }
                }
            )
        }
    }

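    // Intuition for the prefix check above, under the assumption that the
    // compiled program reports a literal prefix: for a pattern such as
    // `abc\w+`, the prefix would be `abc`, so when no threads are live the
    // search position jumps straight to the next occurrence of that literal
    // instead of advancing one character at a time. Roughly:
    //
    //     // haystack: "xxxxxabcdef", prefix: "abc"
    //     // find_prefix returns Some(5), so self.ic skips ahead by 5.
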
    // Builds a `match pc { ... }` expression from a list of arms, specifically
    // for matching the current program counter with an instruction.
    // A wild-card arm is automatically added that executes a no-op. It will
    // never be used, but is added to satisfy the compiler complaining about
    // non-exhaustive patterns.
    fn match_insts(&self, mut arms: Vec<ast::Arm>) -> @ast::Expr {
        let mat_pc = quote_expr!(self.cx, pc);
        arms.push(self.wild_arm_expr(quote_expr!(self.cx, {})));
        self.dummy_expr(ast::ExprMatch(mat_pc, arms))
    }

    // Creates a match arm for the instruction at `pc` with the expression
    // `body`.
    fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm {
        ast::Arm {
            attrs: vec!(),
            pats: vec!(@ast::Pat{
                id: ast::DUMMY_NODE_ID,
                span: self.sp,
                node: ast::PatLit(quote_expr!(self.cx, $pc)),
            }),
            guard: None,
            body: body,
        }
    }

    // Creates a wild-card match arm with the expression `body`.
    fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm {
        ast::Arm {
            attrs: vec!(),
            pats: vec!(@ast::Pat{
                id: ast::DUMMY_NODE_ID,
                span: self.sp,
                node: ast::PatWild,
            }),
            guard: None,
            body: body,
        }
    }

    // Builds a `[a, b, .., len]` expression where each element is the result
    // of executing `to_expr`.
    fn vec_from_fn(&self, len: uint, to_expr: |&ExtCtxt| -> @ast::Expr)
                  -> @ast::Expr {
        self.vec_expr(Vec::from_elem(len, ()).as_slice(),
                      |cx, _| to_expr(cx))
    }

    // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr`
    // on each element in `xs`.
    fn vec_expr<T>(&self, xs: &[T], to_expr: |&ExtCtxt, &T| -> @ast::Expr)
                  -> @ast::Expr {
        let mut exprs = vec!();
        for x in xs.iter() {
            exprs.push(to_expr(self.cx, x))
        }
        let vec_exprs = self.dummy_expr(ast::ExprVec(exprs));
        quote_expr!(self.cx, $vec_exprs)
    }

    // Creates an expression with a dummy node ID given an underlying
    // `ast::Expr_`.
    fn dummy_expr(&self, e: ast::Expr_) -> @ast::Expr {
        @ast::Expr {
            id: ast::DUMMY_NODE_ID,
            node: e,
            span: self.sp,
        }
    }
}

// This trait is defined in the quote module in the syntax crate, but I
// don't think it's exported.
// Interestingly, quote_expr! only requires that a 'to_tokens' method be
// defined rather than satisfying a particular trait.
#[doc(hidden)]
trait ToTokens {
    fn to_tokens(&self, cx: &ExtCtxt) -> Vec<ast::TokenTree>;
}

impl ToTokens for char {
    fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> {
        vec!(ast::TTTok(codemap::DUMMY_SP, token::LIT_CHAR((*self) as u32)))
    }
}

impl ToTokens for bool {
    fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> {
        let ident = token::IDENT(token::str_to_ident(self.to_str()), false);
        vec!(ast::TTTok(codemap::DUMMY_SP, ident))
    }
}

/// Looks for a single string literal and returns it.
/// Otherwise, logs an error with cx.span_err and returns None.
fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> {
    let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(),
                                                Vec::from_slice(tts));
    let entry = cx.expand_expr(parser.parse_expr());
    let regex = match entry.node {
        ast::ExprLit(lit) => {
            match lit.node {
                ast::LitStr(ref s, _) => s.to_str(),
                _ => {
                    cx.span_err(entry.span, format!(
                        "expected string literal but got `{}`",
                        pprust::lit_to_str(lit)));
                    return None
                }
            }
        }
        _ => {
            cx.span_err(entry.span, format!(
                "expected string literal but got `{}`",
                pprust::expr_to_str(entry)));
            return None
        }
    };
    if !parser.eat(&token::EOF) {
        cx.span_err(parser.span, "only one string literal allowed");
        return None;
    }
    Some(regex)
}
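
// A quick, informal summary of how `parse` behaves at the call site (the
// error strings come from the function above; the exact invocations and
// rendered expressions are illustrative):
//
//     regex!("a+b")       // ok: a single string literal
//     regex!(1 + 1)       // error: expected string literal but got `1 + 1`
//     regex!("a", "b")    // error: only one string literal allowed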
94
src/test/bench/shootout-regex-dna.rs
Normal file
94
src/test/bench/shootout-regex-dna.rs
Normal file
@ -0,0 +1,94 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// ignore-stage1
// ignore-cross-compile #12102

#![feature(macro_rules, phase)]

extern crate regex;
#[phase(syntax)] extern crate regex_macros;
extern crate sync;

use std::io;
use regex::{NoExpand, Regex};
use sync::Arc;

fn count_matches(seq: &str, variant: &Regex) -> int {
    let mut n = 0;
    for _ in variant.find_iter(seq) {
        n += 1;
    }
    n
}

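// For example (hypothetical input, using the helper above): counting the
// first variant pattern in a short fragment would look like
//
//     let n = count_matches("agggtaaatttaccct", &regex!("agggtaaa|tttaccct"));
//     // n == 2: one non-overlapping hit for each alternative
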
fn main() {
    let mut rdr = if std::os::getenv("RUST_BENCH").is_some() {
        let fd = io::File::open(&Path::new("shootout-k-nucleotide.data"));
        ~io::BufferedReader::new(fd) as ~io::Reader
    } else {
        ~io::stdin() as ~io::Reader
    };
    let mut seq = StrBuf::from_str(rdr.read_to_str().unwrap());
    let ilen = seq.len();

    seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand(""));
    let seq_arc = Arc::new(seq.clone()); // copy before it moves
    let clen = seq.len();

    let mut seqlen = sync::Future::spawn(proc() {
        let substs = ~[
            (regex!("B"), "(c|g|t)"),
            (regex!("D"), "(a|g|t)"),
            (regex!("H"), "(a|c|t)"),
            (regex!("K"), "(g|t)"),
            (regex!("M"), "(a|c)"),
            (regex!("N"), "(a|c|g|t)"),
            (regex!("R"), "(a|g)"),
            (regex!("S"), "(c|g)"),
            (regex!("V"), "(a|c|g)"),
            (regex!("W"), "(a|t)"),
            (regex!("Y"), "(c|t)"),
        ];
        let mut seq = seq;
        for (re, replacement) in substs.move_iter() {
            seq = re.replace_all(seq.as_slice(), NoExpand(replacement));
        }
        seq.len()
    });

    let variants = ~[
        regex!("agggtaaa|tttaccct"),
        regex!("[cgt]gggtaaa|tttaccc[acg]"),
        regex!("a[act]ggtaaa|tttacc[agt]t"),
        regex!("ag[act]gtaaa|tttac[agt]ct"),
        regex!("agg[act]taaa|ttta[agt]cct"),
        regex!("aggg[acg]aaa|ttt[cgt]ccct"),
        regex!("agggt[cgt]aa|tt[acg]accct"),
        regex!("agggta[cgt]a|t[acg]taccct"),
        regex!("agggtaa[cgt]|[acg]ttaccct"),
    ];
    let (mut variant_strs, mut counts) = (vec!(), vec!());
    for variant in variants.move_iter() {
        let seq_arc_copy = seq_arc.clone();
        variant_strs.push(variant.to_str().to_owned());
        counts.push(sync::Future::spawn(proc() {
            count_matches(seq_arc_copy.as_slice(), &variant)
        }));
    }

    for (i, variant) in variant_strs.iter().enumerate() {
        println!("{} {}", variant, counts.get_mut(i).get());
    }
    println!("");
    println!("{}", ilen);
    println!("{}", clen);
    println!("{}", seqlen.get());
}
26
src/test/compile-fail/syntax-extension-regex-invalid.rs
Normal file
26
src/test/compile-fail/syntax-extension-regex-invalid.rs
Normal file
@ -0,0 +1,26 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// ignore-stage1

#![feature(phase)]

extern crate regex;
#[phase(syntax)] extern crate regex_macros;

// Tests to make sure that `regex!` will produce a compile error when given
// an invalid regular expression.
// More exhaustive failure tests for the parser are done with the traditional
// unit testing infrastructure, since both dynamic and native regexes use the
// same parser.

fn main() {
    let _ = regex!("("); //~ ERROR Regex syntax error
}