Add a regex crate to the Rust distribution.

Also adds a regex_macros crate, which provides natively compiled
regular expressions with a syntax extension.

Closes #3591.

RFC: 0007-regexps
Andrew Gallant 2014-04-25 00:27:24 -04:00
parent 66486518d5
commit b8b7484703
23 changed files with 11102 additions and 2 deletions


@@ -51,8 +51,8 @@
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
uuid serialize sync getopts collections num test time rand \
-             workcache url log
-HOST_CRATES := syntax rustc rustdoc fourcc hexfloat
+             workcache url log regex
+HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
TOOLS := compiletest rustdoc rustc
@@ -84,6 +84,8 @@ DEPS_rand := std
DEPS_url := std collections
DEPS_workcache := std serialize collections log
DEPS_log := std sync
DEPS_regex := std collections
DEPS_regex_macros = syntax std regex
TOOL_DEPS_compiletest := test green rustuv getopts
TOOL_DEPS_rustdoc := rustdoc native


@@ -19,6 +19,7 @@ Source layout:
| `libfourcc/` | Data format identifier library |
| `libgetopts/` | Get command-line-options library |
| `libglob/` | Unix glob patterns library |
| `libregex/` | Regular expressions |
| `libsemver/` | Rust's semantic versioning library |
| `libserialize/` | Encode-Decode types library |
| `libsync/` | Concurrency mechanisms and primitives |


@@ -41,6 +41,7 @@ li {list-style-type: none; }
* [The `native` 1:1 threading runtime](native/index.html)
* [The `num` arbitrary precision numerics library](num/index.html)
* [The `rand` library for random numbers and distributions](rand/index.html)
* [The `regex` library for regular expressions](regex/index.html)
* [The `rustc` compiler](rustc/index.html)
* [The `rustuv` M:N I/O library](rustuv/index.html)
* [The `semver` version collation library](semver/index.html)

109
src/etc/regex-match-tests.py Executable file

@@ -0,0 +1,109 @@
#!/usr/bin/env python2
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
from __future__ import absolute_import, division, print_function
import argparse
import datetime
import os.path as path
def print_tests(tests):
print('\n'.join([test_tostr(t) for t in tests]))
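# read_tests parses an AT&T POSIX .dat test file. Each tab-separated line
# looks roughly like this (illustrative, not a real entry):
#   E   a(b)c   abc   (0,3)(1,2)
# where the last field is either a ')('-separated list of spans or NOMATCH.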
def read_tests(f):
basename, _ = path.splitext(path.basename(f))
tests = []
for lineno, line in enumerate(open(f), 1):
fields = filter(None, map(str.strip, line.split('\t')))
if not (4 <= len(fields) <= 5) \
or 'E' not in fields[0] or fields[0][0] == '#':
continue
opts, pat, text, sgroups = fields[0:4]
groups = [] # groups as integer ranges
if sgroups == 'NOMATCH':
groups = [None]
elif ',' in sgroups:
noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
for g in noparen:
s, e = map(str.strip, g.split(','))
if s == '?' and e == '?':
groups.append(None)
else:
groups.append((int(s), int(e)))
else:
# This skips tests that should result in an error.
# There aren't many, so I think we can just capture those
# manually. Possibly fix this in future.
continue
if pat == 'SAME':
pat = tests[-1][1]
if '$' in opts:
pat = pat.decode('string_escape')
text = text.decode('string_escape')
if 'i' in opts:
pat = '(?i)%s' % pat
name = '%s_%d' % (basename, lineno)
tests.append((name, pat, text, groups))
return tests
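# test_tostr renders a single test as one line of Rust source, e.g.
# (illustrative): mat!(match_basic_3, r"a+", r"aaa", Some((0, 3)))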
def test_tostr(t):
lineno, pat, text, groups = t
options = map(group_tostr, groups)
return 'mat!(match_%s, r"%s", r"%s", %s)' \
% (lineno, pat, '' if text == "NULL" else text, ', '.join(options))
def group_tostr(g):
if g is None:
return 'None'
else:
return 'Some((%d, %d))' % (g[0], g[1])
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Generate match tests from an AT&T POSIX test file.')
aa = parser.add_argument
aa('files', nargs='+',
help='A list of AT&T POSIX .dat test files. See src/libregex/testdata')
args = parser.parse_args()
tests = []
for f in args.files:
tests += read_tests(f)
tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-tidy-linelength
// DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests.py'
// on {date}.
'''
print(tpl.format(date=str(datetime.datetime.now())))
for f in args.files:
print('// Tests from %s' % path.basename(f))
print_tests(read_tests(f))
print('')

183
src/etc/regex-unicode-tables.py Executable file

@@ -0,0 +1,183 @@
#!/usr/bin/env python2
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
'Lm': ['L'], 'Lo': ['L'],
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
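# as_4byte_uni renders a codepoint as a Rust '\U00000000'-style escape;
# e.g. (illustrative) as_4byte_uni(0x41) produces the text \U00000041.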
def as_4byte_uni(n):
s = hex(n)[2:]
return '\\U%s%s' % ('0' * (8 - len(s)), s)
def expand_cat(c):
return expanded_categories.get(c, []) + [c]
def is_valid_unicode(n):
return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
def read_cats(f):
assigned = defaultdict(list)
for row in csv.reader(f, delimiter=';'):
(hex, cats) = (int(row[0], 16), expand_cat(row[2]))
if not is_valid_unicode(hex):
continue
for cat in cats:
assigned[cat].append(hex)
return assigned
def read_scripts(f):
assigned = defaultdict(list)
for line in f:
line = line.strip()
if not line or line.startswith('#'):
continue
hexes, name = map(str.strip, line.split(';'))[:2]
name = name[:name.index('#')].strip()
if '..' not in hexes:
hex = int(hexes, 16)
if is_valid_unicode(hex):
assigned[name].append(hex)
else:
hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..'))
for hex in xrange(hex1, hex2 + 1):
if is_valid_unicode(hex):
assigned[name].append(hex)
return assigned
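# group collapses a list of codepoints into contiguous ranges;
# e.g. (illustrative) group([0x41, 0x42, 0x43, 0x61]) yields
# [(0x41, 0x43), (0x61, 0x61)].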
def group(letters):
letters = sorted(set(letters))
grouped = []
cur_start = letters.pop(0)
cur_end = cur_start
for letter in letters:
assert letter > cur_end, \
'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter))
if letter == cur_end + 1:
cur_end = letter
else:
grouped.append((cur_start, cur_end))
cur_start, cur_end = letter, letter
grouped.append((cur_start, cur_end))
return grouped
def ranges_to_rust(rs):
rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
return ',\n '.join(rs)
def groups_to_rust(groups):
rust_groups = []
for group_name in sorted(groups):
rust_groups.append('("%s", &[\n %s\n ]),'
% (group_name, ranges_to_rust(groups[group_name])))
return '\n'.join(rust_groups)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Generate Unicode character class tables.')
aa = parser.add_argument
aa('--local', action='store_true',
help='When set, Scripts.txt and UnicodeData.txt will be read from '
'the CWD.')
aa('--base-url', type=str, default=BASE_URL,
help='The base URL to use for downloading Unicode data files.')
args = parser.parse_args()
if args.local:
cats = read_cats(open(DATA))
scripts = read_scripts(open(SCRIPTS))
else:
cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))
# Get Rust code for all Unicode general categories and scripts.
combined = dict(cats, **scripts)
unigroups = groups_to_rust({k: group(letters)
for k, letters in combined.items()})
# Now get Perl character classes that are Unicode friendly.
perld = range(ord('0'), ord('9') + 1)
dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))
perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
sgroups = ranges_to_rust(group(perls + cats['Z'][:]))
low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
perlw = [ord('_')] + perld + low + up
wgroups = ranges_to_rust(group(perlw + cats['L'][:]))
tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'src/etc/regex-unicode-tables.py'
// on {date}.
use parse::{{Class, NamedClasses}};
pub static UNICODE_CLASSES: NamedClasses = &[
{groups}
];
pub static PERLD: Class = &[
{dgroups}
];
pub static PERLS: Class = &[
{sgroups}
];
pub static PERLW: Class = &[
{wgroups}
];
'''
now = datetime.datetime.now()
print(tpl.format(date=str(now), groups=unigroups,
dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))

274
src/libregex/compile.rs Normal file

@@ -0,0 +1,274 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// Enable this to squash warnings due to exporting pieces of the representation
// for use with the regex! macro. See lib.rs for explanation.
#![allow(visible_private_types)]
use std::cmp;
use std::iter;
use parse;
use parse::{
Flags, FLAG_EMPTY,
Nothing, Literal, Dot, Class, Begin, End, WordBoundary, Capture, Cat, Alt,
Rep,
ZeroOne, ZeroMore, OneMore,
};
type InstIdx = uint;
#[deriving(Show, Clone)]
pub enum Inst {
// When a Match instruction is executed, the current thread is successful.
Match,
// The OneChar instruction matches a literal character.
// The flags indicate whether to do a case insensitive match.
OneChar(char, Flags),
// The CharClass instruction tries to match one input character against
// the range of characters given.
// The flags indicate whether to do a case insensitive match and whether
// the character class is negated or not.
CharClass(Vec<(char, char)>, Flags),
// Matches any character except new lines.
// The flags indicate whether to include the '\n' character.
Any(Flags),
// Matches the beginning of the string, consumes no characters.
// The flags indicate whether it matches if the preceding character
// is a new line.
EmptyBegin(Flags),
// Matches the end of the string, consumes no characters.
// The flags indicate whether it matches if the following character
// is a new line.
EmptyEnd(Flags),
// Matches a word boundary (\w on one side and \W, \A, or \z on the other),
// and consumes no character.
// The flags indicate whether this matches a word boundary or something
// that isn't a word boundary.
EmptyWordBoundary(Flags),
// Saves the current position in the input string to the Nth save slot.
Save(uint),
// Jumps to the instruction at the index given.
Jump(InstIdx),
// Jumps to the instruction at the first index given. If that leads to
// a failing state, then the instruction at the second index given is
// tried.
Split(InstIdx, InstIdx),
}
/// Program represents a compiled regular expression. Once an expression is
/// compiled, its representation is immutable and will never change.
///
/// All of the data in a compiled expression is wrapped in "MaybeStatic" or
/// "MaybeOwned" types so that a `Program` can be represented as static data.
/// (This makes it convenient and efficient for use with the `regex!` macro.)
#[deriving(Clone)]
pub struct Program {
/// A sequence of instructions.
pub insts: Vec<Inst>,
/// If the regular expression requires a literal prefix in order to have a
/// match, that prefix is stored here. (It's used in the VM to implement
/// an optimization.)
pub prefix: ~str,
}
impl Program {
/// Compiles a Regex given its AST.
pub fn new(ast: ~parse::Ast) -> (Program, ~[Option<~str>]) {
let mut c = Compiler {
insts: Vec::with_capacity(100),
names: Vec::with_capacity(10),
};
c.insts.push(Save(0));
c.compile(ast);
c.insts.push(Save(1));
c.insts.push(Match);
// Try to discover a literal string prefix.
// This is a bit hacky since we have to skip over the initial
// 'Save' instruction.
let mut pre = StrBuf::with_capacity(5);
for i in iter::range(1, c.insts.len()) {
match *c.insts.get(i) {
OneChar(c, FLAG_EMPTY) => pre.push_char(c),
_ => break
}
}
let names = c.names.as_slice().into_owned();
let prog = Program {
insts: c.insts,
prefix: pre.into_owned(),
};
(prog, names)
}
/// Returns the total number of capture groups in the regular expression.
/// This includes the zeroth capture.
pub fn num_captures(&self) -> uint {
let mut n = 0;
for inst in self.insts.iter() {
match *inst {
Save(c) => n = cmp::max(n, c+1),
_ => {}
}
}
// There's exactly 2 Save slots for every capture.
n / 2
}
}
struct Compiler<'r> {
insts: Vec<Inst>,
names: Vec<Option<~str>>,
}
// The compiler implemented here is extremely simple. Most of the complexity
// in this crate is in the parser or the VM.
// The only tricky thing here is patching jump/split instructions to point to
// the right instruction.
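//
// For example (an illustrative sketch of the layout, following the scheme
// used below), the expression `a|b` compiles to roughly:
//
//   0: Save(0)
//   1: Split(2, 4)
//   2: OneChar('a', FLAG_EMPTY)
//   3: Jump(5)
//   4: OneChar('b', FLAG_EMPTY)
//   5: Save(1)
//   6: Match
//
// where the targets of the Split and Jump are patched in only after both
// branches have been emitted.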
impl<'r> Compiler<'r> {
fn compile(&mut self, ast: ~parse::Ast) {
match ast {
~Nothing => {},
~Literal(c, flags) => self.push(OneChar(c, flags)),
~Dot(nl) => self.push(Any(nl)),
~Class(ranges, flags) =>
self.push(CharClass(ranges, flags)),
~Begin(flags) => self.push(EmptyBegin(flags)),
~End(flags) => self.push(EmptyEnd(flags)),
~WordBoundary(flags) => self.push(EmptyWordBoundary(flags)),
~Capture(cap, name, x) => {
let len = self.names.len();
if cap >= len {
self.names.grow(10 + cap - len, &None)
}
*self.names.get_mut(cap) = name;
self.push(Save(2 * cap));
self.compile(x);
self.push(Save(2 * cap + 1));
}
~Cat(xs) => {
for x in xs.move_iter() {
self.compile(x)
}
}
~Alt(x, y) => {
let split = self.empty_split(); // push: split 0, 0
let j1 = self.insts.len();
self.compile(x); // push: insts for x
let jmp = self.empty_jump(); // push: jmp 0
let j2 = self.insts.len();
self.compile(y); // push: insts for y
let j3 = self.insts.len();
self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2
self.set_jump(jmp, j3); // jmp 0 -> jmp j3
}
~Rep(x, ZeroOne, g) => {
let split = self.empty_split();
let j1 = self.insts.len();
self.compile(x);
let j2 = self.insts.len();
if g.is_greedy() {
self.set_split(split, j1, j2);
} else {
self.set_split(split, j2, j1);
}
}
~Rep(x, ZeroMore, g) => {
let j1 = self.insts.len();
let split = self.empty_split();
let j2 = self.insts.len();
self.compile(x);
let jmp = self.empty_jump();
let j3 = self.insts.len();
self.set_jump(jmp, j1);
if g.is_greedy() {
self.set_split(split, j2, j3);
} else {
self.set_split(split, j3, j2);
}
}
~Rep(x, OneMore, g) => {
let j1 = self.insts.len();
self.compile(x);
let split = self.empty_split();
let j2 = self.insts.len();
if g.is_greedy() {
self.set_split(split, j1, j2);
} else {
self.set_split(split, j2, j1);
}
}
}
}
/// Appends the given instruction to the program.
#[inline]
fn push(&mut self, x: Inst) {
self.insts.push(x)
}
/// Appends an *empty* `Split` instruction to the program and returns
/// the index of that instruction. (The index can then be used to "patch"
/// the actual locations of the split in later.)
#[inline]
fn empty_split(&mut self) -> InstIdx {
self.insts.push(Split(0, 0));
self.insts.len() - 1
}
/// Sets the left and right locations of a `Split` instruction at index
/// `i` to `pc1` and `pc2`, respectively.
/// If the instruction at index `i` isn't a `Split` instruction, then
/// `fail!` is called.
#[inline]
fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) {
let split = self.insts.get_mut(i);
match *split {
Split(_, _) => *split = Split(pc1, pc2),
_ => fail!("BUG: Invalid split index."),
}
}
/// Appends an *empty* `Jump` instruction to the program and returns the
/// index of that instruction.
#[inline]
fn empty_jump(&mut self) -> InstIdx {
self.insts.push(Jump(0));
self.insts.len() - 1
}
/// Sets the location of a `Jump` instruction at index `i` to `pc`.
/// If the instruction at index `i` isn't a `Jump` instruction, then
/// `fail!` is called.
#[inline]
fn set_jump(&mut self, i: InstIdx, pc: InstIdx) {
let jmp = self.insts.get_mut(i);
match *jmp {
Jump(_) => *jmp = Jump(pc),
_ => fail!("BUG: Invalid jump index."),
}
}
}

425
src/libregex/lib.rs Normal file

@@ -0,0 +1,425 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! This crate provides a native implementation of regular expressions that is
//! heavily based on RE2 both in syntax and in implementation. Notably,
//! backreferences and arbitrary lookahead/lookbehind assertions are not
//! provided. In return, regular expression searching provided by this package
//! has excellent worst case performance. The specific syntax supported is
//! documented further down.
//!
//! This crate's documentation provides some simple examples, describes Unicode
//! support and exhaustively lists the supported syntax. For more specific
//! details on the API, please see the documentation for the `Regex` type.
//!
//! # First example: find a date
//!
//! General use of regular expressions in this package involves compiling an
//! expression and then using it to search, split or replace text. For example,
//! to confirm that some text resembles a date:
//!
//! ```rust
//! use regex::Regex;
//! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") {
//! Ok(re) => re,
//! Err(err) => fail!("{}", err),
//! };
//! assert_eq!(re.is_match("2014-01-01"), true);
//! ```
//!
//! Notice the use of the `^` and `$` anchors. In this crate, every expression
//! is executed with an implicit `.*?` at the beginning and end, which allows
//! it to match anywhere in the text. Anchors can be used to ensure that the
//! full text matches an expression.
//!
//! This example also demonstrates the utility of raw strings in Rust, which
//! are just like regular strings except they are prefixed with an `r` and do
//! not process any escape sequences. For example, `"\\d"` is the same
//! expression as `r"\d"`.
//!
//! # The `regex!` macro
//!
//! Rust's compile time meta-programming facilities provide a way to write a
//! `regex!` macro which compiles regular expressions *when your program
//! compiles*. Said differently, if you only use `regex!` to build regular
//! expressions in your program, then your program cannot compile with an
//! invalid regular expression. Moreover, the `regex!` macro compiles the
//! given expression to native Rust code, which makes it much faster for
//! searching text.
//!
//! Since `regex!` provides compiled regular expressions that are both safer
//! and faster to use, you should use them whenever possible. The only
//! requirement for using them is that you have a string literal corresponding
//! to your expression. Otherwise, it is indistinguishable from an expression
//! compiled at runtime with `Regex::new`.
//!
//! To use the `regex!` macro, you must enable the `phase` feature and import
//! the `regex_macros` crate as a syntax extension:
//!
//! ```rust
//! #![feature(phase)]
//! #[phase(syntax)]
//! extern crate regex_macros;
//! extern crate regex;
//!
//! fn main() {
//! let re = regex!(r"^\d{4}-\d{2}-\d{2}$");
//! assert_eq!(re.is_match("2014-01-01"), true);
//! }
//! ```
//!
//! There are a few things worth mentioning about using the `regex!` macro.
//! Firstly, the `regex!` macro *only* accepts string *literals*.
//! Secondly, the `regex` crate *must* be linked with the name `regex` since
//! the generated code depends on finding symbols in the `regex` crate.
//!
//! The only downside of using the `regex!` macro is that it can increase the
//! size of your program's binary since it generates specialized Rust code.
//! The extra size probably won't be significant for a small number of
//! expressions, but 100+ calls to `regex!` will probably result in a
//! noticeably bigger binary.
//!
//! # Example: iterating over capture groups
//!
//! This crate provides convenient iterators for matching an expression
//! repeatedly against a search string to find successive non-overlapping
//! matches. For example, to find all dates in a string and be able to access
//! them by their component pieces:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})");
//! let text = "2012-03-14, 2013-01-01 and 2014-07-05";
//! for cap in re.captures_iter(text) {
//! println!("Month: {} Day: {} Year: {}", cap.at(2), cap.at(3), cap.at(1));
//! }
//! // Output:
//! // Month: 03 Day: 14 Year: 2012
//! // Month: 01 Day: 01 Year: 2013
//! // Month: 07 Day: 05 Year: 2014
//! # }
//! ```
//!
//! Notice that the year is in the capture group indexed at `1`. This is
//! because the *entire match* is stored in the capture group at index `0`.
//!
//! # Example: replacement with named capture groups
//!
//! Building on the previous example, perhaps we'd like to rearrange the date
//! formats. This can be done with text replacement. But to make the code
//! clearer, we can *name* our capture groups and use those names as variables
//! in our replacement text:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})");
//! let before = "2012-03-14, 2013-01-01 and 2014-07-05";
//! let after = re.replace_all(before, "$m/$d/$y");
//! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014");
//! # }
//! ```
//!
//! The `replace` methods are actually polymorphic in the replacement, which
//! provides more flexibility than is seen here. (See the documentation for
//! `Regex::replace` for more details.)
//!
//! # Pay for what you use
//!
//! With respect to searching text with a regular expression, there are three
//! questions that can be asked:
//!
//! 1. Does the text match this expression?
//! 2. If so, where does it match?
//! 3. Where are the submatches?
//!
//! Generally speaking, this crate could provide a function to answer only #3,
//! which would subsume #1 and #2 automatically. However, it can be
//! significantly more expensive to compute the location of submatches, so it's
//! best not to do it if you don't need to.
//!
//! Therefore, only use what you need. For example, don't use `find` if you
//! only need to test if an expression matches a string. (Use `is_match`
//! instead.)
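//!
//! For instance (a small illustrative sketch; the text is made up):
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"\bcat\b");
//! // Question 1 only: the cheapest check.
//! assert!(re.is_match("the cat sat"));
//! // Question 2: the location of the leftmost-first match.
//! assert_eq!(re.find("the cat sat"), Some((4, 7)));
//! # }
//! ```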
//!
//! # Unicode
//!
//! This implementation executes regular expressions **only** on sequences of
//! UTF8 codepoints while exposing match locations as byte indices.
//!
//! Currently, only naive case folding is supported. Namely, when matching
//! case insensitively, the characters are first converted to their uppercase
//! forms and then compared.
//!
//! Regular expressions themselves are also **only** interpreted as a sequence
//! of UTF8 codepoints. This means you can embed Unicode characters directly
//! into your expression:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"(?i)Δ+");
//! assert_eq!(re.find("ΔδΔ"), Some((0, 6)));
//! # }
//! ```
//!
//! Finally, Unicode general categories and scripts are available as character
//! classes. For example, you can match a sequence of numerals, Greek or
//! Cherokee letters:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+");
//! assert_eq!(re.find("abcΔβγδⅡxyz"), Some((3, 23)));
//! # }
//! ```
//!
//! # Syntax
//!
//! The syntax supported in this crate is almost in an exact correspondence
//! with the syntax supported by RE2.
//!
//! ## Matching one character
//!
//! <pre class="rust">
//! . any character except new line (includes new line with s flag)
//! [xyz] A character class matching either x, y or z.
//! [^xyz] A character class matching any character except x, y and z.
//! [a-z] A character class matching any character in range a-z.
//! \d Perl character class ([0-9])
//! \D Negated Perl character class ([^0-9])
//! [:alpha:] ASCII character class ([A-Za-z])
//! [:^alpha:] Negated ASCII character class ([^A-Za-z])
//! \pN One letter name Unicode character class
//! \p{Greek} Unicode character class (general category or script)
//! \PN Negated one letter name Unicode character class
//! \P{Greek} negated Unicode character class (general category or script)
//! </pre>
//!
//! Any named character class may appear inside a bracketed `[...]` character
//! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral
//! character.
//!
//! ## Composites
//!
//! <pre class="rust">
//! xy concatenation (x followed by y)
//! x|y alternation (x or y, prefer x)
//! </pre>
//!
//! ## Repetitions
//!
//! <pre class="rust">
//! x* zero or more of x (greedy)
//! x+ one or more of x (greedy)
//! x? zero or one of x (greedy)
//! x*? zero or more of x (ungreedy)
//! x+? one or more of x (ungreedy)
//! x?? zero or one of x (ungreedy)
//! x{n,m} at least n x and at most m x (greedy)
//! x{n,} at least n x (greedy)
//! x{n} exactly n x
//! x{n,m}? at least n x and at most m x (ungreedy)
//! x{n,}? at least n x (ungreedy)
//! x{n}? exactly n x
//! </pre>
//!
//! ## Empty matches
//!
//! <pre class="rust">
//! ^ the beginning of text (or start-of-line with multi-line mode)
//! $ the end of text (or end-of-line with multi-line mode)
//! \A only the beginning of text (even with multi-line mode enabled)
//! \z only the end of text (even with multi-line mode enabled)
//! \b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
//! \B not a Unicode word boundary
//! </pre>
//!
//! ## Grouping and flags
//!
//! <pre class="rust">
//! (exp) numbered capture group (indexed by opening parenthesis)
//! (?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
//! (?:exp) non-capturing group
//! (?flags) set flags within current group
//! (?flags:exp) set flags for exp (non-capturing)
//! </pre>
//!
//! Flags are each a single character. For example, `(?x)` sets the flag `x`
//! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
//! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
//! the `x` flag and clears the `y` flag.
//!
//! All flags are by default disabled. They are:
//!
//! <pre class="rust">
//! i case insensitive
//! m multi-line mode: ^ and $ match begin/end of line
//! s allow . to match \n
//! U swap the meaning of x* and x*?
//! </pre>
//!
//! Here's an example that matches case insensitively for only part of the
//! expression:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"(?i)a+(?-i)b+");
//! let cap = re.captures("AaAaAbbBBBb").unwrap();
//! assert_eq!(cap.at(0), "AaAaAbb");
//! # }
//! ```
//!
//! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
//! `b`.
//!
//! ## Escape sequences
//!
//! <pre class="rust">
//! \* literal *, works for any punctuation character: \.+*?()|[]{}^$
//! \a bell (\x07)
//! \f form feed (\x0C)
//! \t horizontal tab
//! \n new line
//! \r carriage return
//! \v vertical tab (\x0B)
//! \123 octal character code (up to three digits)
//! \x7F hex character code (exactly two digits)
//! \x{10FFFF} any hex character code corresponding to a valid UTF8 codepoint
//! </pre>
//!
//! ## Perl character classes (Unicode friendly)
//!
//! <pre class="rust">
//! \d digit ([0-9] + \p{Nd})
//! \D not digit
//! \s whitespace ([\t\n\f\r ] + \p{Z})
//! \S not whitespace
//! \w word character ([0-9A-Za-z_] + \p{L})
//! \W not word character
//! </pre>
//!
//! ## ASCII character classes
//!
//! <pre class="rust">
//! [:alnum:] alphanumeric ([0-9A-Za-z])
//! [:alpha:] alphabetic ([A-Za-z])
//! [:ascii:] ASCII ([\x00-\x7F])
//! [:blank:] blank ([\t ])
//! [:cntrl:] control ([\x00-\x1F\x7F])
//! [:digit:] digits ([0-9])
//! [:graph:] graphical ([!-~])
//! [:lower:] lower case ([a-z])
//! [:print:] printable ([ -~])
//! [:punct:] punctuation ([!-/:-@[-`{-~])
//! [:space:] whitespace ([\t\n\v\f\r ])
//! [:upper:] upper case ([A-Z])
//! [:word:] word characters ([0-9A-Za-z_])
//! [:xdigit:] hex digit ([0-9A-Fa-f])
//! </pre>
//!
//! # Untrusted input
//!
//! There are two factors to consider here: untrusted regular expressions and
//! untrusted search text.
//!
//! Currently, there are no counter-measures in place to prevent a malicious
//! user from writing an expression that may use a lot of resources. One such
//! example is to repeat counted repetitions: `((a{100}){100}){100}` will try
//! to repeat the `a` instruction `100^3` times. Essentially, this means it's
//! very easy for an attacker to exhaust your system's memory if they are
//! allowed to execute arbitrary regular expressions. A possible solution to
//! this is to impose a hard limit on the size of a compiled expression, but it
//! does not yet exist.
//!
//! The story is a bit better with untrusted search text, since this crate's
//! implementation provides `O(nm)` search where `n` is the number of
//! characters in the search text and `m` is the number of instructions in a
//! compiled expression.
#![crate_id = "regex#0.11-pre"]
#![crate_type = "rlib"]
#![crate_type = "dylib"]
#![experimental]
#![license = "MIT/ASL2"]
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
html_root_url = "http://static.rust-lang.org/doc/master")]
#![feature(macro_rules, phase)]
#![deny(missing_doc)]
extern crate collections;
#[cfg(test)]
extern crate stdtest = "test";
#[cfg(test)]
extern crate rand;
// During tests, this links with the `regex` crate so that the `regex!` macro
// can be tested.
#[cfg(test)]
extern crate regex;
pub use parse::Error;
pub use re::{Regex, Captures, SubCaptures, SubCapturesPos};
pub use re::{FindCaptures, FindMatches};
pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN};
pub use re::{quote, is_match};
mod compile;
mod parse;
mod re;
mod vm;
#[cfg(test)]
mod test;
/// The `program` module exists to support the `regex!` macro. Do not use.
#[doc(hidden)]
pub mod native {
// Exporting this stuff is bad form, but it's necessary for two reasons.
// Firstly, the `regex!` syntax extension is in a different crate and
// requires access to the representation of a regex (particularly the
// instruction set) in order to compile to native Rust. This could be
// mitigated if `regex!` was defined in the same crate, but this has
// undesirable consequences (such as requiring a dependency on
// `libsyntax`).
//
// Secondly, the code generated by `regex!` must *also* be able
// to access various functions in this crate to reduce code duplication
// and to provide a value with precisely the same `Regex` type in this
// crate. This, AFAIK, is impossible to mitigate.
//
// On the bright side, `rustdoc` lets us hide this from the public API
// documentation.
pub use compile::{
Program,
OneChar, CharClass, Any, Save, Jump, Split,
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
};
pub use parse::{
FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL,
FLAG_SWAP_GREED, FLAG_NEGATED,
};
pub use re::{Dynamic, Native};
pub use vm::{
MatchKind, Exists, Location, Submatches,
StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
CharReader, find_prefix,
};
}

1028
src/libregex/parse.rs Normal file

File diff suppressed because it is too large

870
src/libregex/re.rs Normal file

@@ -0,0 +1,870 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use collections::HashMap;
use std::fmt;
use std::from_str::from_str;
use std::str::{MaybeOwned, Owned, Slice};
use compile::Program;
use parse;
use vm;
use vm::{CaptureLocs, MatchKind, Exists, Location, Submatches};
/// Escapes all regular expression meta characters in `text` so that it may be
/// safely used in a regular expression as a literal string.
pub fn quote(text: &str) -> ~str {
let mut quoted = StrBuf::with_capacity(text.len());
for c in text.chars() {
if parse::is_punct(c) {
quoted.push_char('\\')
}
quoted.push_char(c);
}
quoted.into_owned()
}
/// Tests if the given regular expression matches somewhere in the text given.
///
/// If there was a problem compiling the regular expression, an error is
/// returned.
///
/// To find submatches, split or replace text, you'll need to compile an
/// expression first.
///
/// Note that you should prefer the `regex!` macro when possible. For example,
/// `regex!("...").is_match("...")`.
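///
/// # Example
///
/// A quick illustrative check (pattern and text are made up):
///
/// ```rust
/// assert!(regex::is_match(r"^\d+$", "2014").unwrap());
/// ```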
pub fn is_match(regex: &str, text: &str) -> Result<bool, parse::Error> {
Regex::new(regex).map(|r| r.is_match(text))
}
/// Regex is a compiled regular expression, represented as either a sequence
/// of bytecode instructions (dynamic) or as a specialized Rust function
/// (native). It can be used to search, split
/// or replace text. All searching is done with an implicit `.*?` at the
/// beginning and end of an expression. To force an expression to match the
/// whole string (or a prefix or a suffix), you must use an anchor like `^` or
/// `$` (or `\A` and `\z`).
///
/// While this crate will handle Unicode strings (whether in the regular
/// expression or in the search text), all positions returned are **byte
/// indices**. Every byte index is guaranteed to be at a UTF8 codepoint
/// boundary.
///
/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
/// compiled regular expression and text to search, respectively.
///
/// The only methods that allocate new strings are the string replacement
/// methods. All other methods (searching and splitting) return borrowed
/// pointers into the string given.
///
/// # Examples
///
/// Find the location of a US phone number:
///
/// ```rust
/// # use regex::Regex;
/// let re = match Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}") {
/// Ok(re) => re,
/// Err(err) => fail!("{}", err),
/// };
/// assert_eq!(re.find("phone: 111-222-3333"), Some((7, 19)));
/// ```
///
/// You can also use the `regex!` macro to compile a regular expression when
/// you compile your program:
///
/// ```rust
/// #![feature(phase)]
/// extern crate regex;
/// #[phase(syntax)] extern crate regex_macros;
///
/// fn main() {
/// let re = regex!(r"\d+");
/// assert_eq!(re.find("123 abc"), Some((0, 3)));
/// }
/// ```
///
/// Given an incorrect regular expression, `regex!` will cause the Rust
/// compiler to produce a compile time error.
/// Note that `regex!` will compile the expression to native Rust code, which
/// makes it much faster when searching text.
/// More details about the `regex!` macro can be found in the `regex` crate
/// documentation.
#[deriving(Clone)]
#[allow(visible_private_types)]
pub struct Regex {
/// The representation of `Regex` is exported to support the `regex!`
/// syntax extension. Do not rely on it.
///
/// See the comments for the `program` module in `lib.rs` for a more
/// detailed explanation for what `regex!` requires.
#[doc(hidden)]
pub original: ~str,
#[doc(hidden)]
pub names: ~[Option<~str>],
#[doc(hidden)]
pub p: MaybeNative,
}
impl fmt::Show for Regex {
/// Shows the original regular expression.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f.buf, "{}", self.original)
}
}
pub enum MaybeNative {
Dynamic(Program),
Native(fn(MatchKind, &str, uint, uint) -> Vec<Option<uint>>),
}
impl Clone for MaybeNative {
fn clone(&self) -> MaybeNative {
match *self {
Dynamic(ref p) => Dynamic(p.clone()),
Native(fp) => Native(fp),
}
}
}
impl Regex {
/// Compiles a dynamic regular expression. Once compiled, it can be
/// used repeatedly to search, split or replace text in a string.
///
/// When possible, you should prefer the `regex!` macro since it is
/// safer and always faster.
///
/// If an invalid expression is given, then an error is returned.
pub fn new(re: &str) -> Result<Regex, parse::Error> {
let ast = try!(parse::parse(re));
let (prog, names) = Program::new(ast);
Ok(Regex { original: re.to_owned(), names: names, p: Dynamic(prog) })
}
/// Returns true if and only if the regex matches the string given.
///
/// # Example
///
/// Test if some text contains at least one word with exactly 13
/// characters:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let text = "I categorically deny having triskaidekaphobia.";
/// let matched = regex!(r"\b\w{13}\b").is_match(text);
/// assert!(matched);
/// # }
/// ```
pub fn is_match(&self, text: &str) -> bool {
has_match(&exec(self, Exists, text))
}
/// Returns the start and end byte range of the leftmost-first match in
/// `text`. If no match exists, then `None` is returned.
///
/// Note that this should only be used if you want to discover the position
/// of the match. Testing the existence of a match is faster if you use
/// `is_match`.
///
/// # Example
///
/// Find the start and end location of every word with exactly 13
/// characters:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let text = "I categorically deny having triskaidekaphobia.";
/// let pos = regex!(r"\b\w{13}\b").find(text);
/// assert_eq!(pos, Some((2, 15)));
/// # }
/// ```
pub fn find(&self, text: &str) -> Option<(uint, uint)> {
let caps = exec(self, Location, text);
if has_match(&caps) {
Some((caps.get(0).unwrap(), caps.get(1).unwrap()))
} else {
None
}
}
/// Returns an iterator for each successive non-overlapping match in
/// `text`, returning the start and end byte indices with respect to
/// `text`.
///
/// # Example
///
/// Find the start and end location of the first word with exactly 13
/// characters:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let text = "Retroactively relinquishing remunerations is reprehensible.";
/// for pos in regex!(r"\b\w{13}\b").find_iter(text) {
/// println!("{}", pos);
/// }
/// // Output:
/// // (0, 13)
/// // (14, 27)
/// // (28, 41)
/// // (45, 58)
/// # }
/// ```
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
FindMatches {
re: self,
search: text,
last_end: 0,
last_match: None,
}
}
/// Returns the capture groups corresponding to the leftmost-first
/// match in `text`. Capture group `0` always corresponds to the entire
/// match. If no match is found, then `None` is returned.
///
/// You should only use `captures` if you need access to submatches.
/// Otherwise, `find` is faster for discovering the location of the overall
/// match.
///
/// # Examples
///
/// Say you have some text with movie names and their release years,
/// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
/// looking like that, while also extracting the movie name and its release
/// year separately.
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"'([^']+)'\s+\((\d{4})\)");
/// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
/// let caps = re.captures(text).unwrap();
/// assert_eq!(caps.at(1), "Citizen Kane");
/// assert_eq!(caps.at(2), "1941");
/// assert_eq!(caps.at(0), "'Citizen Kane' (1941)");
/// # }
/// ```
///
/// Note that the full match is at capture group `0`. Each subsequent
/// capture group is indexed by the order of its opening `(`.
///
/// We can make this example a bit clearer by using *named* capture groups:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
/// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
/// let caps = re.captures(text).unwrap();
/// assert_eq!(caps.name("title"), "Citizen Kane");
/// assert_eq!(caps.name("year"), "1941");
/// assert_eq!(caps.at(0), "'Citizen Kane' (1941)");
/// # }
/// ```
///
/// Here we name the capture groups, which we can access with the `name`
/// method. Note that the named capture groups are still accessible with
/// `at`.
///
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `at(0)`.
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
let caps = exec(self, Submatches, text);
Captures::new(self, text, caps)
}
/// Returns an iterator over all the non-overlapping capture groups matched
/// in `text`. This is operationally the same as `find_iter` (except it
/// yields information about submatches).
///
/// # Example
///
/// We can use this to find all movie titles and their release years in
/// some text, where the movie is formatted like "'Title' (xxxx)":
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)");
/// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
/// for caps in re.captures_iter(text) {
/// println!("Movie: {}, Released: {}", caps.name("title"), caps.name("year"));
/// }
/// // Output:
/// // Movie: Citizen Kane, Released: 1941
/// // Movie: The Wizard of Oz, Released: 1939
/// // Movie: M, Released: 1931
/// # }
/// ```
pub fn captures_iter<'r, 't>(&'r self, text: &'t str)
-> FindCaptures<'r, 't> {
FindCaptures {
re: self,
search: text,
last_match: None,
last_end: 0,
}
}
/// Returns an iterator of substrings of `text` delimited by a match
/// of the regular expression.
/// Namely, each element of the iterator corresponds to text that *isn't*
/// matched by the regular expression.
///
/// This method will *not* copy the text given.
///
/// # Example
///
/// To split a string delimited by arbitrary amounts of spaces or tabs:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"[ \t]+");
/// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
/// assert_eq!(fields, vec!("a", "b", "c", "d", "e"));
/// # }
/// ```
pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> {
RegexSplits {
finder: self.find_iter(text),
last: 0,
}
}
/// Returns an iterator of at most `limit` substrings of `text` delimited
/// by a match of the regular expression. (A `limit` of `0` will return no
/// substrings.)
/// Namely, each element of the iterator corresponds to text that *isn't*
/// matched by the regular expression.
/// The remainder of the string that is not split will be the last element
/// in the iterator.
///
/// This method will *not* copy the text given.
///
/// # Example
///
/// Get the first two words in some text:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"\W+");
/// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
/// assert_eq!(fields, vec!("Hey", "How", "are you?"));
/// # }
/// ```
pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: uint)
-> RegexSplitsN<'r, 't> {
RegexSplitsN {
splits: self.split(text),
cur: 0,
limit: limit,
}
}
/// Replaces the leftmost-first match with the replacement provided.
/// The replacement can be a regular string (where `$N` and `$name` are
/// expanded to match capture groups) or a function that takes the matches'
/// `Captures` and returns the replaced string.
///
/// If no match is found, then a copy of the string is returned unchanged.
///
/// # Examples
///
/// Note that this function is polymorphic with respect to the replacement.
/// In typical usage, this can just be a normal string:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!("[^01]+");
/// assert_eq!(re.replace("1078910", "").as_slice(), "1010");
/// # }
/// ```
///
/// But anything satisfying the `Replacer` trait will work. For example,
/// a closure of type `|&Captures| -> ~str` provides direct access to the
/// captures corresponding to a match. This allows one to access
/// submatches easily:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # use regex::Captures; fn main() {
/// let re = regex!(r"([^,\s]+),\s+(\S+)");
/// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
/// format!("{} {}", caps.at(2), caps.at(1))
/// });
/// assert_eq!(result.as_slice(), "Bruce Springsteen");
/// # }
/// ```
///
/// But this is a bit cumbersome to use all the time. Instead, a simple
/// syntax is supported that expands `$name` into the corresponding capture
/// group. Here's the last example, but using this expansion technique
/// with named capture groups:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)");
/// let result = re.replace("Springsteen, Bruce", "$first $last");
/// assert_eq!(result.as_slice(), "Bruce Springsteen");
/// # }
/// ```
///
/// Note that using `$2` instead of `$first` or `$1` instead of `$last`
/// would produce the same result. To write a literal `$` use `$$`.
///
/// Finally, sometimes you just want to replace a literal string with no
/// submatch expansion. This can be done by wrapping a string with
/// `NoExpand`:
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// use regex::NoExpand;
///
/// let re = regex!(r"(?P<last>[^,\s]+),\s+(\S+)");
/// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
/// assert_eq!(result.as_slice(), "$2 $last");
/// # }
/// ```
pub fn replace<R: Replacer>(&self, text: &str, rep: R) -> StrBuf {
self.replacen(text, 1, rep)
}
/// Replaces all non-overlapping matches in `text` with the
/// replacement provided. This is the same as calling `replacen` with
/// `limit` set to `0`.
///
/// See the documentation for `replace` for details on how to access
/// submatches in the replacement string.
pub fn replace_all<R: Replacer>(&self, text: &str, rep: R) -> StrBuf {
self.replacen(text, 0, rep)
}
/// Replaces at most `limit` non-overlapping matches in `text` with the
/// replacement provided. If `limit` is 0, then all non-overlapping matches
/// are replaced.
///
/// See the documentation for `replace` for details on how to access
/// submatches in the replacement string.
pub fn replacen<R: Replacer>
(&self, text: &str, limit: uint, mut rep: R) -> StrBuf {
let mut new = StrBuf::with_capacity(text.len());
let mut last_match = 0u;
let mut i = 0;
for cap in self.captures_iter(text) {
// It'd be nicer to use the 'take' iterator instead, but it seemed
// awkward given that '0' => no limit.
if limit > 0 && i >= limit {
break
}
i += 1;
let (s, e) = cap.pos(0).unwrap(); // captures only reports matches
new.push_str(text.slice(last_match, s));
new.push_str(rep.reg_replace(&cap).as_slice());
last_match = e;
}
new.append(text.slice(last_match, text.len()))
}
}
/// NoExpand indicates literal string replacement.
///
/// It can be used with `replace` and `replace_all` to do a literal
/// string replacement without expanding `$name` to their corresponding
/// capture groups.
///
/// `'r` is the lifetime of the literal text.
pub struct NoExpand<'t>(pub &'t str);
/// Replacer describes types that can be used to replace matches in a string.
pub trait Replacer {
/// Returns a possibly owned string that is used to replace the match
/// corresponding to the `caps` capture group.
///
/// The `'a` lifetime refers to the lifetime of a borrowed string when
/// a new owned string isn't needed (e.g., for `NoExpand`).
fn reg_replace<'a>(&'a mut self, caps: &Captures) -> MaybeOwned<'a>;
}
impl<'t> Replacer for NoExpand<'t> {
fn reg_replace<'a>(&'a mut self, _: &Captures) -> MaybeOwned<'a> {
let NoExpand(s) = *self;
Slice(s)
}
}
impl<'t> Replacer for &'t str {
fn reg_replace<'a>(&'a mut self, caps: &Captures) -> MaybeOwned<'a> {
Owned(caps.expand(*self).into_owned())
}
}
impl<'a> Replacer for |&Captures|: 'a -> ~str {
fn reg_replace<'r>(&'r mut self, caps: &Captures) -> MaybeOwned<'r> {
Owned((*self)(caps).into_owned())
}
}
/// Yields all substrings delimited by a regular expression match.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the string being split.
pub struct RegexSplits<'r, 't> {
finder: FindMatches<'r, 't>,
last: uint,
}
impl<'r, 't> Iterator<&'t str> for RegexSplits<'r, 't> {
fn next(&mut self) -> Option<&'t str> {
let text = self.finder.search;
match self.finder.next() {
None => {
if self.last >= text.len() {
None
} else {
let s = text.slice(self.last, text.len());
self.last = text.len();
Some(s)
}
}
Some((s, e)) => {
let matched = text.slice(self.last, s);
self.last = e;
Some(matched)
}
}
}
}
/// Yields at most `N` substrings delimited by a regular expression match.
///
/// The last substring will be whatever remains after splitting.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the string being split.
pub struct RegexSplitsN<'r, 't> {
splits: RegexSplits<'r, 't>,
cur: uint,
limit: uint,
}
impl<'r, 't> Iterator<&'t str> for RegexSplitsN<'r, 't> {
fn next(&mut self) -> Option<&'t str> {
let text = self.splits.finder.search;
if self.cur >= self.limit {
None
} else {
self.cur += 1;
if self.cur >= self.limit {
Some(text.slice(self.splits.last, text.len()))
} else {
self.splits.next()
}
}
}
}
/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex.
/// If a capture group is named, then the matched string is *also* available
/// via the `name` method. (Note that the 0th capture is always unnamed and so
/// must be accessed with the `at` method.)
///
/// Positions returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
text: &'t str,
locs: CaptureLocs,
named: Option<HashMap<~str, uint>>,
}
impl<'t> Captures<'t> {
fn new(re: &Regex, search: &'t str, locs: CaptureLocs)
-> Option<Captures<'t>> {
if !has_match(&locs) {
return None
}
let named =
if re.names.len() == 0 {
None
} else {
let mut named = HashMap::new();
for (i, name) in re.names.iter().enumerate() {
match name {
&None => {},
&Some(ref name) => {
named.insert(name.to_owned(), i);
}
}
}
Some(named)
};
Some(Captures {
text: search,
locs: locs,
named: named,
})
}
/// Returns the start and end positions of the Nth capture group.
/// Returns `None` if `i` is not a valid capture group or if the capture
/// group did not match anything.
/// The positions returned are *always* byte indices with respect to the
/// original string matched.
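///
/// # Example
///
/// An illustrative sketch (the text and spans are made up):
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"(\d{4})-(\d{2})");
/// let caps = re.captures("2014-04").unwrap();
/// // The second capture group ("04") spans bytes 5..7.
/// assert_eq!(caps.pos(2), Some((5, 7)));
/// # }
/// ```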
pub fn pos(&self, i: uint) -> Option<(uint, uint)> {
let (s, e) = (i * 2, i * 2 + 1);
if e >= self.locs.len() || self.locs.get(s).is_none() {
// VM guarantees that each pair of locations are both Some or None.
return None
}
Some((self.locs.get(s).unwrap(), self.locs.get(e).unwrap()))
}
/// Returns the matched string for the capture group `i`.
/// If `i` isn't a valid capture group or didn't match anything, then the
/// empty string is returned.
pub fn at(&self, i: uint) -> &'t str {
match self.pos(i) {
None => "",
Some((s, e)) => {
self.text.slice(s, e)
}
}
}
/// Returns the matched string for the capture group named `name`.
/// If `name` isn't a valid capture group or didn't match anything, then
/// the empty string is returned.
pub fn name(&self, name: &str) -> &'t str {
match self.named {
None => "",
Some(ref h) => {
match h.find_equiv(&name) {
None => "",
Some(i) => self.at(*i),
}
}
}
}
/// Creates an iterator of all the capture groups in order of appearance
/// in the regular expression.
pub fn iter(&'t self) -> SubCaptures<'t> {
SubCaptures { idx: 0, caps: self, }
}
/// Creates an iterator of all the capture group positions in order of
/// appearance in the regular expression. Positions are byte indices
/// in terms of the original string matched.
pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
SubCapturesPos { idx: 0, caps: self, }
}
/// Expands all instances of `$name` in `text` to the corresponding capture
/// group `name`.
///
/// `name` may be an integer corresponding to the index of the
/// capture group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist or
/// isn't a valid index), then it is replaced with the empty string.
///
/// To write a literal `$` use `$$`.
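///
/// # Example
///
/// An illustrative sketch using named groups (the text is made up):
///
/// ```rust
/// # #![feature(phase)]
/// # extern crate regex; #[phase(syntax)] extern crate regex_macros;
/// # fn main() {
/// let re = regex!(r"(?P<y>\d{4})-(?P<m>\d{2})");
/// let caps = re.captures("1941-12").unwrap();
/// assert_eq!(caps.expand("$m/$y").as_slice(), "12/1941");
/// # }
/// ```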
pub fn expand(&self, text: &str) -> StrBuf {
// How evil can you get?
// FIXME: Don't use regexes for this. It's completely unnecessary.
let re = Regex::new(r"(^|[^$]|\b)\$(\w+)").unwrap();
let text = re.replace_all(text, |refs: &Captures| -> ~str {
let (pre, name) = (refs.at(1), refs.at(2));
pre + match from_str::<uint>(name) {
None => self.name(name).to_owned(),
Some(i) => self.at(i).to_owned(),
}
});
let re = Regex::new(r"\$\$").unwrap();
re.replace_all(text.as_slice(), NoExpand("$"))
}
}
impl<'t> Container for Captures<'t> {
/// Returns the number of captured groups.
#[inline]
fn len(&self) -> uint {
self.locs.len() / 2
}
}
/// An iterator over capture groups for a particular match of a regular
/// expression.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCaptures<'t> {
idx: uint,
caps: &'t Captures<'t>,
}
impl<'t> Iterator<&'t str> for SubCaptures<'t> {
fn next(&mut self) -> Option<&'t str> {
if self.idx < self.caps.len() {
self.idx += 1;
Some(self.caps.at(self.idx - 1))
} else {
None
}
}
}
/// An iterator over capture group positions for a particular match of a
/// regular expression.
///
/// Positions are byte indices in terms of the original string matched.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCapturesPos<'t> {
idx: uint,
caps: &'t Captures<'t>,
}
impl<'t> Iterator<Option<(uint, uint)>> for SubCapturesPos<'t> {
fn next(&mut self) -> Option<Option<(uint, uint)>> {
if self.idx < self.caps.len() {
self.idx += 1;
Some(self.caps.pos(self.idx - 1))
} else {
None
}
}
}
/// An iterator that yields the capture groups for each successive,
/// non-overlapping match of a particular regular expression. The iterator
/// stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the matched string.
pub struct FindCaptures<'r, 't> {
re: &'r Regex,
search: &'t str,
last_match: Option<uint>,
last_end: uint,
}
impl<'r, 't> Iterator<Captures<'t>> for FindCaptures<'r, 't> {
fn next(&mut self) -> Option<Captures<'t>> {
if self.last_end > self.search.len() {
return None
}
let caps = exec_slice(self.re, Submatches, self.search,
self.last_end, self.search.len());
let (s, e) =
if !has_match(&caps) {
return None
} else {
(caps.get(0).unwrap(), caps.get(1).unwrap())
};
// Don't accept empty matches immediately following a match.
// i.e., no infinite loops please.
if e - s == 0 && Some(self.last_end) == self.last_match {
self.last_end += 1;
return self.next()
}
self.last_end = e;
self.last_match = Some(self.last_end);
Captures::new(self.re, self.search, caps)
}
}
/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a tuple of integers corresponding to the start and end
/// of the match. The indices are byte offsets. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
/// of the matched string.
pub struct FindMatches<'r, 't> {
re: &'r Regex,
search: &'t str,
last_match: Option<uint>,
last_end: uint,
}
impl<'r, 't> Iterator<(uint, uint)> for FindMatches<'r, 't> {
fn next(&mut self) -> Option<(uint, uint)> {
if self.last_end > self.search.len() {
return None
}
let caps = exec_slice(self.re, Location, self.search,
self.last_end, self.search.len());
let (s, e) =
if !has_match(&caps) {
return None
} else {
(caps.get(0).unwrap(), caps.get(1).unwrap())
};
// Don't accept empty matches immediately following a match.
// i.e., no infinite loops please.
if e - s == 0 && Some(self.last_end) == self.last_match {
self.last_end += 1;
return self.next()
}
self.last_end = e;
self.last_match = Some(self.last_end);
Some((s, e))
}
}
fn exec(re: &Regex, which: MatchKind, input: &str) -> CaptureLocs {
exec_slice(re, which, input, 0, input.len())
}
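// Dispatches the search over input[s..e]: a dynamically compiled regex runs
// on the matching VM (`vm::run`), while a regex produced by the `regex!`
// syntax extension calls its natively compiled matching function directly.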
fn exec_slice(re: &Regex, which: MatchKind,
input: &str, s: uint, e: uint) -> CaptureLocs {
match re.p {
Dynamic(ref prog) => vm::run(which, prog, input, s, e),
Native(exec) => exec(which, input, s, e),
}
}
#[inline]
fn has_match(caps: &CaptureLocs) -> bool {
caps.len() >= 2 && caps.get(0).is_some() && caps.get(1).is_some()
}
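A minimal usage sketch of the capture API above (illustrative only, not part of this commit; it relies on `Regex::new` and the `captures` method exercised by the tests later in this patch):

    let re = Regex::new(r"(?P<first>\w+)\s+(?P<last>\w+)").unwrap();
    let caps = re.captures("Andrew Gallant").unwrap();
    assert_eq!(caps.at(0), "Andrew Gallant");   // group 0 is the whole match
    assert_eq!(caps.name("last"), "Gallant");   // access by group name
    assert_eq!(caps.pos(1), Some((0u, 6u)));    // byte offsets of group 1
    assert_eq!(caps.expand("$last, $first"), StrBuf::from_str("Gallant, Andrew"));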

179
src/libregex/test/bench.rs Normal file

@ -0,0 +1,179 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use rand::{Rng, task_rng};
use stdtest::Bencher;
use std::str;
use regex::{Regex, NoExpand};
fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) {
b.iter(|| if !re.is_match(text) { fail!("no match") });
}
#[bench]
fn no_exponential(b: &mut Bencher) {
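// Pathological case: "a?" repeated n times followed by "a" repeated n times.
// A backtracking engine takes exponential time on this input; this
// implementation is expected to stay roughly linear.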
let n = 100;
let re = Regex::new("a?".repeat(n) + "a".repeat(n)).unwrap();
let text = "a".repeat(n);
bench_assert_match(b, re, text);
}
#[bench]
fn literal(b: &mut Bencher) {
let re = regex!("y");
let text = "x".repeat(50) + "y";
bench_assert_match(b, re, text);
}
#[bench]
fn not_literal(b: &mut Bencher) {
let re = regex!(".y");
let text = "x".repeat(50) + "y";
bench_assert_match(b, re, text);
}
#[bench]
fn match_class(b: &mut Bencher) {
let re = regex!("[abcdw]");
let text = "xxxx".repeat(20) + "w";
bench_assert_match(b, re, text);
}
#[bench]
fn match_class_in_range(b: &mut Bencher) {
// 'b' is between 'a' and 'c', so the class range checking doesn't help.
let re = regex!("[ac]");
let text = "bbbb".repeat(20) + "c";
bench_assert_match(b, re, text);
}
#[bench]
fn replace_all(b: &mut Bencher) {
let re = regex!("[cjrw]");
let text = "abcdefghijklmnopqrstuvwxyz";
// FIXME: This isn't using the $name expand stuff.
// It's possible RE2/Go is using it, but currently, the expand in this
// crate is actually compiling a regex, so it's incredibly slow.
b.iter(|| re.replace_all(text, NoExpand("")));
}
#[bench]
fn anchored_literal_short_non_match(b: &mut Bencher) {
let re = regex!("^zbc(d|e)");
let text = "abcdefghijklmnopqrstuvwxyz";
b.iter(|| re.is_match(text));
}
#[bench]
fn anchored_literal_long_non_match(b: &mut Bencher) {
let re = regex!("^zbc(d|e)");
let text = "abcdefghijklmnopqrstuvwxyz".repeat(15);
b.iter(|| re.is_match(text));
}
#[bench]
fn anchored_literal_short_match(b: &mut Bencher) {
let re = regex!("^.bc(d|e)");
let text = "abcdefghijklmnopqrstuvwxyz";
b.iter(|| re.is_match(text));
}
#[bench]
fn anchored_literal_long_match(b: &mut Bencher) {
let re = regex!("^.bc(d|e)");
let text = "abcdefghijklmnopqrstuvwxyz".repeat(15);
b.iter(|| re.is_match(text));
}
#[bench]
fn one_pass_short_a(b: &mut Bencher) {
let re = regex!("^.bc(d|e)*$");
let text = "abcddddddeeeededd";
b.iter(|| re.is_match(text));
}
#[bench]
fn one_pass_short_a_not(b: &mut Bencher) {
let re = regex!(".bc(d|e)*$");
let text = "abcddddddeeeededd";
b.iter(|| re.is_match(text));
}
#[bench]
fn one_pass_short_b(b: &mut Bencher) {
let re = regex!("^.bc(?:d|e)*$");
let text = "abcddddddeeeededd";
b.iter(|| re.is_match(text));
}
#[bench]
fn one_pass_short_b_not(b: &mut Bencher) {
let re = regex!(".bc(?:d|e)*$");
let text = "abcddddddeeeededd";
b.iter(|| re.is_match(text));
}
#[bench]
fn one_pass_long_prefix(b: &mut Bencher) {
let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$");
let text = "abcdefghijklmnopqrstuvwxyz";
b.iter(|| re.is_match(text));
}
#[bench]
fn one_pass_long_prefix_not(b: &mut Bencher) {
let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$");
let text = "abcdefghijklmnopqrstuvwxyz";
b.iter(|| re.is_match(text));
}
macro_rules! throughput(
($name:ident, $regex:expr, $size:expr) => (
#[bench]
fn $name(b: &mut Bencher) {
let text = gen_text($size);
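// Recording bytes per iteration lets the bench harness report
// throughput (MB/s) rather than just time per iteration.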
b.bytes = $size;
b.iter(|| if $regex.is_match(text) { fail!("match") });
}
);
)
fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") }
fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
fn gen_text(n: uint) -> ~str {
let mut rng = task_rng();
let mut bytes = rng.gen_ascii_str(n).into_bytes();
for (i, b) in bytes.mut_iter().enumerate() {
if i % 20 == 0 {
*b = '\n' as u8
}
}
str::from_utf8(bytes).unwrap().to_owned()
}
throughput!(easy0_32, easy0(), 32)
throughput!(easy0_1K, easy0(), 1<<10)
throughput!(easy0_32K, easy0(), 32<<10)
throughput!(easy1_32, easy1(), 32)
throughput!(easy1_1K, easy1(), 1<<10)
throughput!(easy1_32K, easy1(), 32<<10)
throughput!(medium_32, medium(), 32)
throughput!(medium_1K, medium(), 1<<10)
throughput!(medium_32K,medium(), 32<<10)
throughput!(hard_32, hard(), 32)
throughput!(hard_1K, hard(), 1<<10)
throughput!(hard_32K,hard(), 32<<10)

373
src/libregex/test/matches.rs Normal file

@ -0,0 +1,373 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-tidy-linelength
// DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests'
// on 2014-04-23 01:33:36.539280.
// Tests from basic.dat
mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))
mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7)))
mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8)))
mat!(match_basic_6, r"\)", r"()", Some((1, 2)))
mat!(match_basic_7, r"a]", r"a]a", Some((0, 2)))
mat!(match_basic_9, r"\}", r"}", Some((0, 1)))
mat!(match_basic_10, r"\]", r"]", Some((0, 1)))
mat!(match_basic_12, r"]", r"]", Some((0, 1)))
mat!(match_basic_15, r"^a", r"ax", Some((0, 1)))
mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3)))
mat!(match_basic_17, r"a\^", r"a^", Some((0, 2)))
mat!(match_basic_18, r"a$", r"aa", Some((1, 2)))
mat!(match_basic_19, r"a\$", r"a$", Some((0, 2)))
mat!(match_basic_20, r"^$", r"", Some((0, 0)))
mat!(match_basic_21, r"$^", r"", Some((0, 0)))
mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2)))
mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1)))
mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0)))
mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4)))
mat!(match_basic_26, r"(ab|a)(bc|c)", r"abc", Some((0, 3)), Some((0, 2)), Some((2, 3)))
mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2)))
mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2)))
mat!(match_basic_29, r"(a*)(b?)(b+)b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7)))
mat!(match_basic_30, r"(a*)(b{0,1})(b{1,})b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7)))
mat!(match_basic_32, r"((a|a)|a)", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)))
mat!(match_basic_33, r"(a*)(a|aa)", r"aaaa", Some((0, 4)), Some((0, 3)), Some((3, 4)))
mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4)))
mat!(match_basic_35, r"a(b)|c(d)|a(e)f", r"aef", Some((0, 3)), None, None, Some((1, 2)))
mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1)))
mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1)))
mat!(match_basic_38, r"(a|b)c|a(b|c)", r"ab", Some((0, 2)), None, Some((1, 2)))
mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2)))
mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2)))
mat!(match_basic_41, r"(.a|.b).*|.*(.a|.b)", r"xa", Some((0, 2)), Some((0, 2)))
mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2)))
mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2)))
mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2)))
mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8)))
mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9)))
mat!(match_basic_47, r"(aa|aaa)*|(a|aaaaa)", r"aa", Some((0, 2)), Some((0, 2)))
mat!(match_basic_48, r"(a.|.a.)*|(a|.a...)", r"aa", Some((0, 2)), Some((0, 2)))
mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3)))
mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4)))
mat!(match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4)))
mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3)))
mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3)))
mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4)))
mat!(match_basic_55, r":::1:::0:|:::1:1:0:", r":::0:::1:::1:::0:", Some((8, 17)))
mat!(match_basic_56, r":::1:::0:|:::1:1:1:", r":::0:::1:::1:::0:", Some((8, 17)))
mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1)))
mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3)))
mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3)))
mat!(match_basic_65, r"
", r"
", Some((0, 1)))
mat!(match_basic_66, r"
", r"
", Some((0, 1)))
mat!(match_basic_67, r"[^a]", r"
", Some((0, 1)))
mat!(match_basic_68, r"
a", r"
a", Some((0, 2)))
mat!(match_basic_69, r"(a)(b)(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((2, 3)))
mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3)))
mat!(match_basic_71, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 6,", Some((0, 6)))
mat!(match_basic_72, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"2/7", Some((0, 3)))
mat!(match_basic_73, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 1,Feb 6", Some((5, 11)))
mat!(match_basic_74, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", r"x", Some((0, 1)), Some((0, 1)), Some((0, 1)))
mat!(match_basic_75, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", r"xx", Some((0, 2)), Some((1, 2)), Some((1, 2)))
mat!(match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81)))
mat!(match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabbbbaa", Some((18, 25)))
mat!(match_basic_78, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabaa", Some((18, 22)))
mat!(match_basic_79, r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", r"baaabbbabac", Some((7, 11)))
mat!(match_basic_80, r".*", r"", Some((0, 2)))
mat!(match_basic_81, r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", Some((53, 57)))
mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10)))
mat!(match_basic_84, r"^", r"", Some((0, 0)))
mat!(match_basic_85, r"$", r"", Some((0, 0)))
mat!(match_basic_86, r"^$", r"", Some((0, 0)))
mat!(match_basic_87, r"^a$", r"a", Some((0, 1)))
mat!(match_basic_88, r"abc", r"abc", Some((0, 3)))
mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4)))
mat!(match_basic_90, r"abc", r"ababc", Some((2, 5)))
mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3)))
mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3)))
mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4)))
mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6)))
mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4)))
mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6)))
mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4)))
mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3)))
mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3)))
mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3)))
mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3)))
mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4)))
mat!(match_basic_103, r"^", r"abc", Some((0, 0)))
mat!(match_basic_104, r"$", r"abc", Some((3, 3)))
mat!(match_basic_105, r"a.c", r"abc", Some((0, 3)))
mat!(match_basic_106, r"a.c", r"axc", Some((0, 3)))
mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5)))
mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3)))
mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3)))
mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 3)))
mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2)))
mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2)))
mat!(match_basic_113, r"a]", r"a]", Some((0, 2)))
mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3)))
mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3)))
mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3)))
mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3)))
mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2)))
mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2)))
mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3)))
mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2)))
mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4)))
mat!(match_basic_123, r"((a))", r"abc", Some((0, 1)), Some((0, 1)), Some((0, 1)))
mat!(match_basic_124, r"(a)b(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((2, 3)))
mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7)))
mat!(match_basic_126, r"a*", r"aaa", Some((0, 3)))
mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None)
mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0)))
mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None)
mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2)))
mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2)))
mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1)))
mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3)))
mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None)
mat!(match_basic_138, r"a*", r"", Some((0, 0)))
mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5)))
mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1)))
mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1)))
mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1)))
mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None)
mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7)))
mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3)))
mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2)))
mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4)))
mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3)))
mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2)))
mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1)))
mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3)))
mat!(match_basic_153, r"a([bc]*)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4)))
mat!(match_basic_154, r"a([bc]+)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4)))
mat!(match_basic_155, r"a([bc]*)(c+d)", r"abcd", Some((0, 4)), Some((1, 2)), Some((2, 4)))
mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7)))
mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2)))
mat!(match_basic_158, r"((a)(b)c)(d)", r"abcd", Some((0, 4)), Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((3, 4)))
mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5)))
mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3)))
mat!(match_basic_161, r"(bc+d$|ef*g.|h?i(j|k))", r"effgz", Some((0, 5)), Some((0, 5)))
mat!(match_basic_162, r"(bc+d$|ef*g.|h?i(j|k))", r"ij", Some((0, 2)), Some((0, 2)), Some((1, 2)))
mat!(match_basic_163, r"(bc+d$|ef*g.|h?i(j|k))", r"reffgz", Some((1, 6)), Some((1, 6)))
mat!(match_basic_164, r"(((((((((a)))))))))", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)))
mat!(match_basic_165, r"multiple words", r"multiple words yeah", Some((0, 14)))
mat!(match_basic_166, r"(.*)c(.*)", r"abcde", Some((0, 5)), Some((0, 2)), Some((3, 5)))
mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4)))
mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3)))
mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3)))
mat!(match_basic_170, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qaddafi", Some((0, 15)), None, Some((10, 12)))
mat!(match_basic_171, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mo'ammar Gadhafi", Some((0, 16)), None, Some((11, 13)))
mat!(match_basic_172, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Kaddafi", Some((0, 15)), None, Some((10, 12)))
mat!(match_basic_173, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qadhafi", Some((0, 15)), None, Some((10, 12)))
mat!(match_basic_174, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gadafi", Some((0, 14)), None, Some((10, 11)))
mat!(match_basic_175, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadafi", Some((0, 15)), None, Some((11, 12)))
mat!(match_basic_176, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moamar Gaddafi", Some((0, 14)), None, Some((9, 11)))
mat!(match_basic_177, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadhdhafi", Some((0, 18)), None, Some((13, 15)))
mat!(match_basic_178, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Khaddafi", Some((0, 16)), None, Some((11, 13)))
mat!(match_basic_179, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafy", Some((0, 16)), None, Some((11, 13)))
mat!(match_basic_180, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghadafi", Some((0, 15)), None, Some((11, 12)))
mat!(match_basic_181, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafi", Some((0, 16)), None, Some((11, 13)))
mat!(match_basic_182, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muamar Kaddafi", Some((0, 14)), None, Some((9, 11)))
mat!(match_basic_183, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Quathafi", Some((0, 16)), None, Some((11, 13)))
mat!(match_basic_184, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gheddafi", Some((0, 16)), None, Some((11, 13)))
mat!(match_basic_185, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Khadafy", Some((0, 15)), None, Some((11, 12)))
mat!(match_basic_186, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Qudhafi", Some((0, 15)), None, Some((10, 12)))
mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4)))
mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4)))
mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4)))
mat!(match_basic_190, r"^([^!.]+).att.com!(.+)$", r"gryphon.att.com!eby", Some((0, 19)), Some((0, 7)), Some((16, 19)))
mat!(match_basic_191, r"^([^!]+!)?([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3)))
mat!(match_basic_192, r"^([^!]+!)?([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
mat!(match_basic_193, r"^([^!]+!)?([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
mat!(match_basic_194, r"^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), Some((4, 8)), Some((8, 11)))
mat!(match_basic_195, r"((foo)|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), None, Some((0, 3)))
mat!(match_basic_196, r"((foo)|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), None, Some((4, 7)))
mat!(match_basic_197, r"((foo)|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3)))
mat!(match_basic_198, r"((foo)|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3)))
mat!(match_basic_199, r"((foo)|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)))
mat!(match_basic_200, r"((foo)|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3)))
mat!(match_basic_201, r"(foo|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), Some((0, 3)))
mat!(match_basic_202, r"(foo|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), Some((4, 7)))
mat!(match_basic_203, r"(foo|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)))
mat!(match_basic_204, r"(foo|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3)))
mat!(match_basic_205, r"(foo|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)))
mat!(match_basic_206, r"(foo|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)))
mat!(match_basic_207, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11)))
mat!(match_basic_208, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3)))
mat!(match_basic_209, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
mat!(match_basic_210, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), None, None, Some((4, 8)), Some((8, 11)))
mat!(match_basic_211, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7)))
mat!(match_basic_212, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bas", Some((0, 3)), Some((0, 3)), None, Some((0, 3)))
mat!(match_basic_213, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bar!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7)))
mat!(match_basic_214, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11)))
mat!(match_basic_215, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7)))
mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4)))
mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4)))
mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4)))
mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4)))
mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4)))
mat!(match_basic_221, r"\\000", r"\000", Some((0, 4)))
// Tests from nullsubexpr.dat
mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None)
mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0)))
mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0)))
mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_17, r"(a+)+", r"x", None)
mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None)
mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0)))
mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None)
mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_34, r"([^b]*)*", r"aaaaaab", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_41, r"([ab]*)*", r"aaaabcde", Some((0, 5)), Some((0, 5)))
mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1)))
mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None)
mat!(match_nullsubexpr_46, r"([^ab]*)*", r"ccccxx", Some((0, 6)), Some((0, 6)))
mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None)
mat!(match_nullsubexpr_50, r"((z)+|a)*", r"zabcde", Some((0, 2)), Some((1, 2)))
mat!(match_nullsubexpr_69, r"(a*)*(x)", r"x", Some((0, 1)), None, Some((0, 1)))
mat!(match_nullsubexpr_70, r"(a*)*(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2)))
mat!(match_nullsubexpr_71, r"(a*)*(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2)))
mat!(match_nullsubexpr_73, r"(a*)+(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1)))
mat!(match_nullsubexpr_74, r"(a*)+(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2)))
mat!(match_nullsubexpr_75, r"(a*)+(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2)))
mat!(match_nullsubexpr_77, r"(a*){2}(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1)))
mat!(match_nullsubexpr_78, r"(a*){2}(x)", r"ax", Some((0, 2)), Some((1, 1)), Some((1, 2)))
mat!(match_nullsubexpr_79, r"(a*){2}(x)", r"axa", Some((0, 2)), Some((1, 1)), Some((1, 2)))
// Tests from repetition.dat
mat!(match_repetition_10, r"((..)|(.))", r"", None)
mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None)
mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None)
mat!(match_repetition_14, r"((..)|(.)){1}", r"", None)
mat!(match_repetition_15, r"((..)|(.)){2}", r"", None)
mat!(match_repetition_16, r"((..)|(.)){3}", r"", None)
mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0)))
mat!(match_repetition_20, r"((..)|(.))", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1)))
mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None)
mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None)
mat!(match_repetition_24, r"((..)|(.)){1}", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1)))
mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None)
mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None)
mat!(match_repetition_28, r"((..)|(.))*", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1)))
mat!(match_repetition_30, r"((..)|(.))", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_31, r"((..)|(.))((..)|(.))", r"aa", Some((0, 2)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)))
mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None)
mat!(match_repetition_34, r"((..)|(.)){1}", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_35, r"((..)|(.)){2}", r"aa", Some((0, 2)), Some((1, 2)), None, Some((1, 2)))
mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None)
mat!(match_repetition_38, r"((..)|(.))*", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_40, r"((..)|(.))", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_41, r"((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)))
mat!(match_repetition_42, r"((..)|(.))((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)), Some((2, 3)), None, Some((2, 3)))
mat!(match_repetition_44, r"((..)|(.)){1}", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_46, r"((..)|(.)){2}", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3)))
mat!(match_repetition_47, r"((..)|(.)){3}", r"aaa", Some((0, 3)), Some((2, 3)), None, Some((2, 3)))
mat!(match_repetition_50, r"((..)|(.))*", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3)))
mat!(match_repetition_52, r"((..)|(.))", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_53, r"((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_54, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)), Some((3, 4)), None, Some((3, 4)))
mat!(match_repetition_56, r"((..)|(.)){1}", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_57, r"((..)|(.)){2}", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_59, r"((..)|(.)){3}", r"aaaa", Some((0, 4)), Some((3, 4)), Some((0, 2)), Some((3, 4)))
mat!(match_repetition_61, r"((..)|(.))*", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_63, r"((..)|(.))", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_64, r"((..)|(.))((..)|(.))", r"aaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_65, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaa", Some((0, 5)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 5)), None, Some((4, 5)))
mat!(match_repetition_67, r"((..)|(.)){1}", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_68, r"((..)|(.)){2}", r"aaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_70, r"((..)|(.)){3}", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5)))
mat!(match_repetition_73, r"((..)|(.))*", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5)))
mat!(match_repetition_75, r"((..)|(.))", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_76, r"((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_77, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 6)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 6)), Some((4, 6)), None)
mat!(match_repetition_79, r"((..)|(.)){1}", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None)
mat!(match_repetition_80, r"((..)|(.)){2}", r"aaaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None)
mat!(match_repetition_81, r"((..)|(.)){3}", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None)
mat!(match_repetition_83, r"((..)|(.))*", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None)
mat!(match_repetition_90, r"X(.?){0,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_91, r"X(.?){1,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_92, r"X(.?){2,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_93, r"X(.?){3,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_94, r"X(.?){4,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_95, r"X(.?){5,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_96, r"X(.?){6,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_97, r"X(.?){7,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8)))
mat!(match_repetition_98, r"X(.?){8,}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_100, r"X(.?){0,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_102, r"X(.?){1,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_104, r"X(.?){2,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_106, r"X(.?){3,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_108, r"X(.?){4,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_110, r"X(.?){5,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_112, r"X(.?){6,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_114, r"X(.?){7,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_115, r"X(.?){8,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8)))
mat!(match_repetition_126, r"(a|ab|c|bcd){0,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
mat!(match_repetition_127, r"(a|ab|c|bcd){1,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
mat!(match_repetition_128, r"(a|ab|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
mat!(match_repetition_129, r"(a|ab|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None)
mat!(match_repetition_131, r"(a|ab|c|bcd){0,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
mat!(match_repetition_132, r"(a|ab|c|bcd){1,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
mat!(match_repetition_133, r"(a|ab|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
mat!(match_repetition_134, r"(a|ab|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6)))
mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None)
mat!(match_repetition_136, r"(a|ab|c|bcd)*(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
mat!(match_repetition_137, r"(a|ab|c|bcd)+(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1)))
mat!(match_repetition_143, r"(ab|a|c|bcd){0,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_145, r"(ab|a|c|bcd){1,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_149, r"(ab|a|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None)
mat!(match_repetition_152, r"(ab|a|c|bcd){0,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_154, r"(ab|a|c|bcd){1,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_156, r"(ab|a|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_158, r"(ab|a|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None)
mat!(match_repetition_161, r"(ab|a|c|bcd)*(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))
mat!(match_repetition_163, r"(ab|a|c|bcd)+(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6)))

29
src/libregex/test/mod.rs Normal file

@ -0,0 +1,29 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#[cfg(not(stage1))]
#[phase(syntax)]
extern crate regex_macros;
// Dirty hack: during stage1, test dynamic regexes; during stage2, test
// native regexes.
#[cfg(stage1)]
macro_rules! regex(
($re:expr) => (
match ::regex::Regex::new($re) {
Ok(re) => re,
Err(err) => fail!("{}", err),
}
);
)
mod bench;
mod tests;
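With this arrangement, a test written as `let re = regex!(r"\d+");` is compiled at build time by the `regex_macros` syntax extension in stage2, while in stage1 the fallback macro above expands it to roughly the following (a sketch of the expansion, not code in this file):

    let re = match ::regex::Regex::new(r"\d+") {
        Ok(re) => re,
        Err(err) => fail!("{}", err),
    };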

199
src/libregex/test/tests.rs Normal file

@ -0,0 +1,199 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-tidy-linelength
use regex::{Regex, NoExpand};
#[test]
fn splitn() {
let re = regex!(r"\d+");
let text = "cauchy123plato456tyler789binx";
let subs: Vec<&str> = re.splitn(text, 2).collect();
assert_eq!(subs, vec!("cauchy", "plato456tyler789binx"));
}
#[test]
fn split() {
let re = regex!(r"\d+");
let text = "cauchy123plato456tyler789binx";
let subs: Vec<&str> = re.split(text).collect();
assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx"));
}
macro_rules! replace(
($name:ident, $which:ident, $re:expr,
$search:expr, $replace:expr, $result:expr) => (
#[test]
fn $name() {
let re = regex!($re);
assert_eq!(re.$which($search, $replace), StrBuf::from_str($result));
}
);
)
replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6")
replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z")
replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ")
replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1")
replace!(rep_double_dollar, replace,
r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1")
replace!(rep_no_expand, replace,
r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1")
replace!(rep_named, replace_all,
r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
"w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3")
replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
"", "trim me")
macro_rules! noparse(
($name:ident, $re:expr) => (
#[test]
fn $name() {
let re = $re;
match Regex::new(re) {
Err(_) => {},
Ok(_) => fail!("Regex '{}' should cause a parse error.", re),
}
}
);
)
noparse!(fail_double_repeat, "a**")
noparse!(fail_no_repeat_arg, "*")
noparse!(fail_no_repeat_arg_begin, "^*")
noparse!(fail_incomplete_escape, "\\")
noparse!(fail_class_incomplete, "[A-")
noparse!(fail_class_not_closed, "[A")
noparse!(fail_class_no_begin, r"[\A]")
noparse!(fail_class_no_end, r"[\z]")
noparse!(fail_class_no_boundary, r"[\b]")
noparse!(fail_open_paren, "(")
noparse!(fail_close_paren, ")")
noparse!(fail_invalid_range, "[a-Z]")
noparse!(fail_empty_capture_name, "(?P<>a)")
noparse!(fail_empty_capture_exp, "(?P<name>)")
noparse!(fail_bad_capture_name, "(?P<na-me>)")
noparse!(fail_bad_flag, "(?a)a")
noparse!(fail_empty_alt_before, "|a")
noparse!(fail_empty_alt_after, "a|")
noparse!(fail_counted_big_exact, "a{1001}")
noparse!(fail_counted_big_min, "a{1001,}")
noparse!(fail_counted_no_close, "a{1001")
noparse!(fail_unfinished_cap, "(?")
noparse!(fail_unfinished_escape, "\\")
noparse!(fail_octal_digit, r"\8")
noparse!(fail_hex_digit, r"\xG0")
noparse!(fail_hex_short, r"\xF")
noparse!(fail_hex_long_digits, r"\x{fffg}")
noparse!(fail_flag_bad, "(?a)")
noparse!(fail_flag_empty, "(?)")
noparse!(fail_double_neg, "(?-i-i)")
noparse!(fail_neg_empty, "(?i-)")
noparse!(fail_empty_group, "()")
noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)")
macro_rules! mat(
($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
#[test]
fn $name() {
let text = $text;
let expected: Vec<Option<(uint, uint)>> = vec!($($loc)+);
let r = regex!($re);
let got = match r.captures(text) {
Some(c) => c.iter_pos().collect::<Vec<Option<(uint, uint)>>>(),
None => vec!(None),
};
// The test set sometimes leaves out capture groups, so truncate the
// actual capture groups to match the test set.
let (sexpect, mut sgot) = (expected.as_slice(), got.as_slice());
if sgot.len() > sexpect.len() {
sgot = sgot.slice(0, sexpect.len())
}
if sexpect != sgot {
fail!("For RE '{}' against '{}', expected '{}' but got '{}'",
$re, text, sexpect, sgot);
}
}
);
)
// Some crazy expressions from regular-expressions.info.
mat!(match_ranges,
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
"num: 255", Some((5, 8)))
mat!(match_ranges_not,
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
"num: 256", None)
mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3)))
mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3)))
mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4)))
mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None)
mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
"mine is jam.slam@gmail.com ", Some((8, 26)))
mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
"mine is jam.slam@gmail ", None)
mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
"mine is jam.slam@gmail.com ", Some((8, 26)))
mat!(match_date1,
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
"1900-01-01", Some((0, 10)))
mat!(match_date2,
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
"1900-00-01", None)
mat!(match_date3,
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
"1900-13-01", None)
// Exercise the flags.
mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3)))
mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3)))
mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None)
mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2)))
mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4)))
mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None)
mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2)))
mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11)))
mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1)))
mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2)))
mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2)))
// Some Unicode tests.
mat!(uni_literal, r"", "", Some((0, 3)))
mat!(uni_one, r"\pN", "", Some((0, 3)))
mat!(uni_mixed, r"\pN+", "1Ⅱ2", Some((0, 8)))
mat!(uni_not, r"\PN+", "ab", Some((0, 2)))
mat!(uni_not_class, r"[\PN]+", "ab", Some((0, 2)))
mat!(uni_not_class_neg, r"[^\PN]+", "ab", Some((2, 5)))
mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)))
mat!(uni_case_not, r"Δ", "δ", None)
mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)))
mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)))
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)))
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)))
// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)))
mat!(uni_perl_w_not, r"\w+", "", None)
mat!(uni_perl_w_neg, r"\W+", "", Some((0, 3)))
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)))
mat!(uni_perl_d_not, r"\d+", "", None)
mat!(uni_perl_d_neg, r"\D+", "", Some((0, 3)))
mat!(uni_perl_s, r"\s+", "", Some((0, 3)))
mat!(uni_perl_s_not, r"\s+", "", None)
mat!(uni_perl_s_neg, r"\S+", "", Some((0, 3)))
// And do the same for word boundaries.
mat!(uni_boundary_none, r"\d\b", "", None)
mat!(uni_boundary_ogham, r"\d\b", "6", Some((0, 1)))
// A whole mess of tests from Glenn Fowler's regex test suite.
// Generated by the 'src/etc/regex-match-tests' program.
mod matches;

19
src/libregex/testdata/LICENSE vendored Normal file

@ -0,0 +1,19 @@
The following license covers testregex.c and all associated test data.
Permission is hereby granted, free of charge, to any person obtaining a
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do
so, subject to the following disclaimer:
THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

17
src/libregex/testdata/README vendored Normal file

@ -0,0 +1,17 @@
Test data was taken from the Go distribution, which was in turn taken from the
testregex test suite:
http://www2.research.att.com/~astopen/testregex/testregex.html
The LICENSE in this directory corresponds to the LICENSE that the data was
released under.
The tests themselves were modified for RE2/Go. A couple were modified further
by me, Andrew Gallant (only in repetition.dat), so that RE2/Go would pass them.
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
have been a bad idea, but I think being consistent with an established regex
library is worth something.
Note that these files are read by 'src/etc/regex-match-tests' and turned into
Rust tests found in 'src/libregex/test/matches.rs'.
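For example, the first entry of basic.dat below,

    BE abracadabra$ abracadabracadabra (7,18)

is the line that produces the generated test shown earlier in matches.rs:

    mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))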

221
src/libregex/testdata/basic.dat vendored Normal file

@ -0,0 +1,221 @@
NOTE all standard compliant implementations should pass these : 2002-05-31
BE abracadabra$ abracadabracadabra (7,18)
BE a...b abababbb (2,7)
BE XXXXXX ..XXXXXX (2,8)
E \) () (1,2)
BE a] a]a (0,2)
B } } (0,1)
E \} } (0,1)
BE \] ] (0,1)
B ] ] (0,1)
E ] ] (0,1)
B { { (0,1)
B } } (0,1)
BE ^a ax (0,1)
BE \^a a^a (1,3)
BE a\^ a^ (0,2)
BE a$ aa (1,2)
BE a\$ a$ (0,2)
BE ^$ NULL (0,0)
E $^ NULL (0,0)
E a($) aa (1,2)(2,2)
E a*(^a) aa (0,1)(0,1)
E (..)*(...)* a (0,0)
E (..)*(...)* abcd (0,4)(2,4)
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
E (ab)c|abc abc (0,3)(0,2)
E a{0}b ab (1,2)
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E a{9876543210} NULL BADBR
E ((a|a)|a) a (0,1)(0,1)(0,1)
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
E a*(a.|aa) aaaa (0,4)(2,4)
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
E (a|b)?.* b (0,1)(0,1)
E (a|b)c|a(b|c) ac (0,2)(0,1)
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
E (a|b)*c|(a|ab)*c xc (1,2)
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
E a?(ab|ba)ab abab (0,4)(0,2)
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
E ab|abab abbabab (0,2)
E aba|bab|bba baaabbbaba (5,8)
E aba|bab baaabbbaba (6,9)
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
E ab|a xabc (1,3)
E ab|a xxabc (2,4)
Ei (Ab|cD)* aBcD (0,4)(2,4)
BE [^-] --a (2,3)
BE [a-]* --a (0,3)
BE [a-m-]* --amoma-- (0,4)
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
{E [[:upper:]] A (0,1) [[<element>]] not supported
E [[:lower:]]+ `az{ (1,3)
E [[:upper:]]+ @AZ[ (1,3)
# No collation in Go
#BE [[-]] [[-]] (2,4)
#BE [[.NIL.]] NULL ECOLLATE
#BE [[=aleph=]] NULL ECOLLATE
}
BE$ \n \n (0,1)
BEn$ \n \n (0,1)
BE$ [^a] \n (0,1)
BE$ \na \na (0,2)
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
BE xxx xxx (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
BE$ .* \x01\x7f (0,2)
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
E a*a*a*a*a*b aaaaaaaaab (0,10)
BE ^ NULL (0,0)
BE $ NULL (0,0)
BE ^$ NULL (0,0)
BE ^a$ a (0,1)
BE abc abc (0,3)
BE abc xabcy (1,4)
BE abc ababc (2,5)
BE ab*c abc (0,3)
BE ab*bc abc (0,3)
BE ab*bc abbc (0,4)
BE ab*bc abbbbc (0,6)
E ab+bc abbc (0,4)
E ab+bc abbbbc (0,6)
E ab?bc abbc (0,4)
E ab?bc abc (0,3)
E ab?c abc (0,3)
BE ^abc$ abc (0,3)
BE ^abc abcc (0,3)
BE abc$ aabc (1,4)
BE ^ abc (0,0)
BE $ abc (3,3)
BE a.c abc (0,3)
BE a.c axc (0,3)
BE a.*c axyzc (0,5)
BE a[bc]d abd (0,3)
BE a[b-d]e ace (0,3)
BE a[b-d] aac (1,3)
BE a[-b] a- (0,2)
BE a[b-] a- (0,2)
BE a] a] (0,2)
BE a[]]b a]b (0,3)
BE a[^bc]d aed (0,3)
BE a[^-b]c adc (0,3)
BE a[^]b]c adc (0,3)
E ab|cd abc (0,2)
E ab|cd abcd (0,2)
E a\(b a(b (0,3)
E a\(*b ab (0,2)
E a\(*b a((b (0,4)
E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
#E (a*)* - (0,0)(0,0)
E (a*)* - (0,0)(?,?) RE2/Go
E (a*)+ - (0,0)(0,0)
#E (a*|b)* - (0,0)(0,0)
E (a*|b)* - (0,0)(?,?) RE2/Go
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
#E (^)* - (0,0)(0,0)
E (^)* - (0,0)(?,?) RE2/Go
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
#E ((a*|b))* - (0,0)(0,0)(0,0)
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
E (ab|cd)e abcde (2,5)(2,4)
BE [abhgefdc]ij hij (0,3)
E (a|b)c*d abcd (1,4)(1,2)
E (ab|ab*)bc abc (0,3)(0,1)
E a([bc]*)c* abc (0,3)(1,3)
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
E a[bcd]*dcdcde adcdcde (0,7)
E (ab|a)b*c abc (0,3)(0,2)
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
E ^a(bc+|b[eh])g|.h$ abh (1,3)
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
BE multiple words multiple words yeah (0,14)
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
BE abcd abcd (0,4)
E a(bc)d abcd (0,4)(1,3)
E a[-]?c ac (0,3)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
E a+(b|c)*d+ aabcdd (0,6)(3,4)
E ^.+$ vivi (0,4)
E ^(.+)$ vivi (0,4)(0,4)
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
E (foo|(bar))!bas foo!bas (0,7)(0,3)
E (foo|bar)!bas bar!bas (0,7)(0,3)
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
E (foo|bar)!bas foo!bas (0,7)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
E .*(/XXX).* /XXX (0,4)(0,4)
E .*(\\XXX).* \XXX (0,4)(0,4)
E \\XXX \XXX (0,4)
E .*(/000).* /000 (0,4)(0,4)
E .*(\\000).* \000 (0,4)(0,4)
E \\000 \000 (0,4)

79
src/libregex/testdata/nullsubexpr.dat vendored Normal file

@ -0,0 +1,79 @@
NOTE null subexpression matches : 2002-06-06
E (a*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)* a (0,1)(0,1)
E SAME x (0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)+ a (0,1)(0,1)
E SAME x NOMATCH
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
#E SAME b (0,0)(0,0)
E SAME b (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
E SAME aaaaaa (0,6)(0,6)
E SAME ababab (0,6)(0,6)
E SAME bababa (0,6)(0,6)
E SAME b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
#E SAME aaaaaa (0,0)(0,0)
E SAME aaaaaa (0,0)(?,?) RE2/Go
E ([^ab]*)* ccccxx (0,6)(0,6)
#E SAME ababab (0,0)(0,0)
E SAME ababab (0,0)(?,?) RE2/Go
E ((z)+|a)* zabcde (0,2)(1,2)
#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
#E (a) aaa (0,1)(0,1)
#E (a*?) aaa (0,0)(0,0)
#E (a)*? aaa (0,0)
#E (a*?)*? aaa (0,0)
#}
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
#E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)
E (a*)+(x) x (0,1)(0,0)(0,1)
E (a*)+(x) ax (0,2)(0,1)(1,2)
E (a*)+(x) axa (0,2)(0,1)(1,2)
E (a*){2}(x) x (0,1)(0,0)(0,1)
E (a*){2}(x) ax (0,2)(1,1)(1,2)
E (a*){2}(x) axa (0,2)(1,1)(1,2)

163
src/libregex/testdata/repetition.dat vendored Normal file
View File

@ -0,0 +1,163 @@
NOTE implicit vs. explicit repetitions : 2009-02-02
# Glenn Fowler <gsf@research.att.com>
# conforming matches (column 4) must match one of the following BREs
# NOMATCH
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
# i.e., each 3-tuple has two identical elements and one (?,?)
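# Illustrative reading of the format (this comment is not part of the
# original AT&T data): a line such as
#   E  ((..)|(.))  a  (0,1)(0,1)(?,?)(0,1)
# asserts that the ERE matches "a" over [0,1), group 1 spans [0,1),
# group 2 does not participate (?,?) and group 3 spans [0,1).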
E ((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.)){1} NULL NOMATCH
E ((..)|(.)){2} NULL NOMATCH
E ((..)|(.)){3} NULL NOMATCH
E ((..)|(.))* NULL (0,0)
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)){2} a NOMATCH
E ((..)|(.)){3} a NOMATCH
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
E ((..)|(.)){3} aa NOMATCH
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
# Linux/GLIBC gets the {8,} and {8,8} wrong.
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
# These test a fixed bug in my regex-tdfa that did not keep the expanded
# form properly grouped, so right association did the wrong thing with
# these ambiguous patterns (crafted just to test my code when I became
# suspicious of my implementation). The first subexpression should use
# "ab" then "a" then "bcd".
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
# results like (0,6)(4,5)(6,6).
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
# The above worked on Linux/GLIBC but the following often fail.
# They also trip up OS X / FreeBSD / NetBSD:
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go

5537
src/libregex/unicode.rs Normal file

File diff suppressed because it is too large

587
src/libregex/vm.rs Normal file
View File

@ -0,0 +1,587 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// FIXME: Currently, the VM simulates an NFA. It would be nice to have another
// VM that simulates a DFA.
//
// According to Russ Cox[1], a DFA performs better than an NFA, principally
// because it reuses states previously computed by the machine *and* doesn't
// keep track of capture groups. The drawback of a DFA (aside from its
// complexity) is that it can't accurately return the locations of submatches.
// The NFA *can* do that. (This is my understanding anyway.)
//
// Cox suggests that a DFA ought to be used to answer "does this match" and
// "where does it match" questions. (In the latter, the starting position of
// the match is computed by executing the regex backwards.) Cox also suggests
// that a DFA should be run when asking "where are the submatches", which can
// 1) quickly answer "no" if there's no match and 2) discover the substring
// that matches, which means running the NFA on smaller input.
//
// Currently, the NFA simulation implemented below does some dirty tricks to
// avoid tracking capture groups when they aren't needed (which only works
// for 'is_match', not 'find'). This is a half-measure, but does provide some
// perf improvement.
//
// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
//
// [1] - http://swtch.com/~rsc/regex/regex3.html
use std::cmp;
use std::mem;
use std::slice::MutableVector;
use compile::{
Program,
Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary,
Save, Jump, Split,
};
use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
use parse::unicode::PERLW;
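/// Locations of submatches as a flat list of optional byte offsets: for
/// capture group `k` (where group 0 is the whole match), index `2k` holds the
/// start and `2k + 1` the end, and `None` means the group did not participate
/// in the match.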
pub type CaptureLocs = Vec<Option<uint>>;
/// Indicates the type of match to be performed by the VM.
pub enum MatchKind {
/// Only checks if a match exists or not. Does not return location.
Exists,
/// Returns the start and end indices of the entire match in the input
/// given.
Location,
/// Returns the start and end indices of each submatch in the input given.
Submatches,
}
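// What `run` returns for each kind (see below): `Exists` produces a dummy
// pair (`[Some(0), Some(0)]` on a match, `[None, None]` otherwise),
// `Location` produces the start and end of the whole match and `Submatches`
// produces `2 * num_captures()` offsets laid out as in `CaptureLocs`.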
/// Runs an NFA simulation on the compiled expression given on the search text
/// `input`. The search begins at byte index `start` and ends at byte index
/// `end`. (The range is specified here so that zero-width assertions will work
/// correctly when searching for successive non-overlapping matches.)
///
/// The `which` parameter indicates what kind of capture information the caller
/// wants. There are three choices: match existence only, the location of the
/// entire match, or the location of the entire match along with the locations
/// of each submatch.
pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str,
start: uint, end: uint) -> CaptureLocs {
Nfa {
which: which,
prog: prog,
input: input,
start: start,
end: end,
ic: 0,
chars: CharReader::new(input),
}.run()
}
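// A minimal usage sketch (hypothetical caller; in practice this is reached
// through the `Regex` type):
//
//     let caps = run(Submatches, &prog, "abc", 0, 3);
//     let whole = (*caps.get(0), *caps.get(1)); // Some(start)/Some(end) on a match
//
// where `&prog` is a compiled `Program` for the pattern being searched.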
struct Nfa<'r, 't> {
which: MatchKind,
prog: &'r Program,
input: &'t str,
start: uint,
end: uint,
ic: uint,
chars: CharReader<'t>,
}
/// Indicates the next action to take after a single non-empty instruction
/// is processed.
pub enum StepState {
/// This is returned if and only if a Match instruction is reached and
/// we only care about the existence of a match. It instructs the VM to
/// quit early.
StepMatchEarlyReturn,
/// Indicates that a match was found. Thus, the rest of the states in the
/// *current* queue should be dropped (i.e., leftmost-first semantics).
/// States in the "next" queue can still be processed.
StepMatch,
/// No match was found. Continue with the next state in the queue.
StepContinue,
}
impl<'r, 't> Nfa<'r, 't> {
fn run(&mut self) -> CaptureLocs {
let ncaps = match self.which {
Exists => 0,
Location => 1,
Submatches => self.prog.num_captures(),
};
let mut matched = false;
let ninsts = self.prog.insts.len();
let mut clist = &mut Threads::new(self.which, ninsts, ncaps);
let mut nlist = &mut Threads::new(self.which, ninsts, ncaps);
let mut groups = Vec::from_elem(ncaps * 2, None);
// Determine if the expression starts with a '^' so we can avoid
// simulating .*?
// Make sure multi-line mode isn't enabled for it, otherwise we can't
// drop the initial .*?
let prefix_anchor =
match *self.prog.insts.get(1) {
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
_ => false,
};
self.ic = self.start;
let mut next_ic = self.chars.set(self.start);
while self.ic <= self.end {
if clist.size == 0 {
// We have a match and we're done exploring alternatives.
// Time to quit.
if matched {
break
}
// If there are no threads to try, then we'll have to start
// over at the beginning of the regex.
// BUT, if there's a literal prefix for the program, try to
// jump ahead quickly. If it can't be found, then we can bail
// out early.
if self.prog.prefix.len() > 0 && clist.size == 0 {
let needle = self.prog.prefix.as_slice().as_bytes();
let haystack = self.input.as_bytes().slice_from(self.ic);
match find_prefix(needle, haystack) {
None => break,
Some(i) => {
self.ic += i;
next_ic = self.chars.set(self.ic);
}
}
}
}
// This simulates a preceding '.*?' for every regex: unless we already
// have a match, add a thread for the start of the program at the
// current position in the input.
if clist.size == 0 || (!prefix_anchor && !matched) {
self.add(clist, 0, groups.as_mut_slice())
}
// Now we try to read the next character.
// As a result, the 'step' method will look at the previous
// character.
self.ic = next_ic;
next_ic = self.chars.advance();
let mut i = 0;
while i < clist.size {
let pc = clist.pc(i);
let step_state = self.step(groups.as_mut_slice(), nlist,
clist.groups(i), pc);
match step_state {
StepMatchEarlyReturn => return vec![Some(0), Some(0)],
StepMatch => { matched = true; clist.empty() },
StepContinue => {},
}
i += 1;
}
mem::swap(&mut clist, &mut nlist);
nlist.empty();
}
match self.which {
Exists if matched => vec![Some(0), Some(0)],
Exists => vec![None, None],
Location | Submatches => groups,
}
}
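// Processes the instruction at `pc` for one thread: consuming instructions
// (OneChar, CharClass, Any) are compared against the previously read
// character and, on success, schedule their successor in `nlist`; Match
// reports a result according to `self.which`; zero-width instructions are
// followed in `add`, so they are no-ops here.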
fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads,
caps: &mut [Option<uint>], pc: uint)
-> StepState {
match *self.prog.insts.get(pc) {
Match => {
match self.which {
Exists => {
return StepMatchEarlyReturn
}
Location => {
groups[0] = caps[0];
groups[1] = caps[1];
return StepMatch
}
Submatches => {
for (slot, val) in groups.mut_iter().zip(caps.iter()) {
*slot = *val;
}
return StepMatch
}
}
}
OneChar(c, flags) => {
if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) {
self.add(nlist, pc+1, caps);
}
}
CharClass(ref ranges, flags) => {
if self.chars.prev.is_some() {
let c = self.chars.prev.unwrap();
let negate = flags & FLAG_NEGATED > 0;
let casei = flags & FLAG_NOCASE > 0;
let found = ranges.as_slice();
let found = found.bsearch(|&rc| class_cmp(casei, c, rc));
let found = found.is_some();
if (found && !negate) || (!found && negate) {
self.add(nlist, pc+1, caps);
}
}
}
Any(flags) => {
if flags & FLAG_DOTNL > 0
|| !self.char_eq(false, self.chars.prev, '\n') {
self.add(nlist, pc+1, caps)
}
}
EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_)
| Save(_) | Jump(_) | Split(_, _) => {},
}
StepContinue
}
fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) {
if nlist.contains(pc) {
return
}
// We have to add states to the threads list even if they're empty.
// TL;DR - It prevents cycles.
// If we didn't care about cycles, we'd *only* add threads that
// correspond to non-jumping instructions (OneChar, Any, Match, etc.).
// But, it's possible for valid regexes (like '(a*)*') to result in
// a cycle in the instruction list. e.g., we'll keep chasing the Split
// instructions forever.
// So we add these instructions to our thread queue, but in the main
// VM loop, we look for them but simply ignore them.
// Adding them to the queue prevents them from being revisited so we
// can avoid cycles (and the inevitable stack overflow).
//
// We make a minor optimization by indicating that the state is "empty"
// so that its capture groups are not filled in.
match *self.prog.insts.get(pc) {
EmptyBegin(flags) => {
let multi = flags & FLAG_MULTI > 0;
nlist.add(pc, groups, true);
if self.chars.is_begin()
|| (multi && self.char_is(self.chars.prev, '\n')) {
self.add(nlist, pc + 1, groups)
}
}
EmptyEnd(flags) => {
let multi = flags & FLAG_MULTI > 0;
nlist.add(pc, groups, true);
if self.chars.is_end()
|| (multi && self.char_is(self.chars.cur, '\n')) {
self.add(nlist, pc + 1, groups)
}
}
EmptyWordBoundary(flags) => {
nlist.add(pc, groups, true);
if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) {
self.add(nlist, pc + 1, groups)
}
}
Save(slot) => {
nlist.add(pc, groups, true);
match self.which {
Location if slot <= 1 => {
let old = groups[slot];
groups[slot] = Some(self.ic);
self.add(nlist, pc + 1, groups);
groups[slot] = old;
}
Submatches => {
let old = groups[slot];
groups[slot] = Some(self.ic);
self.add(nlist, pc + 1, groups);
groups[slot] = old;
}
Exists | Location => self.add(nlist, pc + 1, groups),
}
}
Jump(to) => {
nlist.add(pc, groups, true);
self.add(nlist, to, groups)
}
Split(x, y) => {
nlist.add(pc, groups, true);
self.add(nlist, x, groups);
self.add(nlist, y, groups);
}
Match | OneChar(_, _) | CharClass(_, _) | Any(_) => {
nlist.add(pc, groups, false);
}
}
}
// FIXME: For case-insensitive comparisons, this uses the uppercase form of
// each character and tests for equality. IIUC, this does not generalize to
// all of Unicode. I believe we need to check the entire fold for each
// character. This will be easy to add if and when it gets added to Rust's
// standard library.
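// As one concrete case (assuming standard Unicode case mappings): U+212A
// KELVIN SIGN case-folds to 'k', but `to_uppercase` maps 'k' to 'K' (U+004B)
// and leaves U+212A unchanged, so the uppercase comparison below never
// equates the two.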
#[inline]
fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool {
match textc {
None => false,
Some(textc) => {
regc == textc
|| (casei && regc.to_uppercase() == textc.to_uppercase())
}
}
}
#[inline]
fn char_is(&self, textc: Option<char>, regc: char) -> bool {
textc == Some(regc)
}
}
/// CharReader is responsible for maintaining a "previous" and a "current"
/// character. This one-character lookahead is necessary for assertions that
/// look one character before or after the current position.
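///
/// For example, over the input "ab": after `set(0)` the previous character
/// is `None` and the current is `Some('a')`; one `advance()` later they are
/// `Some('a')` and `Some('b')`; after a second `advance()` the current
/// character becomes `None`, which is what `is_begin`/`is_end` key off.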
pub struct CharReader<'t> {
/// The previous character read. It is None only when processing the first
/// character of the input.
pub prev: Option<char>,
/// The current character.
pub cur: Option<char>,
input: &'t str,
next: uint,
}
impl<'t> CharReader<'t> {
/// Returns a new CharReader that advances through the input given.
/// Note that a CharReader has no knowledge of the range in which to search
/// the input.
pub fn new(input: &'t str) -> CharReader<'t> {
CharReader {
prev: None,
cur: None,
input: input,
next: 0,
}
}
/// Sets the previous and current character given an arbitrary byte
/// index (which must lie on a Unicode codepoint boundary).
#[inline]
pub fn set(&mut self, ic: uint) -> uint {
self.prev = None;
self.cur = None;
self.next = 0;
if self.input.len() == 0 {
return 1
}
if ic > 0 {
let i = cmp::min(ic, self.input.len());
let prev = self.input.char_range_at_reverse(i);
self.prev = Some(prev.ch);
}
if ic < self.input.len() {
let cur = self.input.char_range_at(ic);
self.cur = Some(cur.ch);
self.next = cur.next;
self.next
} else {
self.input.len() + 1
}
}
/// Does the same as `set`, except it always advances to the next
/// character in the input (and therefore does half as many UTF-8 decodings).
#[inline]
pub fn advance(&mut self) -> uint {
self.prev = self.cur;
if self.next < self.input.len() {
let cur = self.input.char_range_at(self.next);
self.cur = Some(cur.ch);
self.next = cur.next;
} else {
self.cur = None;
self.next = self.input.len() + 1;
}
self.next
}
/// Returns true if and only if this is the beginning of the input
/// (ignoring the range of the input to search).
#[inline]
pub fn is_begin(&self) -> bool { self.prev.is_none() }
/// Returns true if and only if this is the end of the input
/// (ignoring the range of the input to search).
#[inline]
pub fn is_end(&self) -> bool { self.cur.is_none() }
/// Returns true if and only if the current position is a word boundary.
/// (Ignoring the range of the input to search.)
pub fn is_word_boundary(&self) -> bool {
if self.is_begin() {
return is_word(self.cur)
}
if self.is_end() {
return is_word(self.prev)
}
(is_word(self.cur) && !is_word(self.prev))
|| (is_word(self.prev) && !is_word(self.cur))
}
}
struct Thread {
pc: uint,
groups: Vec<Option<uint>>,
}
struct Threads {
which: MatchKind,
queue: Vec<Thread>,
sparse: Vec<uint>,
size: uint,
}
impl Threads {
// This is using a wicked neat trick to provide constant time lookup
// for threads in the queue using a sparse set. A queue of threads is
// allocated once with maximal size when the VM initializes and is reused
// throughout execution. That is, there should be zero allocation during
// the execution of a VM.
//
// See http://research.swtch.com/sparse for the deets.
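// Concretely: `add(5, ..)` when `size` is 0 writes the thread into
// `queue[0]`, sets `sparse[5] = 0` and bumps `size` to 1; `contains(5)` then
// checks `sparse[5] < size && queue[sparse[5]].pc == 5`, so stale entries in
// `sparse` left over from a previous `empty()` are harmless.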
fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads {
Threads {
which: which,
queue: Vec::from_fn(num_insts, |_| {
Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) }
}),
sparse: Vec::from_elem(num_insts, 0u),
size: 0,
}
}
fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) {
let t = self.queue.get_mut(self.size);
t.pc = pc;
match (empty, self.which) {
(_, Exists) | (true, _) => {},
(false, Location) => {
*t.groups.get_mut(0) = groups[0];
*t.groups.get_mut(1) = groups[1];
}
(false, Submatches) => {
for (slot, val) in t.groups.mut_iter().zip(groups.iter()) {
*slot = *val;
}
}
}
*self.sparse.get_mut(pc) = self.size;
self.size += 1;
}
#[inline]
fn contains(&self, pc: uint) -> bool {
let s = *self.sparse.get(pc);
s < self.size && self.queue.get(s).pc == pc
}
#[inline]
fn empty(&mut self) {
self.size = 0;
}
#[inline]
fn pc(&self, i: uint) -> uint {
self.queue.get(i).pc
}
#[inline]
fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] {
self.queue.get_mut(i).groups.as_mut_slice()
}
}
/// Returns true if the character is a word character, according to the
/// (Unicode friendly) Perl character class '\w'.
/// Note that this is only used for testing word boundaries. The actual '\w'
/// is encoded as a CharClass instruction.
pub fn is_word(c: Option<char>) -> bool {
let c = match c {
None => return false,
Some(c) => c,
};
// Try the common ASCII case before invoking binary search.
match c {
'_' | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' => true,
_ => PERLW.bsearch(|&(start, end)| {
if c >= start && c <= end {
Equal
} else if start > c {
Greater
} else {
Less
}
}).is_some()
}
}
/// Given a character and a single character class range, return the ordering
/// of the range relative to the character: `Greater` if the range starts after
/// the character, `Equal` if the character falls within the range (inclusive),
/// and `Less` if the range ends before the character.
///
/// If `casei` is `true`, then this ordering is computed case insensitively.
///
/// This function is meant to be used with a binary search.
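///
/// For instance, `class_cmp(false, 'c', ('a', 'f'))` is `Equal`,
/// `class_cmp(false, 'c', ('d', 'f'))` is `Greater` (the range lies entirely
/// after the character) and `class_cmp(false, 'z', ('a', 'f'))` is `Less`,
/// matching the orientation that `bsearch` expects from its comparator.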
#[inline]
fn class_cmp(casei: bool, mut textc: char,
(mut start, mut end): (char, char)) -> Ordering {
if casei {
// FIXME: This is pretty ridiculous. All of this case conversion
// can be moved outside this function:
// 1) textc should be uppercased outside the bsearch.
// 2) the character class itself should be uppercased either in the
// parser or the compiler.
// FIXME: This is too simplistic for correct Unicode support.
// See also: char_eq
textc = textc.to_uppercase();
start = start.to_uppercase();
end = end.to_uppercase();
}
if textc >= start && textc <= end {
Equal
} else if start > textc {
Greater
} else {
Less
}
}
/// Returns the starting location of `needle` in `haystack`.
/// If `needle` is not in `haystack`, then `None` is returned.
///
/// Note that this is using a naive substring algorithm.
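///
/// For example, `find_prefix("bc".as_bytes(), "abcd".as_bytes())` returns
/// `Some(1)`, while an empty needle or a needle longer than the haystack
/// always yields `None`.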
#[inline]
pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> {
let (hlen, nlen) = (haystack.len(), needle.len());
if nlen > hlen || nlen == 0 {
return None
}
let mut hayi = 0u;
'HAYSTACK: loop {
if hayi > hlen - nlen {
break
}
let mut nedi = 0;
while nedi < nlen {
if haystack[hayi+nedi] != needle[nedi] {
hayi += 1;
continue 'HAYSTACK
}
nedi += 1;
}
return Some(hayi)
}
None
}

684
src/libregex_macros/lib.rs Normal file
View File

@ -0,0 +1,684 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! This crate provides the `regex!` macro. Its use is documented in the
//! `regex` crate.
#![crate_id = "regex_macros#0.11-pre"]
#![crate_type = "dylib"]
#![experimental]
#![license = "MIT/ASL2"]
#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
html_favicon_url = "http://www.rust-lang.org/favicon.ico",
html_root_url = "http://static.rust-lang.org/doc/master")]
#![feature(macro_registrar, managed_boxes, quote)]
extern crate regex;
extern crate syntax;
use syntax::ast;
use syntax::codemap;
use syntax::ext::base::{
SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult,
NormalTT, BasicMacroExpander,
};
use syntax::parse;
use syntax::parse::token;
use syntax::print::pprust;
use regex::Regex;
use regex::native::{
OneChar, CharClass, Any, Save, Jump, Split,
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
Program, Dynamic, Native,
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
};
/// For the `regex!` syntax extension. Do not use.
#[macro_registrar]
#[doc(hidden)]
pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) {
let expander = ~BasicMacroExpander { expander: native, span: None };
register(token::intern("regex"), NormalTT(expander, None))
}
/// Generates specialized code for the Pike VM for a particular regular
/// expression.
///
/// There are two primary differences between the code generated here and the
/// general code in vm.rs.
///
/// 1. All heap allocation is removed. Sized vector types are used instead.
/// Care must be taken to make sure that these vectors are not copied
/// gratuitously. (If you're not sure, run the benchmarks. They will yell
/// at you if you do.)
/// 2. The main `match instruction { ... }` expressions are replaced with more
/// direct `match pc { ... }`. The generators can be found in
/// `step_insts` and `add_insts`.
///
/// Other more minor changes include eliding code when possible (although this
/// isn't completely thorough at the moment), and translating character class
/// matching from using a binary search to a simple `match` expression (see
/// `match_class`).
///
/// It is strongly recommended to read the dynamic implementation in vm.rs
/// first before trying to understand the code generator. The implementation
/// strategy is identical and vm.rs has comments and will be easier to follow.
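// As a rough illustration (the exact instruction numbering depends on the
// `compile` module): for `regex!("a")`, `step_insts` below emits a
// literal-pc arm along the lines of
// `N => { if self.chars.prev == Some('a') { self.add(nlist, N + 1, caps); } }`
// in place of the generic `OneChar(..)` arm that vm.rs matches at runtime.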
fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
-> ~MacResult {
let regex = match parse(cx, tts) {
Some(r) => r,
// error is logged in 'parse' with cx.span_err
None => return DummyResult::any(sp),
};
let re = match Regex::new(regex.to_owned()) {
Ok(re) => re,
Err(err) => {
cx.span_err(sp, err.to_str());
return DummyResult::any(sp)
}
};
let prog = match re.p {
Dynamic(ref prog) => prog.clone(),
Native(_) => unreachable!(),
};
let mut gen = NfaGen {
cx: &*cx, sp: sp, prog: prog,
names: re.names.clone(), original: re.original.clone(),
};
MacExpr::new(gen.code())
}
struct NfaGen<'a> {
cx: &'a ExtCtxt<'a>,
sp: codemap::Span,
prog: Program,
names: ~[Option<~str>],
original: ~str,
}
impl<'a> NfaGen<'a> {
fn code(&mut self) -> @ast::Expr {
// Most or all of the following things are used in the quasiquoted
// expression returned.
let num_cap_locs = 2 * self.prog.num_captures();
let num_insts = self.prog.insts.len();
let cap_names = self.vec_expr(self.names,
|cx, name| match name {
&Some(ref name) => {
let name = name.as_slice();
quote_expr!(cx, Some(~$name))
}
&None => quote_expr!(cx, None),
}
);
let prefix_anchor =
match self.prog.insts.as_slice()[1] {
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
_ => false,
};
let init_groups = self.vec_from_fn(num_cap_locs,
|cx| quote_expr!(cx, None));
let prefix_bytes = self.vec_expr(self.prog.prefix.as_slice().as_bytes(),
|cx, b| quote_expr!(cx, $b));
let check_prefix = self.check_prefix();
let step_insts = self.step_insts();
let add_insts = self.add_insts();
let regex = self.original.as_slice();
quote_expr!(self.cx, {
fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
start: uint, end: uint) -> Vec<Option<uint>> {
#![allow(unused_imports)]
use regex::native::{
MatchKind, Exists, Location, Submatches,
StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
CharReader, find_prefix,
};
return Nfa {
which: which,
input: input,
ic: 0,
chars: CharReader::new(input),
}.run(start, end);
type Captures = [Option<uint>, ..$num_cap_locs];
struct Nfa<'t> {
which: MatchKind,
input: &'t str,
ic: uint,
chars: CharReader<'t>,
}
impl<'t> Nfa<'t> {
#[allow(unused_variable)]
fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> {
let mut matched = false;
let prefix_bytes: &[u8] = &$prefix_bytes;
let mut clist = &mut Threads::new(self.which);
let mut nlist = &mut Threads::new(self.which);
let mut groups = $init_groups;
self.ic = start;
let mut next_ic = self.chars.set(start);
while self.ic <= end {
if clist.size == 0 {
if matched {
break
}
$check_prefix
}
if clist.size == 0 || (!$prefix_anchor && !matched) {
self.add(clist, 0, &mut groups)
}
self.ic = next_ic;
next_ic = self.chars.advance();
let mut i = 0;
while i < clist.size {
let pc = clist.pc(i);
let step_state = self.step(&mut groups, nlist,
clist.groups(i), pc);
match step_state {
StepMatchEarlyReturn =>
return vec![Some(0u), Some(0u)],
StepMatch => { matched = true; clist.empty() },
StepContinue => {},
}
i += 1;
}
::std::mem::swap(&mut clist, &mut nlist);
nlist.empty();
}
match self.which {
Exists if matched => vec![Some(0u), Some(0u)],
Exists => vec![None, None],
Location | Submatches => groups.iter().map(|x| *x).collect(),
}
}
// Sometimes `nlist` is never used (for empty regexes).
#[allow(unused_variable)]
#[inline]
fn step(&self, groups: &mut Captures, nlist: &mut Threads,
caps: &mut Captures, pc: uint) -> StepState {
$step_insts
StepContinue
}
fn add(&self, nlist: &mut Threads, pc: uint,
groups: &mut Captures) {
if nlist.contains(pc) {
return
}
$add_insts
}
}
struct Thread {
pc: uint,
groups: Captures,
}
struct Threads {
which: MatchKind,
queue: [Thread, ..$num_insts],
sparse: [uint, ..$num_insts],
size: uint,
}
impl Threads {
fn new(which: MatchKind) -> Threads {
Threads {
which: which,
// These unsafe blocks are used for performance reasons, as it
// gives us a zero-cost initialization of a sparse set. The
// trick is described in more detail here:
// http://research.swtch.com/sparse
// The idea here is to avoid initializing threads that never
// need to be initialized, particularly for larger regexes with
// a lot of instructions.
queue: unsafe { ::std::mem::uninit() },
sparse: unsafe { ::std::mem::uninit() },
size: 0,
}
}
#[inline]
fn add(&mut self, pc: uint, groups: &Captures) {
let t = &mut self.queue[self.size];
t.pc = pc;
match self.which {
Exists => {},
Location => {
t.groups[0] = groups[0];
t.groups[1] = groups[1];
}
Submatches => {
for (slot, val) in t.groups.mut_iter().zip(groups.iter()) {
*slot = *val;
}
}
}
self.sparse[pc] = self.size;
self.size += 1;
}
#[inline]
fn add_empty(&mut self, pc: uint) {
self.queue[self.size].pc = pc;
self.sparse[pc] = self.size;
self.size += 1;
}
#[inline]
fn contains(&self, pc: uint) -> bool {
let s = self.sparse[pc];
s < self.size && self.queue[s].pc == pc
}
#[inline]
fn empty(&mut self) {
self.size = 0;
}
#[inline]
fn pc(&self, i: uint) -> uint {
self.queue[i].pc
}
#[inline]
fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures {
&'r mut self.queue[i].groups
}
}
}
::regex::Regex {
original: ~$regex,
names: ~$cap_names,
p: ::regex::native::Native(exec),
}
})
}
// Generates code for the `add` method, which is responsible for adding
// zero-width states to the next queue of states to visit.
fn add_insts(&self) -> @ast::Expr {
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
let nextpc = pc + 1;
let body = match *inst {
EmptyBegin(flags) => {
let nl = '\n';
let cond =
if flags & FLAG_MULTI > 0 {
quote_expr!(self.cx,
self.chars.is_begin()
|| self.chars.prev == Some($nl)
)
} else {
quote_expr!(self.cx, self.chars.is_begin())
};
quote_expr!(self.cx, {
nlist.add_empty($pc);
if $cond { self.add(nlist, $nextpc, &mut *groups) }
})
}
EmptyEnd(flags) => {
let nl = '\n';
let cond =
if flags & FLAG_MULTI > 0 {
quote_expr!(self.cx,
self.chars.is_end()
|| self.chars.cur == Some($nl)
)
} else {
quote_expr!(self.cx, self.chars.is_end())
};
quote_expr!(self.cx, {
nlist.add_empty($pc);
if $cond { self.add(nlist, $nextpc, &mut *groups) }
})
}
EmptyWordBoundary(flags) => {
let cond =
if flags & FLAG_NEGATED > 0 {
quote_expr!(self.cx, !self.chars.is_word_boundary())
} else {
quote_expr!(self.cx, self.chars.is_word_boundary())
};
quote_expr!(self.cx, {
nlist.add_empty($pc);
if $cond { self.add(nlist, $nextpc, &mut *groups) }
})
}
Save(slot) => {
let save = quote_expr!(self.cx, {
let old = groups[$slot];
groups[$slot] = Some(self.ic);
self.add(nlist, $nextpc, &mut *groups);
groups[$slot] = old;
});
let add = quote_expr!(self.cx, {
self.add(nlist, $nextpc, &mut *groups);
});
// If this is saving a submatch location but we request
// existence or only full match location, then we can skip
// right over it every time.
if slot > 1 {
quote_expr!(self.cx, {
nlist.add_empty($pc);
match self.which {
Submatches => $save,
Exists | Location => $add,
}
})
} else {
quote_expr!(self.cx, {
nlist.add_empty($pc);
match self.which {
Submatches | Location => $save,
Exists => $add,
}
})
}
}
Jump(to) => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
self.add(nlist, $to, &mut *groups);
})
}
Split(x, y) => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
self.add(nlist, $x, &mut *groups);
self.add(nlist, $y, &mut *groups);
})
}
// For Match, OneChar, CharClass, Any
_ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
};
self.arm_inst(pc, body)
}).collect::<Vec<ast::Arm>>();
self.match_insts(arms)
}
// Generates the code for the `step` method, which processes all states
// in the current queue that consume a single character.
fn step_insts(&self) -> @ast::Expr {
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
let nextpc = pc + 1;
let body = match *inst {
Match => {
quote_expr!(self.cx, {
match self.which {
Exists => {
return StepMatchEarlyReturn
}
Location => {
groups[0] = caps[0];
groups[1] = caps[1];
return StepMatch
}
Submatches => {
for (slot, val) in groups.mut_iter().zip(caps.iter()) {
*slot = *val;
}
return StepMatch
}
}
})
}
OneChar(c, flags) => {
if flags & FLAG_NOCASE > 0 {
let upc = c.to_uppercase();
quote_expr!(self.cx, {
let upc = self.chars.prev.map(|c| c.to_uppercase());
if upc == Some($upc) {
self.add(nlist, $nextpc, caps);
}
})
} else {
quote_expr!(self.cx, {
if self.chars.prev == Some($c) {
self.add(nlist, $nextpc, caps);
}
})
}
}
CharClass(ref ranges, flags) => {
let negate = flags & FLAG_NEGATED > 0;
let casei = flags & FLAG_NOCASE > 0;
let get_char =
if casei {
quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase())
} else {
quote_expr!(self.cx, self.chars.prev.unwrap())
};
let negcond =
if negate {
quote_expr!(self.cx, !found)
} else {
quote_expr!(self.cx, found)
};
let mranges = self.match_class(casei, ranges.as_slice());
quote_expr!(self.cx, {
if self.chars.prev.is_some() {
let c = $get_char;
let found = $mranges;
if $negcond {
self.add(nlist, $nextpc, caps);
}
}
})
}
Any(flags) => {
if flags & FLAG_DOTNL > 0 {
quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
} else {
let nl = '\n'; // no char lits allowed? wtf?
quote_expr!(self.cx, {
if self.chars.prev != Some($nl) {
self.add(nlist, $nextpc, caps)
}
})
}
}
// EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
_ => quote_expr!(self.cx, {}),
};
self.arm_inst(pc, body)
}).collect::<Vec<ast::Arm>>();
self.match_insts(arms)
}
// Translates a character class into a match expression.
// This avoids a binary search (and the match is hopefully compiled down to
// a jump table).
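// For example, the ranges [('a', 'c'), ('x', 'z')] become, roughly,
// `match c { 'a' .. 'c' => true, 'x' .. 'z' => true, _ => false }`.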
fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr {
let mut arms = ranges.iter().map(|&(mut start, mut end)| {
if casei {
start = start.to_uppercase();
end = end.to_uppercase();
}
ast::Arm {
attrs: vec!(),
pats: vec!(@ast::Pat{
id: ast::DUMMY_NODE_ID,
span: self.sp,
node: ast::PatRange(quote_expr!(self.cx, $start),
quote_expr!(self.cx, $end)),
}),
guard: None,
body: quote_expr!(self.cx, true),
}
}).collect::<Vec<ast::Arm>>();
arms.push(self.wild_arm_expr(quote_expr!(self.cx, false)));
let match_on = quote_expr!(self.cx, c);
self.dummy_expr(ast::ExprMatch(match_on, arms))
}
// Generates code for checking a literal prefix of the search string.
// The code is only generated if the regex *has* a literal prefix.
// Otherwise, a no-op is returned.
fn check_prefix(&self) -> @ast::Expr {
if self.prog.prefix.len() == 0 {
quote_expr!(self.cx, {})
} else {
quote_expr!(self.cx,
if clist.size == 0 {
let haystack = self.input.as_bytes().slice_from(self.ic);
match find_prefix(prefix_bytes, haystack) {
None => break,
Some(i) => {
self.ic += i;
next_ic = self.chars.set(self.ic);
}
}
}
)
}
}
// Builds a `match pc { ... }` expression from a list of arms, specifically
// for matching the current program counter with an instruction.
// A wild-card arm is automatically added that executes a no-op. It will
// never be used, but is added to satisfy the compiler complaining about
// non-exhaustive patterns.
fn match_insts(&self, mut arms: Vec<ast::Arm>) -> @ast::Expr {
let mat_pc = quote_expr!(self.cx, pc);
arms.push(self.wild_arm_expr(quote_expr!(self.cx, {})));
self.dummy_expr(ast::ExprMatch(mat_pc, arms))
}
// Creates a match arm for the instruction at `pc` with the expression
// `body`.
fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm {
ast::Arm {
attrs: vec!(),
pats: vec!(@ast::Pat{
id: ast::DUMMY_NODE_ID,
span: self.sp,
node: ast::PatLit(quote_expr!(self.cx, $pc)),
}),
guard: None,
body: body,
}
}
// Creates a wild-card match arm with the expression `body`.
fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm {
ast::Arm {
attrs: vec!(),
pats: vec!(@ast::Pat{
id: ast::DUMMY_NODE_ID,
span: self.sp,
node: ast::PatWild,
}),
guard: None,
body: body,
}
}
// Builds a `[a, b, .., len]` expression where each element is the result
// of executing `to_expr`.
fn vec_from_fn(&self, len: uint, to_expr: |&ExtCtxt| -> @ast::Expr)
-> @ast::Expr {
self.vec_expr(Vec::from_elem(len, ()).as_slice(),
|cx, _| to_expr(cx))
}
// Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr`
// on each element in `xs`.
fn vec_expr<T>(&self, xs: &[T], to_expr: |&ExtCtxt, &T| -> @ast::Expr)
-> @ast::Expr {
let mut exprs = vec!();
for x in xs.iter() {
exprs.push(to_expr(self.cx, x))
}
let vec_exprs = self.dummy_expr(ast::ExprVec(exprs));
quote_expr!(self.cx, $vec_exprs)
}
// Creates an expression with a dummy node ID given an underlying
// `ast::Expr_`.
fn dummy_expr(&self, e: ast::Expr_) -> @ast::Expr {
@ast::Expr {
id: ast::DUMMY_NODE_ID,
node: e,
span: self.sp,
}
}
}
// This trait is defined in the quote module in the syntax crate, but I
// don't think it's exported.
// Interestingly, quote_expr! only requires that a 'to_tokens' method be
// defined rather than satisfying a particular trait.
#[doc(hidden)]
trait ToTokens {
fn to_tokens(&self, cx: &ExtCtxt) -> Vec<ast::TokenTree>;
}
impl ToTokens for char {
fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> {
vec!(ast::TTTok(codemap::DUMMY_SP, token::LIT_CHAR((*self) as u32)))
}
}
impl ToTokens for bool {
fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> {
let ident = token::IDENT(token::str_to_ident(self.to_str()), false);
vec!(ast::TTTok(codemap::DUMMY_SP, ident))
}
}
/// Looks for a single string literal and returns it.
/// Otherwise, logs an error with cx.span_err and returns None.
fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> {
let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(),
Vec::from_slice(tts));
let entry = cx.expand_expr(parser.parse_expr());
let regex = match entry.node {
ast::ExprLit(lit) => {
match lit.node {
ast::LitStr(ref s, _) => s.to_str(),
_ => {
cx.span_err(entry.span, format!(
"expected string literal but got `{}`",
pprust::lit_to_str(lit)));
return None
}
}
}
_ => {
cx.span_err(entry.span, format!(
"expected string literal but got `{}`",
pprust::expr_to_str(entry)));
return None
}
};
if !parser.eat(&token::EOF) {
cx.span_err(parser.span, "only one string literal allowed");
return None;
}
Some(regex)
}

View File

@ -0,0 +1,94 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-stage1
// ignore-cross-compile #12102
#![feature(macro_rules, phase)]
extern crate regex;
#[phase(syntax)] extern crate regex_macros;
extern crate sync;
use std::io;
use regex::{NoExpand, Regex};
use sync::Arc;
fn count_matches(seq: &str, variant: &Regex) -> int {
let mut n = 0;
for _ in variant.find_iter(seq) {
n += 1;
}
n
}
fn main() {
let mut rdr = if std::os::getenv("RUST_BENCH").is_some() {
let fd = io::File::open(&Path::new("shootout-k-nucleotide.data"));
~io::BufferedReader::new(fd) as ~io::Reader
} else {
~io::stdin() as ~io::Reader
};
let mut seq = StrBuf::from_str(rdr.read_to_str().unwrap());
let ilen = seq.len();
seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand(""));
let seq_arc = Arc::new(seq.clone()); // copy before it moves
let clen = seq.len();
let mut seqlen = sync::Future::spawn(proc() {
let substs = ~[
(regex!("B"), "(c|g|t)"),
(regex!("D"), "(a|g|t)"),
(regex!("H"), "(a|c|t)"),
(regex!("K"), "(g|t)"),
(regex!("M"), "(a|c)"),
(regex!("N"), "(a|c|g|t)"),
(regex!("R"), "(a|g)"),
(regex!("S"), "(c|g)"),
(regex!("V"), "(a|c|g)"),
(regex!("W"), "(a|t)"),
(regex!("Y"), "(c|t)"),
];
let mut seq = seq;
for (re, replacement) in substs.move_iter() {
seq = re.replace_all(seq.as_slice(), NoExpand(replacement));
}
seq.len()
});
let variants = ~[
regex!("agggtaaa|tttaccct"),
regex!("[cgt]gggtaaa|tttaccc[acg]"),
regex!("a[act]ggtaaa|tttacc[agt]t"),
regex!("ag[act]gtaaa|tttac[agt]ct"),
regex!("agg[act]taaa|ttta[agt]cct"),
regex!("aggg[acg]aaa|ttt[cgt]ccct"),
regex!("agggt[cgt]aa|tt[acg]accct"),
regex!("agggta[cgt]a|t[acg]taccct"),
regex!("agggtaa[cgt]|[acg]ttaccct"),
];
let (mut variant_strs, mut counts) = (vec!(), vec!());
for variant in variants.move_iter() {
let seq_arc_copy = seq_arc.clone();
variant_strs.push(variant.to_str().to_owned());
counts.push(sync::Future::spawn(proc() {
count_matches(seq_arc_copy.as_slice(), &variant)
}));
}
for (i, variant) in variant_strs.iter().enumerate() {
println!("{} {}", variant, counts.get_mut(i).get());
}
println!("");
println!("{}", ilen);
println!("{}", clen);
println!("{}", seqlen.get());
}

View File

@ -0,0 +1,26 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-stage1
#![feature(phase)]
extern crate regex;
#[phase(syntax)] extern crate regex_macros;
// Tests to make sure that `regex!` will produce a compile error when given
// an invalid regular expression.
// More exhaustive failure tests for the parser are done with the traditional
// unit testing infrastructure, since both dynamic and native regexes use the
// same parser.
fn main() {
let _ = regex!("("); //~ ERROR Regex syntax error
}