1253: Share literal validation logic with compiler r=matklad a=matklad

This is neat: the unescape module is literally what the compiler is using right now:

c6ac575648/src/libsyntax/parse/unescape.rs

So, yeah, code sharing via copy-paste!



Co-authored-by: Aleksey Kladov <aleksey.kladov@gmail.com>
This commit is contained in:
bors[bot] 2019-05-07 16:43:10 +00:00
commit d3efedb752
10 changed files with 620 additions and 1201 deletions

View File

@ -23,7 +23,6 @@ mod syntax_node;
mod syntax_text;
mod syntax_error;
mod parsing;
mod string_lexing;
mod validation;
mod ptr;

View File

@ -1,333 +0,0 @@
use crate::{TextRange, TextUnit};
use self::StringComponentKind::*;
/// One lexed piece of a quoted literal: its location and its classification.
#[derive(Debug, Eq, PartialEq, Clone)]
pub(crate) struct StringComponent {
/// Span of the component, as offsets into the literal text given to the lexer.
pub(crate) range: TextRange,
/// What sort of component this is (plain code point, some form of escape, ...).
pub(crate) kind: StringComponentKind,
}
/// Classification of a `StringComponent`.
///
/// The lexer only classifies by shape; whether an escape is actually valid is
/// decided later by the validation code.
#[derive(Debug, Eq, PartialEq, Clone)]
pub(crate) enum StringComponentKind {
/// A `\` directly followed by a newline, together with the whitespace after it.
IgnoreNewline,
/// A single character that does not start with `\`.
CodePoint,
/// A `\` followed by any character other than `x` or `u` (or by nothing at all).
AsciiEscape,
/// A `\x` escape together with the characters lexed as its code.
AsciiCodeEscape,
/// A `\u` escape, including the `{...}` part when present.
UnicodeEscape,
}
/// Creates a lazy lexer over the text of a quoted literal.
///
/// `prefix` is the optional literal prefix (only `b` is supported), `quote` is
/// the delimiter (`'` or `"`), and `src` is the full literal text including
/// prefix and quotes.
///
/// # Panics
///
/// Panics with "invalid prefix" for any prefix other than `b`, and with
/// "invalid quote" for any delimiter other than `'` or `"`.
pub(crate) fn parse_quoted_literal(
    prefix: Option<char>,
    quote: char,
    src: &str,
) -> StringComponentIter {
    // The prefix is checked first so that an invalid prefix wins over an invalid quote.
    let prefix_byte = match prefix {
        None => None,
        Some('b') => Some(b'b'),
        Some(_) => panic!("invalid prefix"),
    };
    let quote_byte = if quote == '\'' {
        b'\''
    } else if quote == '"' {
        b'"'
    } else {
        panic!("invalid quote")
    };
    StringComponentIter {
        src,
        prefix: prefix_byte,
        quote: quote_byte,
        pos: 0,
        has_closing_quote: false,
        suffix: None,
    }
}
/// Lexer state for a single quoted literal; yields `StringComponent`s via `Iterator`.
pub(crate) struct StringComponentIter<'a> {
// Full text of the literal, including prefix and quotes.
src: &'a str,
// Expected prefix byte, if the literal has one.
prefix: Option<u8>,
// Delimiter byte of the literal.
quote: u8,
// Current byte offset into `src`.
pos: usize,
/// Set once the closing quote has been consumed; starts out as `false`.
pub(crate) has_closing_quote: bool,
/// Range of any text found after the closing quote; `None` until such text is seen.
pub(crate) suffix: Option<TextRange>,
}
impl<'a> Iterator for StringComponentIter<'a> {
    type Item = StringComponent;

    /// Yields the next component of the literal, or `None` when the body is exhausted.
    ///
    /// The first call consumes the prefix (if any) and the opening quote. When
    /// no component is left, the closing quote and any trailing suffix are
    /// consumed and recorded in `has_closing_quote` / `suffix`.
    ///
    /// # Panics
    ///
    /// Panics if the literal does not start with the expected prefix and quote,
    /// or if input remains after the body has been lexed.
    fn next(&mut self) -> Option<StringComponent> {
        let at_start = self.pos == 0;
        if at_start {
            if let Some(prefix) = self.prefix {
                let first = self.advance();
                assert!(first == prefix as char, "literal should start with a {:?}", prefix as char,);
            }
            let opening = self.advance();
            assert!(
                opening == self.quote as char,
                "literal should start with a {:?}",
                self.quote as char,
            );
        }

        let component = self.parse_component();
        if component.is_some() {
            return component;
        }

        // No components left: what remains is either the closing quote
        // (optionally followed by a suffix) or nothing at all.
        let closing = self.quote as char;
        if self.peek() == Some(closing) {
            self.advance();
            self.has_closing_quote = true;
            if let suffix @ Some(_) = self.parse_suffix() {
                self.suffix = suffix;
            }
        }
        assert!(
            self.peek().is_none(),
            "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}",
            self.src,
            self.pos,
            self.src.len()
        );
        None
    }
}
impl<'a> StringComponentIter<'a> {
    /// Returns the character at the current position without consuming it,
    /// or `None` at the end of the input.
    fn peek(&self) -> Option<char> {
        self.src[self.pos..].chars().next()
    }

    /// Consumes and returns the character at the current position.
    ///
    /// # Panics
    ///
    /// Panics if the end of the input has already been reached.
    fn advance(&mut self) -> char {
        let next = self.peek().expect("cannot advance if end of input is reached");
        self.pos += next.len_utf8();
        next
    }

    /// Lexes one component of the literal body.
    ///
    /// Returns `None` at the end of the input or when the next character is
    /// the closing quote (which is left unconsumed).
    fn parse_component(&mut self) -> Option<StringComponent> {
        let next = self.peek()?;

        // Ignore string close
        if next == self.quote as char {
            return None;
        }

        let start = self.start_range();
        self.advance();

        if next != '\\' {
            return Some(self.finish_component(start, CodePoint));
        }

        // Strings can use `\` to ignore newlines, so we first try to parse one of those
        // before falling back to parsing char escapes
        if self.quote == b'"' {
            if let Some(component) = self.parse_ignore_newline(start) {
                return Some(component);
            }
        }
        Some(self.parse_escape(start))
    }

    /// Lexes a line continuation; the leading `\` has already been consumed.
    ///
    /// Returns `None` without consuming anything unless the next character is
    /// `\n` or `\r`.
    fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
        // In string literals, when a `\` occurs immediately before the newline, the `\`,
        // the newline, and all whitespace at the beginning of the next line are ignored
        match self.peek() {
            Some('\n') | Some('\r') => {
                self.skip_whitespace();
                Some(self.finish_component(start, IgnoreNewline))
            }
            _ => None,
        }
    }

    /// Consumes characters for as long as they are whitespace.
    fn skip_whitespace(&mut self) {
        while self.peek().map_or(false, |c| c.is_whitespace()) {
            self.advance();
        }
    }

    /// Lexes an escape sequence; the leading `\` has already been consumed.
    ///
    /// A `\` at the very end of the input is reported as an `AsciiEscape`
    /// consisting only of the backslash.
    fn parse_escape(&mut self, start: TextUnit) -> StringComponent {
        if self.peek().is_none() {
            return self.finish_component(start, AsciiEscape);
        }

        match self.advance() {
            'x' => self.parse_ascii_code_escape(start),
            'u' => self.parse_unicode_escape(start),
            _ => self.finish_component(start, AsciiEscape),
        }
    }

    /// Lexes the remainder of a `\u` escape; `\u` has already been consumed.
    ///
    /// If a `{` follows, everything up to and including the next `}` (or up to
    /// the end of the input when there is none) becomes part of the component.
    fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent {
        if self.peek() == Some('{') {
            self.advance();

            // Parse anything until we reach `}`
            while let Some(next) = self.peek() {
                self.advance();
                if next == '}' {
                    break;
                }
            }
        }
        self.finish_component(start, UnicodeEscape)
    }

    /// Lexes the remainder of a `\x` escape; `\x` has already been consumed.
    ///
    /// Takes characters until the literal's own closing quote is reached or at
    /// least two bytes of code have been consumed.
    fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent {
        let code_start = self.pos;
        while let Some(next) = self.peek() {
            // Stop at *this* literal's delimiter (not a hard-coded `'`), so that
            // `"\x"` keeps its closing quote. `>=` rather than `==` because
            // `pos` counts bytes and a multi-byte character can step over 2.
            if next == self.quote as char || self.pos - code_start >= 2 {
                break;
            }
            self.advance();
        }
        self.finish_component(start, AsciiCodeEscape)
    }

    /// Consumes everything that is left of the input and returns its range,
    /// or `None` if nothing is left.
    fn parse_suffix(&mut self) -> Option<TextRange> {
        let start = self.start_range();
        self.peek()?;
        self.pos = self.src.len();
        Some(self.finish_range(start))
    }

    /// Returns the current position as the start offset of a new range.
    fn start_range(&self) -> TextUnit {
        TextUnit::from_usize(self.pos)
    }

    /// Builds the range from `start` to the current position.
    fn finish_range(&self, start: TextUnit) -> TextRange {
        TextRange::from_to(start, TextUnit::from_usize(self.pos))
    }

    /// Builds a component of the given kind spanning `start` to the current position.
    fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent {
        let range = self.finish_range(start);
        StringComponent { range, kind }
    }
}
// Unit tests for the component lexer; every test lexes a char-style literal
// (no prefix, `'` as the quote).
#[cfg(test)]
mod tests {
use super::*;
// Lexes `src` as a char literal and returns whether a closing quote was seen,
// together with all components.
fn parse(src: &str) -> (bool, Vec<StringComponent>) {
let component_iterator = &mut parse_quoted_literal(None, '\'', src);
let components: Vec<_> = component_iterator.collect();
(component_iterator.has_closing_quote, components)
}
// Expects an unclosed literal with exactly one component and returns that component.
fn unclosed_char_component(src: &str) -> StringComponent {
let (has_closing_quote, components) = parse(src);
assert!(!has_closing_quote, "char should not have closing quote");
assert!(components.len() == 1);
components[0].clone()
}
// Expects a closed literal with exactly one component and returns that component.
fn closed_char_component(src: &str) -> StringComponent {
let (has_closing_quote, components) = parse(src);
assert!(has_closing_quote, "char should have closing quote");
assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components);
components[0].clone()
}
// Expects a closed literal and returns all of its components.
fn closed_char_components(src: &str) -> Vec<StringComponent> {
let (has_closing_quote, components) = parse(src);
assert!(has_closing_quote, "char should have closing quote");
components
}
// Range from offset 1 to one before the end of `src`, i.e. everything
// between a one-byte opening and a one-byte closing quote.
fn range_closed(src: &str) -> TextRange {
TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
}
// Range from offset 1 to the very end of `src`.
fn range_unclosed(src: &str) -> TextRange {
TextRange::from_to(1.into(), (src.len() as u32).into())
}
#[test]
fn test_unicode_escapes() {
// Only the shape is checked here: even `\u{}` and a bare `\u` lex as a UnicodeEscape.
let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
for escape in unicode_escapes {
let escape_sequence = format!(r"'\u{}'", escape);
let component = closed_char_component(&escape_sequence);
let expected_range = range_closed(&escape_sequence);
assert_eq!(component.kind, UnicodeEscape);
assert_eq!(component.range, expected_range);
}
}
#[test]
fn test_unicode_escapes_unclosed() {
// Without a `}` the escape is expected to run to the end of the input,
// taking the closing quote with it.
let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
for escape in unicode_escapes {
let escape_sequence = format!(r"'\u{}'", escape);
let component = unclosed_char_component(&escape_sequence);
let expected_range = range_unclosed(&escape_sequence);
assert_eq!(component.kind, UnicodeEscape);
assert_eq!(component.range, expected_range);
}
}
#[test]
fn test_empty_char() {
let (has_closing_quote, components) = parse("''");
assert!(has_closing_quote, "char should have closing quote");
assert!(components.len() == 0);
}
#[test]
fn test_unclosed_char() {
let component = unclosed_char_component("'a");
assert!(component.kind == CodePoint);
assert!(component.range == TextRange::from_to(1.into(), 2.into()));
}
#[test]
fn test_digit_escapes() {
// `\x` followed by zero, one or two digits is a single AsciiCodeEscape.
let literals = &[r"", r"5", r"55"];
for literal in literals {
let lit_text = format!(r"'\x{}'", literal);
let component = closed_char_component(&lit_text);
assert!(component.kind == AsciiCodeEscape);
assert!(component.range == range_closed(&lit_text));
}
// More than 2 digits starts a new codepoint
let components = closed_char_components(r"'\x555'");
assert!(components.len() == 2);
assert!(components[1].kind == CodePoint);
}
#[test]
fn test_ascii_escapes() {
let literals = &[
r"\'", "\\\"", // equivalent to \"
r"\n", r"\r", r"\t", r"\\", r"\0",
];
for literal in literals {
let lit_text = format!("'{}'", literal);
let component = closed_char_component(&lit_text);
assert!(component.kind == AsciiEscape);
assert!(component.range == range_closed(&lit_text));
}
}
#[test]
fn test_no_escapes() {
// The same characters that form escapes after `\` are plain code points on their own.
let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];
for &literal in literals {
let lit_text = format!("'{}'", literal);
let component = closed_char_component(&lit_text);
assert!(component.kind == CodePoint);
assert!(component.range == range_closed(&lit_text));
}
}
}

View File

@ -2,7 +2,10 @@ use std::fmt;
use ra_parser::ParseError;
use crate::{TextRange, TextUnit};
use crate::{
TextRange, TextUnit,
validation::EscapeError,
};
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SyntaxError {
@ -67,32 +70,7 @@ impl fmt::Display for SyntaxError {
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum SyntaxErrorKind {
ParseError(ParseError),
UnescapedCodepoint,
EmptyChar,
UnclosedChar,
OverlongChar,
EmptyByte,
UnclosedByte,
OverlongByte,
ByteOutOfRange,
UnescapedByte,
EmptyByteEscape,
InvalidByteEscape,
TooShortByteCodeEscape,
MalformedByteCodeEscape,
UnicodeEscapeForbidden,
EmptyAsciiEscape,
InvalidAsciiEscape,
TooShortAsciiCodeEscape,
AsciiCodeEscapeOutOfRange,
MalformedAsciiCodeEscape,
UnclosedUnicodeEscape,
MalformedUnicodeEscape,
EmptyUnicodeEcape,
OverlongUnicodeEscape,
UnicodeEscapeOutOfRange,
UnclosedString,
InvalidSuffix,
EscapeError(EscapeError),
InvalidBlockAttr,
InvalidMatchInnerAttr,
InvalidTupleIndexFormat,
@ -102,38 +80,6 @@ impl fmt::Display for SyntaxErrorKind {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::SyntaxErrorKind::*;
match self {
UnescapedCodepoint => write!(f, "This codepoint should always be escaped"),
EmptyAsciiEscape => write!(f, "Empty escape sequence"),
InvalidAsciiEscape => write!(f, "Invalid escape sequence"),
EmptyChar => write!(f, "Empty char literal"),
UnclosedChar => write!(f, "Unclosed char literal"),
OverlongChar => write!(f, "Char literal should be one character long"),
EmptyByte => write!(f, "Empty byte literal"),
UnclosedByte => write!(f, "Unclosed byte literal"),
OverlongByte => write!(f, "Byte literal should be one character long"),
ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"),
UnescapedByte => write!(f, "This byte should always be escaped"),
EmptyByteEscape => write!(f, "Empty escape sequence"),
InvalidByteEscape => write!(f, "Invalid escape sequence"),
TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"),
MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
UnicodeEscapeForbidden => {
write!(f, "Unicode escapes are not allowed in byte literals or byte strings")
}
TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"),
AsciiCodeEscapeOutOfRange => {
write!(f, "Escape sequence should be between \\x00 and \\x7F")
}
MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
UnclosedUnicodeEscape => write!(f, "Missing `}}`"),
MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"),
EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"),
OverlongUnicodeEscape => {
write!(f, "Unicode escape sequence should have at most 6 digits")
}
UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"),
UnclosedString => write!(f, "Unclosed string literal"),
InvalidSuffix => write!(f, "Invalid literal suffix"),
InvalidBlockAttr => {
write!(f, "A block in this position cannot accept inner attributes")
}
@ -144,6 +90,46 @@ impl fmt::Display for SyntaxErrorKind {
write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix")
}
ParseError(msg) => write!(f, "{}", msg.0),
EscapeError(err) => write!(f, "{}", err),
}
}
}
/// User-facing messages for escape-validation errors.
impl fmt::Display for EscapeError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let msg = match self {
            EscapeError::ZeroChars => "Empty literal",
            EscapeError::MoreThanOneChar => "Literal should be one character long",
            EscapeError::LoneSlash => "Character must be escaped: '\\'",
            EscapeError::InvalidEscape => "Invalid escape sequence",
            // Spell the escape out as text: a raw carriage return inside the
            // message would garble the rendered diagnostic.
            EscapeError::BareCarriageReturn => "Character must be escaped: '\\r'",
            EscapeError::EscapeOnlyChar => "Character must be escaped",
            EscapeError::TooShortHexEscape => "Escape sequence should have two digits",
            EscapeError::InvalidCharInHexEscape => "Escape sequence should be a hexadecimal number",
            EscapeError::OutOfRangeHexEscape => "Escape sequence should be ASCII",
            EscapeError::NoBraceInUnicodeEscape => "Invalid escape sequence",
            EscapeError::InvalidCharInUnicodeEscape => "Invalid escape sequence",
            EscapeError::EmptyUnicodeEscape => "Invalid escape sequence",
            EscapeError::UnclosedUnicodeEscape => "Missing '}'",
            EscapeError::LeadingUnderscoreUnicodeEscape => "Invalid escape sequence",
            EscapeError::OverlongUnicodeEscape => {
                "Unicode escape sequence should have at most 6 digits"
            }
            EscapeError::LoneSurrogateUnicodeEscape => {
                "Unicode escape code should not be a surrogate"
            }
            EscapeError::OutOfRangeUnicodeEscape => {
                "Unicode escape code should be at most 0x10FFFF"
            }
            EscapeError::UnicodeEscapeInByte => "Unicode escapes are not allowed in bytes",
            EscapeError::NonAsciiCharInByte => "Non ASCII characters are not allowed in bytes",
        };
        write!(f, "{}", msg)
    }
}
impl From<EscapeError> for SyntaxErrorKind {
fn from(err: EscapeError) -> Self {
SyntaxErrorKind::EscapeError(err)
}
}

View File

@ -1,17 +1,17 @@
mod byte;
mod byte_string;
mod char;
mod string;
mod unescape;
mod block;
mod field_expr;
use crate::{
SourceFile, SyntaxError, AstNode, SyntaxNode,
SourceFile, SyntaxError, AstNode, SyntaxNode, TextUnit,
SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR},
ast,
algo::visit::{visitor_ctx, VisitorCtx},
};
pub(crate) use unescape::EscapeError;
pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> {
let mut errors = Vec::new();
for node in file.syntax().descendants() {
@ -26,11 +26,55 @@ pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> {
// FIXME: kill duplication
fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) {
match literal.token().kind() {
BYTE => byte::validate_byte_node(literal.token(), acc),
BYTE_STRING => byte_string::validate_byte_string_node(literal.token(), acc),
STRING => string::validate_string_node(literal.token(), acc),
CHAR => char::validate_char_node(literal.token(), acc),
let token = literal.token();
let text = token.text().as_str();
match token.kind() {
BYTE => {
if let Some(end) = text.rfind('\'') {
if let Some(without_quotes) = text.get(2..end) {
if let Err((off, err)) = unescape::unescape_byte(without_quotes) {
let off = token.range().start() + TextUnit::from_usize(off + 2);
acc.push(SyntaxError::new(err.into(), off))
}
}
}
}
CHAR => {
if let Some(end) = text.rfind('\'') {
if let Some(without_quotes) = text.get(1..end) {
if let Err((off, err)) = unescape::unescape_char(without_quotes) {
let off = token.range().start() + TextUnit::from_usize(off + 1);
acc.push(SyntaxError::new(err.into(), off))
}
}
}
}
BYTE_STRING => {
if let Some(end) = text.rfind('\"') {
if let Some(without_quotes) = text.get(2..end) {
unescape::unescape_byte_str(without_quotes, &mut |range, char| {
if let Err(err) = char {
let off = range.start;
let off = token.range().start() + TextUnit::from_usize(off + 2);
acc.push(SyntaxError::new(err.into(), off))
}
})
}
}
}
STRING => {
if let Some(end) = text.rfind('\"') {
if let Some(without_quotes) = text.get(1..end) {
unescape::unescape_str(without_quotes, &mut |range, char| {
if let Err(err) = char {
let off = range.start;
let off = token.range().start() + TextUnit::from_usize(off + 1);
acc.push(SyntaxError::new(err.into(), off))
}
})
}
}
}
_ => (),
}
}

View File

@ -1,199 +0,0 @@
//! Validation of byte literals
use crate::{
string_lexing::{self, StringComponentKind},
TextRange,
validation::char,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
/// Validates a byte literal token, appending every problem found to `errors`.
///
/// Each lexed component is checked individually; afterwards the literal as a
/// whole is checked for a missing closing quote, a trailing suffix, and for
/// containing exactly one component.
pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text);

    let mut component_count = 0;
    for component in components.by_ref() {
        component_count += 1;
        let text = &literal_text[component.range];
        // Component ranges are relative to the literal; shift them into file coordinates.
        let range = component.range + literal_range.start();
        validate_byte_component(text, component.kind, range, errors);
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedByte, literal_range));
    }

    if let Some(suffix) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix + literal_range.start()));
    }

    match component_count {
        0 => errors.push(SyntaxError::new(EmptyByte, literal_range)),
        1 => {}
        _ => errors.push(SyntaxError::new(OverlongByte, literal_range)),
    }
}
/// Validates one lexed component of a byte literal or byte string.
///
/// `text` is the source text of the component, `kind` its lexer
/// classification, and `range` the location to attach to any error pushed
/// onto `errors`.
pub(super) fn validate_byte_component(
    text: &str,
    kind: StringComponentKind,
    range: TextRange,
    errors: &mut Vec<SyntaxError>,
) {
    use self::StringComponentKind::*;
    match kind {
        IgnoreNewline => { /* always valid */ }
        CodePoint => {
            let c = text.chars().next().expect("Code points should be one character long");

            // These bytes must always be escaped
            match c {
                '\t' | '\r' | '\n' => errors.push(SyntaxError::new(UnescapedByte, range)),
                _ => {}
            }

            // Only ASCII bytes are allowed
            if !c.is_ascii() {
                errors.push(SyntaxError::new(ByteOutOfRange, range));
            }
        }
        UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
        AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
        AsciiEscape => validate_byte_escape(text, range, errors),
    }
}
/// Validates a simple backslash escape (such as `\n`) inside a byte literal.
///
/// `text` is the whole escape including the leading `\`.
fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    let is_bare_backslash = text.len() == 1;
    if is_bare_backslash {
        // Escape sequence consists only of leading `\`
        errors.push(SyntaxError::new(EmptyByteEscape, range));
        return;
    }

    let escape_code = text.chars().nth(1).unwrap();
    if !char::is_ascii_escape(escape_code) {
        errors.push(SyntaxError::new(InvalidByteEscape, range));
    }
}
/// Validates a `\xDD` escape inside a byte literal.
///
/// `text` is the whole escape including `\x`. A well-formed escape is exactly
/// four ASCII characters whose last two are hexadecimal digits.
///
/// # Panics
///
/// Panics if an all-ASCII `text` is longer than four characters.
fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    if !text.is_ascii() {
        errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
        return;
    }

    let char_count = text.chars().count();
    if char_count < 4 {
        errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
        return;
    }
    assert!(char_count == 4, "ByteCodeEscape cannot be longer than 4 chars");

    let digits = &text[2..];
    if u8::from_str_radix(digits, 16).is_err() {
        errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
    }
}
// End-to-end tests for byte literal validation: each case is spliced into a
// tiny source file, parsed, and checked for the presence or absence of errors.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
// Parses a source file that contains `literal` as the body of a byte literal.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!("const C: u8 = b'{}';", literal);
SourceFile::parse(&src)
}
// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_byte(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_byte(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
// ASCII range: everything is valid except the characters that must be escaped.
for byte in 0..128 {
match byte {
b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()),
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
_ => assert_valid_byte(&(byte as char).to_string()),
}
}
// Anything above ASCII is rejected.
for byte in 128..=255u8 {
assert_invalid_byte(&(byte as char).to_string());
}
}
#[test]
fn test_unicode_codepoints() {
// NOTE(review): the empty entries look like non-ASCII characters that were
// lost when this text was extracted — confirm against the upstream file.
let invalid = ["Ƒ", "", "", ""];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_valid_byte_escape() {
let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
for c in &valid {
assert_valid_byte(c);
}
}
#[test]
fn test_invalid_byte_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_valid_byte_code_escape() {
// Unlike in char literals, codes above 0x7F (here `\xF0`) are accepted.
let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
for c in &valid {
assert_valid_byte(c);
}
}
#[test]
fn test_invalid_byte_code_escape() {
let invalid = [r"\x", r"\x7"];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
// Unicode escapes are rejected in byte literals even when they are well formed.
let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &well_formed {
assert_invalid_byte(c);
}
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_byte(c);
}
}
}

View File

@ -1,169 +0,0 @@
use crate::{
string_lexing::{self, StringComponentKind},
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
use super::byte;
/// Validates a byte string token, appending every problem found to `errors`.
///
/// Components are checked with the byte-literal rules, except that line
/// continuations and bare tab / newline / carriage-return characters are
/// accepted. A missing closing quote and a trailing suffix are reported for
/// the literal as a whole.
pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text);

    for component in components.by_ref() {
        let range = component.range + literal_range.start();
        if component.kind == StringComponentKind::IgnoreNewline {
            /* always valid */
            continue;
        }

        // Chars must escape \t, \n and \r codepoints, but strings don't
        let text = &literal_text[component.range];
        let needs_validation = match text {
            "\t" | "\n" | "\r" => false,
            _ => true,
        };
        if needs_validation {
            byte::validate_byte_component(text, component.kind, range, errors);
        }
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedString, literal_range));
    }

    if let Some(suffix) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix + literal_range.start()));
    }
}
// End-to-end tests for byte string validation: each case is spliced into a
// tiny source file, parsed, and checked for the presence or absence of errors.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
// Parses a source file that contains `literal` as the body of a byte string.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
println!("Source: {}", src);
SourceFile::parse(&src)
}
// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
// Every ASCII character may appear unescaped, apart from the two skipped below.
for byte in 0..128 {
match byte {
b'\"' | b'\\' => { /* Ignore string close and backslash */ }
_ => assert_valid_str(&(byte as char).to_string()),
}
}
// Anything above ASCII is rejected.
for byte in 128..=255u8 {
assert_invalid_str(&(byte as char).to_string());
}
}
#[test]
fn test_unicode_codepoints() {
// NOTE(review): the empty entries look like non-ASCII characters that were
// lost when this text was extracted — confirm against the upstream file.
let invalid = ["Ƒ", "", "", ""];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_valid_ascii_code_escape() {
// Codes above 0x7F (here `\xF0`) are accepted in byte strings.
let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
// Unicode escapes are rejected in byte strings even when they are well formed.
let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &well_formed {
assert_invalid_str(c);
}
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_mixed_invalid() {
assert_invalid_str(
r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ",
);
}
#[test]
fn test_mixed_valid() {
assert_valid_str(
r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff",
);
}
#[test]
fn test_ignore_newline() {
// A backslash at the end of a line is a line continuation and must be accepted.
assert_valid_str(
"Hello \
World",
);
}
}

View File

@ -1,273 +0,0 @@
//! Validation of char literals
use std::u32;
use arrayvec::ArrayString;
use crate::{
string_lexing::{self, StringComponentKind},
TextRange,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
/// Validates a char literal token, appending every problem found to `errors`.
///
/// Each lexed component is checked individually; afterwards the literal as a
/// whole is checked for a missing closing quote, a trailing suffix, and for
/// containing exactly one component.
pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);

    let mut component_count = 0;
    for component in components.by_ref() {
        component_count += 1;
        let text = &literal_text[component.range];
        // Component ranges are relative to the literal; shift them into file coordinates.
        let range = component.range + literal_range.start();
        validate_char_component(text, component.kind, range, errors);
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedChar, literal_range));
    }

    if let Some(suffix) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix + literal_range.start()));
    }

    match component_count {
        0 => errors.push(SyntaxError::new(EmptyChar, literal_range)),
        1 => {}
        _ => errors.push(SyntaxError::new(OverlongChar, literal_range)),
    }
}
/// Validates one lexed component of a char literal or string.
///
/// `text` is the source text of the component, `kind` its lexer
/// classification, and `range` the location to attach to any error pushed
/// onto `errors`.
pub(super) fn validate_char_component(
    text: &str,
    kind: StringComponentKind,
    range: TextRange,
    errors: &mut Vec<SyntaxError>,
) {
    use self::StringComponentKind::*;
    match kind {
        StringComponentKind::IgnoreNewline => { /* always valid */ }
        CodePoint => match text {
            // These code points must always be escaped
            "\t" | "\r" | "\n" => errors.push(SyntaxError::new(UnescapedCodepoint, range)),
            _ => {}
        },
        // Validate escapes
        UnicodeEscape => validate_unicode_escape(text, range, errors),
        AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
        AsciiEscape => validate_ascii_escape(text, range, errors),
    }
}
/// Validates a simple backslash escape (such as `\n`) inside a char literal or string.
///
/// `text` is the whole escape including the leading `\`.
fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    let is_bare_backslash = text.len() == 1;
    if is_bare_backslash {
        // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\'
        // is treated as an unclosed char containing a single quote `'`)
        errors.push(SyntaxError::new(EmptyAsciiEscape, range));
        return;
    }

    let escape_code = text.chars().nth(1).unwrap();
    if !is_ascii_escape(escape_code) {
        errors.push(SyntaxError::new(InvalidAsciiEscape, range));
    }
}
/// Returns `true` if `code` is a character that may follow `\` in a simple
/// escape: `\\`, `\'`, `\"`, `\n`, `\r`, `\t` or `\0`.
pub(super) fn is_ascii_escape(code: char) -> bool {
    ['\\', '\'', '"', 'n', 'r', 't', '0'].contains(&code)
}
/// Validates a `\xDD` escape inside a char literal or string.
///
/// `text` is the whole escape including `\x`. A well-formed escape is exactly
/// four ASCII characters whose last two are hexadecimal digits encoding a
/// value below 128.
///
/// # Panics
///
/// Panics if an all-ASCII `text` is longer than four characters.
fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    if !text.is_ascii() {
        // FIXME: Give a more precise error message (say what the invalid character was)
        errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
        return;
    }

    let char_count = text.chars().count();
    if char_count < 4 {
        errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
        return;
    }
    assert_eq!(
        char_count,
        4,
        "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
        text,
    );

    let problem = match u8::from_str_radix(&text[2..], 16) {
        Ok(code) if code < 128 => None, // Escape code is valid
        Ok(_) => Some(AsciiCodeEscapeOutOfRange),
        Err(_) => Some(MalformedAsciiCodeEscape),
    };
    if let Some(kind) = problem {
        errors.push(SyntaxError::new(kind, range));
    }
}
/// Validates a `\u{...}` escape inside a char literal or string.
///
/// `text` is the whole escape including `\u`. The escape must be closed, hold
/// one to six hexadecimal digits (underscores are allowed, but not in leading
/// position), and encode a Unicode scalar value: at most `0x10FFFF` and not a
/// surrogate.
///
/// # Panics
///
/// Panics if `text` does not start with `\u`, or if characters follow the
/// closing `}`.
fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");

    if text.len() == 2 {
        // No starting `{`
        errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
        return;
    }

    if text.len() == 3 {
        // Only starting `{`
        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
        return;
    }

    // Collect the hex digits, skipping `_` separators.
    let mut code = ArrayString::<[_; 6]>::new();
    let mut closed = false;
    for c in text[3..].chars() {
        assert!(!closed, "no characters after escape is closed");

        if c.is_digit(16) {
            if code.len() == 6 {
                errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
                return;
            }

            code.push(c);
        } else if c == '_' {
            // Reject leading _
            if code.len() == 0 {
                errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
                return;
            }
        } else if c == '}' {
            closed = true;
        } else {
            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
            return;
        }
    }

    if !closed {
        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
    }

    if code.len() == 0 {
        errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
        return;
    }

    match u32::from_str_radix(&code, 16) {
        Ok(code_u32) if code_u32 > 0x10FFFF => {
            errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
        }
        // In range but still not a `char`: a surrogate (0xD800..=0xDFFF),
        // which is not a Unicode scalar value and so cannot be escaped.
        Ok(code_u32) if std::char::from_u32(code_u32).is_none() => {
            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
        }
        Ok(_) => {
            // Valid escape code
        }
        Err(_) => {
            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
        }
    }
}
// End-to-end tests for char literal validation: each case is spliced into a
// tiny source file, parsed, and checked for the presence or absence of errors.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
// Parses a source file that contains `literal` as the body of a char literal.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!("const C: char = '{}';", literal);
SourceFile::parse(&src)
}
// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_char(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_char(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
// Every code point up to 255 is valid unescaped, except those that must be escaped.
for byte in 0..=255u8 {
match byte {
b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
_ => assert_valid_char(&(byte as char).to_string()),
}
}
}
#[test]
fn test_unicode_codepoints() {
// NOTE(review): the empty entries look like non-ASCII characters that were
// lost when this text was extracted — confirm against the upstream file.
let valid = ["Ƒ", "", "", ""];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
// Strings made of several code points do not fit in a char literal.
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_char(c);
}
}
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_char(c);
}
}
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55"];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_invalid_ascii_code_escape() {
// `\xF0` is above 0x7F and therefore rejected in a char literal.
let invalid = [r"\x", r"\x7", r"\xF0"];
for c in &invalid {
assert_invalid_char(c);
}
}
#[test]
fn test_valid_unicode_escape() {
let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_char(c);
}
}
}

View File

@ -1,154 +0,0 @@
use crate::{
string_lexing,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
use super::char;
/// Validates a string literal token, appending every problem found to `errors`.
///
/// Components are checked with the char-literal rules, except that bare tab /
/// newline / carriage-return characters are accepted. A missing closing quote
/// and a trailing suffix are reported for the literal as a whole.
pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text);

    for component in components.by_ref() {
        let range = component.range + literal_range.start();

        // Chars must escape \t, \n and \r codepoints, but strings don't
        let text = &literal_text[component.range];
        let needs_validation = match text {
            "\t" | "\n" | "\r" => false,
            _ => true,
        };
        if needs_validation {
            char::validate_char_component(text, component.kind, range, errors);
        }
    }

    if !components.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedString, literal_range));
    }

    if let Some(suffix) = components.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix + literal_range.start()));
    }
}
// End-to-end tests for string literal validation: each case is spliced into a
// tiny source file, parsed, and checked for the presence or absence of errors.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
// Parses a source file that contains `literal` as the body of a string literal.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!(r#"const S: &'static str = "{}";"#, literal);
println!("Source: {}", src);
SourceFile::parse(&src)
}
// Asserts that the file built around `literal` has no errors at all.
fn assert_valid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
// Asserts that the file built around `literal` has at least one error.
fn assert_invalid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
// Every code point up to 255 may appear unescaped, apart from the two skipped below.
for byte in 0..=255u8 {
match byte {
b'\"' | b'\\' => { /* Ignore string close and backslash */ }
_ => assert_valid_str(&(byte as char).to_string()),
}
}
}
#[test]
fn test_unicode_codepoints() {
// NOTE(review): the empty entries look like non-ASCII characters that were
// lost when this text was extracted — confirm against the upstream file.
let valid = ["Ƒ", "", "", ""];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
let valid = ["नी", "👨‍👨‍"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_ascii_code_escape() {
// `\xF0` is above 0x7F and therefore rejected in a string literal.
let invalid = [r"\x", r"\x7", r"\xF0"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_valid_unicode_escape() {
let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_mixed() {
assert_valid_str(
r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ",
);
}
#[test]
fn test_ignore_newline() {
// A backslash at the end of a line is a line continuation and must be accepted.
assert_valid_str(
"Hello \
World",
);
}
}

View File

@ -0,0 +1,521 @@
//! Utilities for validating string and char literals and turning them into
//! values they represent.
//!
//! This file is copy-pasted from the compiler
//!
//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
//!
//! Hopefully, we'll share this code in a proper way some day
use std::str::Chars;
use std::ops::Range;
/// Errors that can be reported while unescaping a char, byte, string or
/// byte string literal.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub enum EscapeError {
/// A char or byte literal has no content at all.
ZeroChars,
/// A char or byte literal has further content after its first (unescaped) character.
MoreThanOneChar,
/// A backslash is the last character of the input.
LoneSlash,
/// A backslash is followed by a character that does not start any known escape.
InvalidEscape,
/// A raw `\r` that is not immediately followed by `\n`.
BareCarriageReturn,
/// A raw character that has to be written as an escape: tab, newline, `\r\n`,
/// `'` in single-quoted literals or `"` in double-quoted literals.
EscapeOnlyChar,
/// The input ends before both digits of a `\x` escape were read.
TooShortHexEscape,
/// One of the two characters after `\x` is not a hexadecimal digit.
InvalidCharInHexEscape,
/// A `\x` escape above `0x7F` in a non-byte literal.
OutOfRangeHexEscape,
/// `\u` is not followed by `{`.
NoBraceInUnicodeEscape,
/// A character inside `\u{...}` is neither a hexadecimal digit, `_` nor `}`.
InvalidCharInUnicodeEscape,
/// `\u{}` with nothing between the braces.
EmptyUnicodeEscape,
/// The input ends before the closing `}` of a `\u{` escape.
UnclosedUnicodeEscape,
/// The first character after `\u{` is `_`.
LeadingUnderscoreUnicodeEscape,
/// A `\u{...}` escape contains more than six hexadecimal digits.
OverlongUnicodeEscape,
/// A `\u{...}` value that is at most `0x10FFFF` but is not a valid `char`.
LoneSurrogateUnicodeEscape,
/// A `\u{...}` value greater than `0x10FFFF`.
OutOfRangeUnicodeEscape,
/// A `\u{...}` escape used in a byte or byte string literal.
UnicodeEscapeInByte,
/// A raw non-ASCII character used in a byte or byte string literal.
NonAsciiCharInByte,
}
/// Unescapes the contents of a char literal (the text between the quotes).
///
/// Returns the resulting `char`, or the error together with the number of bytes
/// of `literal_text` that had been consumed when the error was detected.
pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
    let mut remaining = literal_text.chars();
    match unescape_char_or_byte(&mut remaining, Mode::Char) {
        Ok(unescaped) => Ok(unescaped),
        Err(err) => {
            let consumed = literal_text.len() - remaining.as_str().len();
            Err((consumed, err))
        }
    }
}
/// Unescapes the contents of a string literal (the text between the quotes).
///
/// `callback` receives, for each character or escape sequence processed, the byte
/// range it occupies in `literal_text` and either the unescaped `char` or the
/// error found there.
pub(crate) fn unescape_str<F: FnMut(Range<usize>, Result<char, EscapeError>)>(
    literal_text: &str,
    callback: &mut F,
) {
    unescape_str_or_byte_str(literal_text, Mode::Str, callback);
}
pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Byte)
.map(byte_from_char)
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}
/// Unescapes the contents of a byte string literal (the text between the quotes).
///
/// `callback` receives, for each character or escape sequence processed, the byte
/// range it occupies in `literal_text` and either the unescaped byte or the
/// error found there.
pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
    // Adapt the char-based callback of the shared implementation to bytes.
    let mut forward = |range: Range<usize>, unescaped: Result<char, EscapeError>| {
        callback(range, unescaped.map(byte_from_char))
    };
    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut forward)
}
/// The kind of literal being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// `true` for the literal kinds delimited by `'` (`Char` and `Byte`).
    fn in_single_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => false,
            Mode::Char | Mode::Byte => true,
        }
    }

    /// `true` for the literal kinds delimited by `"` (`Str` and `ByteStr`).
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// `true` for the byte-oriented literal kinds (`Byte` and `ByteStr`).
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
/// Unescapes a single character or escape sequence.
///
/// `first_char` has already been taken from `chars`; the rest of an escape
/// sequence, if any, is consumed from `chars`. `mode` selects which raw
/// characters and which escapes are allowed. Returns the unescaped `char`,
/// or the first error encountered, leaving `chars` positioned just past the
/// last character that was read.
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
// Not an escape sequence: either the raw character is acceptable as is,
// or it is one that must be written as an escape.
if first_char != '\\' {
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
// Peek (on a clone, without consuming) to tell `\r\n` from a bare `\r`.
'\r' => Err(if chars.clone().next() == Some('\n') {
EscapeError::EscapeOnlyChar
} else {
EscapeError::BareCarriageReturn
}),
// The quote that delimits the literal must be escaped.
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
if mode.is_bytes() && !first_char.is_ascii() {
return Err(EscapeError::NonAsciiCharInByte);
}
Ok(first_char)
}
};
}
// A backslash must be followed by at least one more character.
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
let res = match second_char {
'"' => '"',
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
'\'' => '\'',
'0' => '\0',
// `\xHH`: exactly two hexadecimal digits.
'x' => {
let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let value = hi * 16 + lo;
// Outside byte literals only the ASCII range is allowed.
if !mode.is_bytes() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
// Two hex digits give at most 0xFF, so the narrowing cast is lossless.
let value = value as u8;
value as char
}
// `\u{...}`: hexadecimal digits, optionally separated by underscores.
'u' => {
if chars.next() != Some('{') {
return Err(EscapeError::NoBraceInUnicodeEscape);
}
// The first character after `{` must be a digit; it seeds `value`,
// which is why the digit count starts at one.
let mut n_digits = 1;
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
'}' => return Err(EscapeError::EmptyUnicodeEscape),
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
};
loop {
match chars.next() {
None => return Err(EscapeError::UnclosedUnicodeEscape),
Some('_') => continue,
Some('}') => {
// Checks are ordered: length first, then byte mode, then the value itself.
if n_digits > 6 {
return Err(EscapeError::OverlongUnicodeEscape);
}
if mode.is_bytes() {
return Err(EscapeError::UnicodeEscapeInByte);
}
// `from_u32` rejects values above 0x10FFFF and values in that
// range that are not valid `char`s; tell the two cases apart.
break std::char::from_u32(value).ok_or_else(|| {
if value > 0x10FFFF {
EscapeError::OutOfRangeUnicodeEscape
} else {
EscapeError::LoneSurrogateUnicodeEscape
}
})?;
}
Some(c) => {
let digit =
c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
n_digits += 1;
// Beyond six digits keep consuming (so the overlong error is
// reported at the closing brace) but stop accumulating, which
// keeps `value` within 24 bits.
if n_digits > 6 {
continue;
}
let digit = digit as u32;
value = value * 16 + digit;
}
};
}
}
_ => return Err(EscapeError::InvalidEscape),
};
Ok(res)
}
/// Unescapes the contents of a char or byte literal, which must consist of
/// exactly one character or escape sequence.
///
/// Fails with `ZeroChars` on empty input and with `MoreThanOneChar` when
/// anything is left in `chars` after the first unescaped character.
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    let first_char = match chars.next() {
        Some(c) => c,
        None => return Err(EscapeError::ZeroChars),
    };
    let unescaped = scan_escape(first_char, chars, mode)?;
    match chars.next() {
        Some(_) => Err(EscapeError::MoreThanOneChar),
        None => Ok(unescaped),
    }
}
/// Takes the contents of a string or byte string literal (without quotes) and
/// reports a sequence of unescaped characters or errors.
///
/// For each character or escape sequence in `src`, `callback` is called with
/// the byte range it occupies in `src` and the result of unescaping it.
/// A backslash directly followed by a line break is skipped together with the
/// whitespace after it and produces no callback. Panics if `mode` is not a
/// double-quoted mode.
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
assert!(mode.in_double_quotes());
let initial_len = src.len();
let mut chars = src.chars();
while let Some(first_char) = chars.next() {
// Byte offset of `first_char`: bytes consumed so far, minus its own length.
let start = initial_len - chars.as_str().len() - first_char.len_utf8();
let unescaped_char = match first_char {
'\\' => {
// Look ahead two characters on a clone, without consuming them.
let (second_char, third_char) = {
let mut chars = chars.clone();
(chars.next(), chars.next())
};
match (second_char, third_char) {
// Backslash followed by `\n` or `\r\n`: skip the line break and
// all following whitespace, reporting nothing for this span.
(Some('\n'), _) | (Some('\r'), Some('\n')) => {
skip_ascii_whitespace(&mut chars);
continue;
}
_ => scan_escape(first_char, &mut chars, mode),
}
}
'\r' => {
let second_char = chars.clone().next();
// A raw `\r\n` pair is consumed as a whole and reported as a single `\n`.
if second_char == Some('\n') {
chars.next();
Ok('\n')
} else {
scan_escape(first_char, &mut chars, mode)
}
}
// Raw newlines and tabs are accepted here, before `scan_escape` could reject them.
'\n' => Ok('\n'),
'\t' => Ok('\t'),
_ => scan_escape(first_char, &mut chars, mode),
};
// Everything consumed since `start` belongs to this character.
let end = initial_len - chars.as_str().len();
callback(start..end, unescaped_char);
}
// Advances `chars` past any leading spaces, tabs, newlines and carriage returns.
fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
let str = chars.as_str();
let first_non_space = str
.bytes()
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
.unwrap_or(str.len());
*chars = str[first_non_space..].chars()
}
}
/// Converts a `char` to the byte with the same numeric value.
///
/// Panics if the code point of `c` does not fit into a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code_point as u8
}
/// Returns `true` when `x` lies in the ASCII range `0..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
// Unit tests for the unescaping entry points: each test feeds literal contents
// (without the surrounding quotes) and checks the exact value or error returned.
#[cfg(test)]
mod tests {
use super::*;
// Char literals that must be rejected, with the exact error expected for each.
#[test]
fn test_unescape_char_bad() {
// Compares only the error kind; the reported offset is discarded.
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
}
check("", EscapeError::ZeroChars);
check(r"\", EscapeError::LoneSlash);
check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
check("spam", EscapeError::MoreThanOneChar);
check(r"\x0ff", EscapeError::MoreThanOneChar);
check(r#"\"a"#, EscapeError::MoreThanOneChar);
check(r"\na", EscapeError::MoreThanOneChar);
check(r"\ra", EscapeError::MoreThanOneChar);
check(r"\ta", EscapeError::MoreThanOneChar);
check(r"\\a", EscapeError::MoreThanOneChar);
check(r"\'a", EscapeError::MoreThanOneChar);
check(r"\0a", EscapeError::MoreThanOneChar);
check(r"\u{0}x", EscapeError::MoreThanOneChar);
check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
check(r"\xf", EscapeError::TooShortHexEscape);
check(r"\xa", EscapeError::TooShortHexEscape);
check(r"\xx", EscapeError::InvalidCharInHexEscape);
check(r"\xы", EscapeError::InvalidCharInHexEscape);
check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
check(r"\xtt", EscapeError::InvalidCharInHexEscape);
check(r"\xff", EscapeError::OutOfRangeHexEscape);
check(r"\xFF", EscapeError::OutOfRangeHexEscape);
check(r"\x80", EscapeError::OutOfRangeHexEscape);
check(r"\u", EscapeError::NoBraceInUnicodeEscape);
check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
check(r"\u{", EscapeError::UnclosedUnicodeEscape);
check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
check(r"\u{}", EscapeError::EmptyUnicodeEscape);
check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
}
// Char literals that must be accepted, with the resulting character.
#[test]
fn test_unescape_char_good() {
fn check(literal_text: &str, expected_char: char) {
let actual_result = unescape_char(literal_text);
assert_eq!(actual_result, Ok(expected_char));
}
check("a", 'a');
check("ы", 'ы');
check("🦀", '🦀');
check(r#"\""#, '"');
check(r"\n", '\n');
check(r"\r", '\r');
check(r"\t", '\t');
check(r"\\", '\\');
check(r"\'", '\'');
check(r"\0", '\0');
check(r"\x00", '\0');
check(r"\x5a", 'Z');
check(r"\x5A", 'Z');
check(r"\x7f", 127 as char);
check(r"\u{0}", '\0');
check(r"\u{000000}", '\0');
check(r"\u{41}", 'A');
check(r"\u{0041}", 'A');
check(r"\u{00_41}", 'A');
check(r"\u{4__1__}", 'A');
check(r"\u{1F63b}", '😻');
}
// String contents that must unescape without error, with the resulting text.
#[test]
fn test_unescape_str_good() {
// Collects the unescaped characters into a `String`; the first error, if any,
// replaces the buffer and later callbacks are ignored.
fn check(literal_text: &str, expected: &str) {
let mut buf = Ok(String::with_capacity(literal_text.len()));
unescape_str(literal_text, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Err(e) => buf = Err((range, e)),
}
}
});
let buf = buf.as_ref().map(|it| it.as_ref());
assert_eq!(buf, Ok(expected))
}
check("foo", "foo");
check("", "");
check(" \t\n\r\n", " \t\n\n");
check("hello \\\n world", "hello world");
check("hello \\\r\n world", "hello world");
check("thread's", "thread's")
}
// Byte literals that must be rejected, with the exact error expected for each.
#[test]
fn test_unescape_byte_bad() {
// Compares only the error kind; the reported offset is discarded.
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
}
check("", EscapeError::ZeroChars);
check(r"\", EscapeError::LoneSlash);
check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
check("spam", EscapeError::MoreThanOneChar);
check(r"\x0ff", EscapeError::MoreThanOneChar);
check(r#"\"a"#, EscapeError::MoreThanOneChar);
check(r"\na", EscapeError::MoreThanOneChar);
check(r"\ra", EscapeError::MoreThanOneChar);
check(r"\ta", EscapeError::MoreThanOneChar);
check(r"\\a", EscapeError::MoreThanOneChar);
check(r"\'a", EscapeError::MoreThanOneChar);
check(r"\0a", EscapeError::MoreThanOneChar);
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
check(r"\xa", EscapeError::TooShortHexEscape);
check(r"\xf", EscapeError::TooShortHexEscape);
check(r"\xx", EscapeError::InvalidCharInHexEscape);
check(r"\xы", EscapeError::InvalidCharInHexEscape);
check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
check(r"\xtt", EscapeError::InvalidCharInHexEscape);
check(r"\u", EscapeError::NoBraceInUnicodeEscape);
check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
check(r"\u{", EscapeError::UnclosedUnicodeEscape);
check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
check(r"\u{}", EscapeError::EmptyUnicodeEscape);
check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
check("ы", EscapeError::NonAsciiCharInByte);
check("🦀", EscapeError::NonAsciiCharInByte);
// Well-formed unicode escapes of at most six digits are rejected in byte
// literals before the value or any trailing input is examined.
check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
}
// Byte literals that must be accepted, with the resulting byte. Unlike char
// literals, `\x` escapes above 0x7F are allowed here.
#[test]
fn test_unescape_byte_good() {
fn check(literal_text: &str, expected_byte: u8) {
let actual_result = unescape_byte(literal_text);
assert_eq!(actual_result, Ok(expected_byte));
}
check("a", b'a');
check(r#"\""#, b'"');
check(r"\n", b'\n');
check(r"\r", b'\r');
check(r"\t", b'\t');
check(r"\\", b'\\');
check(r"\'", b'\'');
check(r"\0", b'\0');
check(r"\x00", b'\0');
check(r"\x5a", b'Z');
check(r"\x5A", b'Z');
check(r"\x7f", 127);
check(r"\x80", 128);
check(r"\xff", 255);
check(r"\xFF", 255);
}
// Byte string contents that must unescape without error, with the resulting bytes.
#[test]
fn test_unescape_byte_str_good() {
// Collects the unescaped bytes into a `Vec`; the first error, if any,
// replaces the buffer and later callbacks are ignored.
fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_byte_str(literal_text, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Err(e) => buf = Err((range, e)),
}
}
});
let buf = buf.as_ref().map(|it| it.as_ref());
assert_eq!(buf, Ok(expected))
}
check("foo", b"foo");
check("", b"");
check(" \t\n\r\n", b" \t\n\n");
check("hello \\\n world", b"hello world");
check("hello \\\r\n world", b"hello world");
check("thread's", b"thread's")
}
}

View File

@ -40,7 +40,6 @@ SOURCE_FILE@[0; 112)
WHITESPACE@[43; 44) " "
LITERAL@[44; 59)
STRING@[44; 59) "\"string\"invalid"
err: `Invalid literal suffix`
SEMI@[59; 60) ";"
WHITESPACE@[60; 65) "\n "
LET_STMT@[65; 83)
@ -53,7 +52,6 @@ SOURCE_FILE@[0; 112)
WHITESPACE@[72; 73) " "
LITERAL@[73; 82)
BYTE@[73; 82) "b\'b\'_suff"
err: `Invalid literal suffix`
SEMI@[82; 83) ";"
WHITESPACE@[83; 88) "\n "
LET_STMT@[88; 109)
@ -66,7 +64,6 @@ SOURCE_FILE@[0; 112)
WHITESPACE@[95; 96) " "
LITERAL@[96; 108)
BYTE_STRING@[96; 108) "b\"bs\"invalid"
err: `Invalid literal suffix`
SEMI@[108; 109) ";"
WHITESPACE@[109; 110) "\n"
R_CURLY@[110; 111) "}"