1253: Share literal validation logic with compiler r=matklad a=matklad

This is neat: the unescape module is literally what the compiler is using right now:

c6ac575648/src/libsyntax/parse/unescape.rs

So, yeah, code sharing via copy-paste!



Co-authored-by: Aleksey Kladov <aleksey.kladov@gmail.com>
This commit is contained in:
bors[bot] 2019-05-07 16:43:10 +00:00
commit d3efedb752
10 changed files with 620 additions and 1201 deletions

View File

@ -23,7 +23,6 @@ mod syntax_node;
mod syntax_text; mod syntax_text;
mod syntax_error; mod syntax_error;
mod parsing; mod parsing;
mod string_lexing;
mod validation; mod validation;
mod ptr; mod ptr;

View File

@ -1,333 +0,0 @@
use crate::{TextRange, TextUnit};
use self::StringComponentKind::*;
/// One lexed piece of a quoted literal: the text range it occupies and what
/// kind of piece it is.
#[derive(Debug, Eq, PartialEq, Clone)]
pub(crate) struct StringComponent {
/// Range of the component, expressed as offsets into the literal's own text.
pub(crate) range: TextRange,
/// Classification assigned by the lexer.
pub(crate) kind: StringComponentKind,
}
/// The kinds of component the literal lexer can produce.
#[derive(Debug, Eq, PartialEq, Clone)]
pub(crate) enum StringComponentKind {
/// A `\` directly followed by a newline, plus the whitespace after it
/// (only produced for `"`-quoted literals).
IgnoreNewline,
/// A single character that does not start with `\`.
CodePoint,
/// A `\` followed by any character other than `x` or `u`, or a lone `\` at
/// the end of input.
AsciiEscape,
/// An escape starting with `\x`.
AsciiCodeEscape,
/// An escape starting with `\u`.
UnicodeEscape,
}
/// Creates a lazy lexer over the text of a quoted literal.
///
/// `prefix` is the optional literal prefix (only `b` is supported), `quote`
/// is the delimiter (`'` or `"`), and `src` is the full literal text
/// including prefix and quotes.
///
/// # Panics
///
/// Panics when `prefix` is anything other than `b`, or when `quote` is
/// neither `'` nor `"`.
pub(crate) fn parse_quoted_literal(
    prefix: Option<char>,
    quote: char,
    src: &str,
) -> StringComponentIter {
    let prefix = match prefix {
        None => None,
        Some('b') => Some(b'b'),
        Some(_) => panic!("invalid prefix"),
    };
    let quote = match quote {
        '\'' | '"' => quote as u8,
        _ => panic!("invalid quote"),
    };
    StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None }
}
/// Lexer state for a quoted literal; yields `StringComponent`s when iterated.
pub(crate) struct StringComponentIter<'a> {
// Full text of the literal being lexed.
src: &'a str,
// Expected prefix byte, if the literal has one.
prefix: Option<u8>,
// Delimiter byte (`'` or `"`).
quote: u8,
// Current byte offset into `src`.
pos: usize,
/// Set to `true` once the closing quote has been consumed.
pub(crate) has_closing_quote: bool,
/// Range of any text found after the closing quote.
pub(crate) suffix: Option<TextRange>,
}
impl<'a> Iterator for StringComponentIter<'a> {
    type Item = StringComponent;

    /// Yields the next component of the literal.
    ///
    /// The first call consumes the prefix (if any) and the opening quote.
    /// Once no component is left, the closing quote and any trailing suffix
    /// are consumed and recorded in `has_closing_quote` / `suffix`.
    ///
    /// # Panics
    ///
    /// Panics if the literal does not start with the expected prefix and
    /// quote, or if input remains after the components have been lexed.
    fn next(&mut self) -> Option<StringComponent> {
        let at_start = self.pos == 0;
        if at_start {
            if let Some(prefix) = self.prefix {
                let first = self.advance();
                assert!(first == prefix as char, "literal should start with a {:?}", prefix as char,);
            }
            let opening = self.advance();
            assert!(
                opening == self.quote as char,
                "literal should start with a {:?}",
                self.quote as char,
            );
        }

        let component = self.parse_component();
        if component.is_some() {
            return component;
        }

        // No components left: what remains is either the closing quote
        // (optionally followed by a suffix) or nothing at all.
        let closing = self.quote as char;
        if self.peek() == Some(closing) {
            self.advance();
            self.has_closing_quote = true;
            let trailing = self.parse_suffix();
            if trailing.is_some() {
                self.suffix = trailing;
            }
        }

        assert!(
            self.peek().is_none(),
            "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}",
            self.src,
            self.pos,
            self.src.len()
        );
        None
    }
}
impl<'a> StringComponentIter<'a> {
    /// Returns the next unread character without consuming it, or `None` at
    /// the end of input.
    fn peek(&self) -> Option<char> {
        self.src[self.pos..].chars().next()
    }

    /// Consumes and returns the next character.
    ///
    /// # Panics
    ///
    /// Panics if the end of input has already been reached.
    fn advance(&mut self) -> char {
        let next = self.peek().expect("cannot advance if end of input is reached");
        self.pos += next.len_utf8();
        next
    }

    /// Lexes one component, or returns `None` at the closing quote or at the
    /// end of input.
    fn parse_component(&mut self) -> Option<StringComponent> {
        let next = self.peek()?;

        // Ignore string close
        if next == self.quote as char {
            return None;
        }

        let start = self.start_range();
        self.advance();

        if next != '\\' {
            return Some(self.finish_component(start, CodePoint));
        }

        // Strings can use `\` to ignore newlines, so we first try to parse one of those
        // before falling back to parsing char escapes
        if self.quote == b'"' {
            if let Some(component) = self.parse_ignore_newline(start) {
                return Some(component);
            }
        }
        Some(self.parse_escape(start))
    }

    /// Lexes a `\`-newline continuation; returns `None` when the character
    /// after the backslash is not a line break.
    fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
        // In string literals, when a `\` occurs immediately before the newline, the `\`,
        // the newline, and all whitespace at the beginning of the next line are ignored
        match self.peek() {
            Some('\n') | Some('\r') => {
                self.skip_whitespace();
                Some(self.finish_component(start, IgnoreNewline))
            }
            _ => None,
        }
    }

    /// Consumes characters for as long as they are whitespace.
    fn skip_whitespace(&mut self) {
        while self.peek().map_or(false, |c| c.is_whitespace()) {
            self.advance();
        }
    }

    /// Lexes the remainder of an escape whose leading `\` was already
    /// consumed. A `\` at the very end of input becomes an `AsciiEscape`
    /// consisting of only the backslash.
    fn parse_escape(&mut self, start: TextUnit) -> StringComponent {
        if self.peek().is_none() {
            return self.finish_component(start, AsciiEscape);
        }

        match self.advance() {
            'x' => self.parse_ascii_code_escape(start),
            'u' => self.parse_unicode_escape(start),
            _ => self.finish_component(start, AsciiEscape),
        }
    }

    /// Lexes the remainder of a `\u` escape. When a `{` follows, everything
    /// up to and including the next `}` (or the end of input) is consumed.
    fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent {
        if self.peek() == Some('{') {
            self.advance();

            // Parse anything until we reach `}`
            while self.peek().is_some() {
                if self.advance() == '}' {
                    break;
                }
            }
        }
        self.finish_component(start, UnicodeEscape)
    }

    /// Lexes the remainder of a `\x` escape: at most two characters, stopping
    /// early at the literal's own quote character.
    fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent {
        // The terminator is this literal's quote (not always `'`), so that
        // `"\x"` does not swallow the closing `"`. The limit is counted in
        // characters rather than bytes, so a multi-byte character cannot
        // step over the limit and drag the rest of the literal into the escape.
        for _ in 0..2 {
            match self.peek() {
                Some(next) if next != self.quote as char => {
                    self.advance();
                }
                _ => break,
            }
        }
        self.finish_component(start, AsciiCodeEscape)
    }

    /// Consumes everything that remains after the closing quote and returns
    /// its range, or `None` if nothing remains.
    fn parse_suffix(&mut self) -> Option<TextRange> {
        let start = self.start_range();
        self.peek()?;
        while self.peek().is_some() {
            self.advance();
        }
        Some(self.finish_range(start))
    }

    /// Current position as a `TextUnit`, used as the start of a range.
    fn start_range(&self) -> TextUnit {
        TextUnit::from_usize(self.pos)
    }

    /// Range from `start` up to the current position.
    fn finish_range(&self, start: TextUnit) -> TextRange {
        TextRange::from_to(start, TextUnit::from_usize(self.pos))
    }

    /// Builds a component of `kind` spanning from `start` to the current position.
    fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent {
        let range = self.finish_range(start);
        StringComponent { range, kind }
    }
}
// Unit tests for the literal lexer; every test lexes `'`-quoted literals
// without a prefix.
#[cfg(test)]
mod tests {
use super::*;
// Lexes `src` as a char literal and returns whether a closing quote was
// seen together with all lexed components.
fn parse(src: &str) -> (bool, Vec<StringComponent>) {
let component_iterator = &mut parse_quoted_literal(None, '\'', src);
let components: Vec<_> = component_iterator.collect();
(component_iterator.has_closing_quote, components)
}
// Expects `src` to lex into exactly one component with no closing quote.
fn unclosed_char_component(src: &str) -> StringComponent {
let (has_closing_quote, components) = parse(src);
assert!(!has_closing_quote, "char should not have closing quote");
assert!(components.len() == 1);
components[0].clone()
}
// Expects `src` to lex into exactly one component followed by a closing quote.
fn closed_char_component(src: &str) -> StringComponent {
let (has_closing_quote, components) = parse(src);
assert!(has_closing_quote, "char should have closing quote");
assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components);
components[0].clone()
}
// Expects `src` to have a closing quote and returns all of its components.
fn closed_char_components(src: &str) -> Vec<StringComponent> {
let (has_closing_quote, components) = parse(src);
assert!(has_closing_quote, "char should have closing quote");
components
}
// Range covering everything between the opening and the closing quote.
fn range_closed(src: &str) -> TextRange {
TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
}
// Range covering everything after the opening quote up to the end of `src`.
fn range_unclosed(src: &str) -> TextRange {
TextRange::from_to(1.into(), (src.len() as u32).into())
}
#[test]
fn test_unicode_escapes() {
let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
for escape in unicode_escapes {
let escape_sequence = format!(r"'\u{}'", escape);
let component = closed_char_component(&escape_sequence);
let expected_range = range_closed(&escape_sequence);
assert_eq!(component.kind, UnicodeEscape);
assert_eq!(component.range, expected_range);
}
}
#[test]
fn test_unicode_escapes_unclosed() {
// Without a `}` the escape runs through the final `'`, so the literal
// is reported as unclosed.
let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
for escape in unicode_escapes {
let escape_sequence = format!(r"'\u{}'", escape);
let component = unclosed_char_component(&escape_sequence);
let expected_range = range_unclosed(&escape_sequence);
assert_eq!(component.kind, UnicodeEscape);
assert_eq!(component.range, expected_range);
}
}
#[test]
fn test_empty_char() {
let (has_closing_quote, components) = parse("''");
assert!(has_closing_quote, "char should have closing quote");
assert!(components.len() == 0);
}
#[test]
fn test_unclosed_char() {
let component = unclosed_char_component("'a");
assert!(component.kind == CodePoint);
assert!(component.range == TextRange::from_to(1.into(), 2.into()));
}
#[test]
fn test_digit_escapes() {
let literals = &[r"", r"5", r"55"];
for literal in literals {
let lit_text = format!(r"'\x{}'", literal);
let component = closed_char_component(&lit_text);
assert!(component.kind == AsciiCodeEscape);
assert!(component.range == range_closed(&lit_text));
}
// More than 2 digits starts a new codepoint
let components = closed_char_components(r"'\x555'");
assert!(components.len() == 2);
assert!(components[1].kind == CodePoint);
}
#[test]
fn test_ascii_escapes() {
let literals = &[
r"\'", "\\\"", // equivalent to \"
r"\n", r"\r", r"\t", r"\\", r"\0",
];
for literal in literals {
let lit_text = format!("'{}'", literal);
let component = closed_char_component(&lit_text);
assert!(component.kind == AsciiEscape);
assert!(component.range == range_closed(&lit_text));
}
}
#[test]
fn test_no_escapes() {
// Characters that only form an escape when preceded by `\`.
let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];
for &literal in literals {
let lit_text = format!("'{}'", literal);
let component = closed_char_component(&lit_text);
assert!(component.kind == CodePoint);
assert!(component.range == range_closed(&lit_text));
}
}
}

View File

@ -2,7 +2,10 @@ use std::fmt;
use ra_parser::ParseError; use ra_parser::ParseError;
use crate::{TextRange, TextUnit}; use crate::{
TextRange, TextUnit,
validation::EscapeError,
};
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SyntaxError { pub struct SyntaxError {
@ -67,32 +70,7 @@ impl fmt::Display for SyntaxError {
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum SyntaxErrorKind { pub enum SyntaxErrorKind {
ParseError(ParseError), ParseError(ParseError),
UnescapedCodepoint, EscapeError(EscapeError),
EmptyChar,
UnclosedChar,
OverlongChar,
EmptyByte,
UnclosedByte,
OverlongByte,
ByteOutOfRange,
UnescapedByte,
EmptyByteEscape,
InvalidByteEscape,
TooShortByteCodeEscape,
MalformedByteCodeEscape,
UnicodeEscapeForbidden,
EmptyAsciiEscape,
InvalidAsciiEscape,
TooShortAsciiCodeEscape,
AsciiCodeEscapeOutOfRange,
MalformedAsciiCodeEscape,
UnclosedUnicodeEscape,
MalformedUnicodeEscape,
EmptyUnicodeEcape,
OverlongUnicodeEscape,
UnicodeEscapeOutOfRange,
UnclosedString,
InvalidSuffix,
InvalidBlockAttr, InvalidBlockAttr,
InvalidMatchInnerAttr, InvalidMatchInnerAttr,
InvalidTupleIndexFormat, InvalidTupleIndexFormat,
@ -102,38 +80,6 @@ impl fmt::Display for SyntaxErrorKind {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::SyntaxErrorKind::*; use self::SyntaxErrorKind::*;
match self { match self {
UnescapedCodepoint => write!(f, "This codepoint should always be escaped"),
EmptyAsciiEscape => write!(f, "Empty escape sequence"),
InvalidAsciiEscape => write!(f, "Invalid escape sequence"),
EmptyChar => write!(f, "Empty char literal"),
UnclosedChar => write!(f, "Unclosed char literal"),
OverlongChar => write!(f, "Char literal should be one character long"),
EmptyByte => write!(f, "Empty byte literal"),
UnclosedByte => write!(f, "Unclosed byte literal"),
OverlongByte => write!(f, "Byte literal should be one character long"),
ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"),
UnescapedByte => write!(f, "This byte should always be escaped"),
EmptyByteEscape => write!(f, "Empty escape sequence"),
InvalidByteEscape => write!(f, "Invalid escape sequence"),
TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"),
MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
UnicodeEscapeForbidden => {
write!(f, "Unicode escapes are not allowed in byte literals or byte strings")
}
TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"),
AsciiCodeEscapeOutOfRange => {
write!(f, "Escape sequence should be between \\x00 and \\x7F")
}
MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
UnclosedUnicodeEscape => write!(f, "Missing `}}`"),
MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"),
EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"),
OverlongUnicodeEscape => {
write!(f, "Unicode escape sequence should have at most 6 digits")
}
UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"),
UnclosedString => write!(f, "Unclosed string literal"),
InvalidSuffix => write!(f, "Invalid literal suffix"),
InvalidBlockAttr => { InvalidBlockAttr => {
write!(f, "A block in this position cannot accept inner attributes") write!(f, "A block in this position cannot accept inner attributes")
} }
@ -144,6 +90,46 @@ impl fmt::Display for SyntaxErrorKind {
write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix") write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix")
} }
ParseError(msg) => write!(f, "{}", msg.0), ParseError(msg) => write!(f, "{}", msg.0),
EscapeError(err) => write!(f, "{}", err),
} }
} }
} }
/// Renders a short, human-readable message for each escape error.
impl fmt::Display for EscapeError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let msg = match self {
            EscapeError::ZeroChars => "Empty literal",
            EscapeError::MoreThanOneChar => "Literal should be one character long",
            EscapeError::LoneSlash => "Character must be escaped: '\\'",
            EscapeError::InvalidEscape => "Invalid escape sequence",
            // The backslash is doubled so the message shows the two visible
            // characters `\r` rather than embedding a raw carriage return.
            EscapeError::BareCarriageReturn => "Character must be escaped: '\\r'",
            EscapeError::EscapeOnlyChar => "Character must be escaped",
            EscapeError::TooShortHexEscape => "Escape sequence should have two digits",
            EscapeError::InvalidCharInHexEscape => "Escape sequence should be a hexadecimal number",
            EscapeError::OutOfRangeHexEscape => "Escape sequence should be ASCII",
            EscapeError::NoBraceInUnicodeEscape => "Invalid escape sequence",
            EscapeError::InvalidCharInUnicodeEscape => "Invalid escape sequence",
            EscapeError::EmptyUnicodeEscape => "Invalid escape sequence",
            EscapeError::UnclosedUnicodeEscape => "Missing '}'",
            EscapeError::LeadingUnderscoreUnicodeEscape => "Invalid escape sequence",
            EscapeError::OverlongUnicodeEscape => {
                "Unicode escape sequence should have at most 6 digits"
            }
            EscapeError::LoneSurrogateUnicodeEscape => {
                "Unicode escape code should not be a surrogate"
            }
            EscapeError::OutOfRangeUnicodeEscape => {
                "Unicode escape code should be at most 0x10FFFF"
            }
            EscapeError::UnicodeEscapeInByte => "Unicode escapes are not allowed in bytes",
            EscapeError::NonAsciiCharInByte => "Non ASCII characters are not allowed in bytes",
        };
        f.write_str(msg)
    }
}
/// Wraps an `EscapeError` in the `SyntaxErrorKind::EscapeError` variant, which
/// also makes `err.into()` available to callers.
impl From<EscapeError> for SyntaxErrorKind {
fn from(err: EscapeError) -> Self {
SyntaxErrorKind::EscapeError(err)
}
}

View File

@ -1,17 +1,17 @@
mod byte; mod unescape;
mod byte_string;
mod char;
mod string;
mod block; mod block;
mod field_expr; mod field_expr;
use crate::{ use crate::{
SourceFile, SyntaxError, AstNode, SyntaxNode, SourceFile, SyntaxError, AstNode, SyntaxNode, TextUnit,
SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR}, SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR},
ast, ast,
algo::visit::{visitor_ctx, VisitorCtx}, algo::visit::{visitor_ctx, VisitorCtx},
}; };
pub(crate) use unescape::EscapeError;
pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> {
let mut errors = Vec::new(); let mut errors = Vec::new();
for node in file.syntax().descendants() { for node in file.syntax().descendants() {
@ -26,11 +26,55 @@ pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> {
// FIXME: kill duplication // FIXME: kill duplication
fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) { fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) {
match literal.token().kind() { let token = literal.token();
BYTE => byte::validate_byte_node(literal.token(), acc), let text = token.text().as_str();
BYTE_STRING => byte_string::validate_byte_string_node(literal.token(), acc), match token.kind() {
STRING => string::validate_string_node(literal.token(), acc), BYTE => {
CHAR => char::validate_char_node(literal.token(), acc), if let Some(end) = text.rfind('\'') {
if let Some(without_quotes) = text.get(2..end) {
if let Err((off, err)) = unescape::unescape_byte(without_quotes) {
let off = token.range().start() + TextUnit::from_usize(off + 2);
acc.push(SyntaxError::new(err.into(), off))
}
}
}
}
CHAR => {
if let Some(end) = text.rfind('\'') {
if let Some(without_quotes) = text.get(1..end) {
if let Err((off, err)) = unescape::unescape_char(without_quotes) {
let off = token.range().start() + TextUnit::from_usize(off + 1);
acc.push(SyntaxError::new(err.into(), off))
}
}
}
}
BYTE_STRING => {
if let Some(end) = text.rfind('\"') {
if let Some(without_quotes) = text.get(2..end) {
unescape::unescape_byte_str(without_quotes, &mut |range, char| {
if let Err(err) = char {
let off = range.start;
let off = token.range().start() + TextUnit::from_usize(off + 2);
acc.push(SyntaxError::new(err.into(), off))
}
})
}
}
}
STRING => {
if let Some(end) = text.rfind('\"') {
if let Some(without_quotes) = text.get(1..end) {
unescape::unescape_str(without_quotes, &mut |range, char| {
if let Err(err) = char {
let off = range.start;
let off = token.range().start() + TextUnit::from_usize(off + 1);
acc.push(SyntaxError::new(err.into(), off))
}
})
}
}
}
_ => (), _ => (),
} }
} }

View File

@ -1,199 +0,0 @@
//! Validation of byte literals
use crate::{
string_lexing::{self, StringComponentKind},
TextRange,
validation::char,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
/// Validates a byte literal token (`b'…'`) and appends every problem found to
/// `errors`.
///
/// Each lexed component is validated individually; afterwards the literal as
/// a whole is checked for a missing closing quote, a trailing suffix, and a
/// component count other than one.
pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let offset = literal_range.start();

    let mut lexer = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text);
    let mut component_count = 0;
    while let Some(component) = lexer.next() {
        component_count += 1;
        // Component ranges are relative to the literal; shift them into file coordinates.
        let text = &literal_text[component.range];
        validate_byte_component(text, component.kind, component.range + offset, errors);
    }

    if !lexer.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedByte, literal_range));
    }
    if let Some(suffix_range) = lexer.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + offset));
    }
    if component_count == 0 {
        errors.push(SyntaxError::new(EmptyByte, literal_range));
    } else if component_count > 1 {
        errors.push(SyntaxError::new(OverlongByte, literal_range));
    }
}
/// Validates a single component of a byte or byte-string literal.
///
/// `text` is the component's source text, `kind` its lexer classification and
/// `range` the range reported with any error pushed to `errors`.
///
/// # Panics
///
/// Panics if a `CodePoint` component has empty text.
pub(super) fn validate_byte_component(
    text: &str,
    kind: StringComponentKind,
    range: TextRange,
    errors: &mut Vec<SyntaxError>,
) {
    use self::StringComponentKind::*;
    match kind {
        IgnoreNewline => { /* always valid */ }
        UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
        AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
        AsciiEscape => validate_byte_escape(text, range, errors),
        CodePoint => {
            let c = text.chars().next().expect("Code points should be one character long");
            match c {
                // These bytes must always be escaped
                '\t' | '\r' | '\n' => errors.push(SyntaxError::new(UnescapedByte, range)),
                _ => {}
            }
            // Only ASCII bytes are allowed
            if !c.is_ascii() {
                errors.push(SyntaxError::new(ByteOutOfRange, range));
            }
        }
    }
}
/// Validates a simple backslash escape inside a byte literal; `text` includes
/// the leading `\`.
fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // Escape sequence consists only of leading `\`
    if text.len() == 1 {
        errors.push(SyntaxError::new(EmptyByteEscape, range));
        return;
    }

    let escape_code = text.chars().nth(1).unwrap();
    let recognised = char::is_ascii_escape(escape_code);
    if !recognised {
        errors.push(SyntaxError::new(InvalidByteEscape, range));
    }
}
/// Validates a `\xDD` escape inside a byte literal; `text` includes the
/// leading `\x`.
///
/// Pushes `MalformedByteCodeEscape` when the text is not ASCII or the two
/// characters after `\x` are not both hexadecimal digits, and
/// `TooShortByteCodeEscape` when fewer than two characters follow `\x`.
///
/// # Panics
///
/// Panics if an all-ASCII `text` is longer than four characters.
fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // A ByteCodeEscape has 4 chars, example: `\xDD`
    if !text.is_ascii() {
        errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
    } else if text.chars().count() < 4 {
        errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
    } else {
        assert!(text.chars().count() == 4, "ByteCodeEscape cannot be longer than 4 chars");

        // Check the digits directly instead of relying on `u8::from_str_radix`,
        // which accepts a leading `+` and would therefore let `\x+5` through.
        // Any two hexadecimal digits fit in a byte, so no range check is needed.
        let digits = &text[2..];
        if !digits.chars().all(|c| c.is_digit(16)) {
            errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
        }
    }
}
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!("const C: u8 = b'{}';", literal);
SourceFile::parse(&src)
}
fn assert_valid_byte(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
fn assert_invalid_byte(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
for byte in 0..128 {
match byte {
b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()),
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
_ => assert_valid_byte(&(byte as char).to_string()),
}
}
for byte in 128..=255u8 {
assert_invalid_byte(&(byte as char).to_string());
}
}
#[test]
fn test_unicode_codepoints() {
let invalid = ["Ƒ", "", "", ""];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_valid_byte_escape() {
let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
for c in &valid {
assert_valid_byte(c);
}
}
#[test]
fn test_invalid_byte_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_valid_byte_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
for c in &valid {
assert_valid_byte(c);
}
}
#[test]
fn test_invalid_byte_code_escape() {
let invalid = [r"\x", r"\x7"];
for c in &invalid {
assert_invalid_byte(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &well_formed {
assert_invalid_byte(c);
}
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_byte(c);
}
}
}

View File

@ -1,169 +0,0 @@
use crate::{
string_lexing::{self, StringComponentKind},
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
use super::byte;
/// Validates a byte-string literal token (`b"…"`) and appends every problem
/// found to `errors`.
///
/// Components are checked with the byte-literal rules, except that line
/// continuations and bare tab / newline / carriage-return characters are
/// accepted. A missing closing quote and a trailing suffix are reported too.
pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let offset = literal_range.start();

    let mut lexer = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text);
    while let Some(component) = lexer.next() {
        let range = component.range + offset;
        if component.kind == StringComponentKind::IgnoreNewline {
            // always valid
            continue;
        }

        // Chars must escape \t, \n and \r codepoints, but strings don't
        let text = &literal_text[component.range];
        let is_bare_whitespace = text == "\t" || text == "\n" || text == "\r";
        if !is_bare_whitespace {
            byte::validate_byte_component(text, component.kind, range, errors);
        }
    }

    if !lexer.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedString, literal_range));
    }
    if let Some(suffix_range) = lexer.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + offset));
    }
}
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
println!("Source: {}", src);
SourceFile::parse(&src)
}
fn assert_valid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
fn assert_invalid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
for byte in 0..128 {
match byte {
b'\"' | b'\\' => { /* Ignore string close and backslash */ }
_ => assert_valid_str(&(byte as char).to_string()),
}
}
for byte in 128..=255u8 {
assert_invalid_str(&(byte as char).to_string());
}
}
#[test]
fn test_unicode_codepoints() {
let invalid = ["Ƒ", "", "", ""];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
for c in &valid {
assert_valid_str(c);
}
}
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7"];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &well_formed {
assert_invalid_str(c);
}
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_str(c);
}
}
#[test]
fn test_mixed_invalid() {
assert_invalid_str(
r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ",
);
}
#[test]
fn test_mixed_valid() {
assert_valid_str(
r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff",
);
}
#[test]
fn test_ignore_newline() {
assert_valid_str(
"Hello \
World",
);
}
}

View File

@ -1,273 +0,0 @@
//! Validation of char literals
use std::u32;
use arrayvec::ArrayString;
use crate::{
string_lexing::{self, StringComponentKind},
TextRange,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
/// Validates a char literal token (`'…'`) and appends every problem found to
/// `errors`.
///
/// Each lexed component is validated individually; afterwards the literal as
/// a whole is checked for a missing closing quote, a trailing suffix, and a
/// component count other than one.
pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let offset = literal_range.start();

    let mut lexer = string_lexing::parse_quoted_literal(None, '\'', literal_text);
    let mut component_count = 0;
    while let Some(component) = lexer.next() {
        component_count += 1;
        // Component ranges are relative to the literal; shift them into file coordinates.
        let text = &literal_text[component.range];
        validate_char_component(text, component.kind, component.range + offset, errors);
    }

    if !lexer.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedChar, literal_range));
    }
    if let Some(suffix_range) = lexer.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + offset));
    }
    if component_count == 0 {
        errors.push(SyntaxError::new(EmptyChar, literal_range));
    } else if component_count > 1 {
        errors.push(SyntaxError::new(OverlongChar, literal_range));
    }
}
/// Validates a single component of a char or string literal.
///
/// `text` is the component's source text, `kind` its lexer classification and
/// `range` the range reported with any error pushed to `errors`.
pub(super) fn validate_char_component(
    text: &str,
    kind: StringComponentKind,
    range: TextRange,
    errors: &mut Vec<SyntaxError>,
) {
    use self::StringComponentKind::*;
    match kind {
        StringComponentKind::IgnoreNewline => { /* always valid */ }
        CodePoint => match text {
            // These code points must always be escaped
            "\t" | "\r" | "\n" => errors.push(SyntaxError::new(UnescapedCodepoint, range)),
            _ => {}
        },
        // Everything else is an escape and gets its own validator
        UnicodeEscape => validate_unicode_escape(text, range, errors),
        AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
        AsciiEscape => validate_ascii_escape(text, range, errors),
    }
}
/// Validates a simple backslash escape inside a char or string literal;
/// `text` includes the leading `\`.
fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is
    // treated as an unclosed char containing a single quote `'`)
    if text.len() == 1 {
        errors.push(SyntaxError::new(EmptyAsciiEscape, range));
        return;
    }

    let escape_code = text.chars().nth(1).unwrap();
    let recognised = is_ascii_escape(escape_code);
    if !recognised {
        errors.push(SyntaxError::new(InvalidAsciiEscape, range));
    }
}
/// Returns `true` when `code` is a character that may directly follow a `\`
/// in a simple escape: `\\`, `\'`, `\"`, `\n`, `\r`, `\t` or `\0`.
pub(super) fn is_ascii_escape(code: char) -> bool {
    const ESCAPE_CODES: [char; 7] = ['\\', '\'', '"', 'n', 'r', 't', '0'];
    ESCAPE_CODES.contains(&code)
}
/// Validates a `\xDD` escape inside a char or string literal; `text` includes
/// the leading `\x`.
///
/// Pushes `AsciiCodeEscapeOutOfRange` when the text is not ASCII or the value
/// is above `0x7F`, `TooShortAsciiCodeEscape` when fewer than two characters
/// follow `\x`, and `MalformedAsciiCodeEscape` when those two characters are
/// not both hexadecimal digits.
///
/// # Panics
///
/// Panics if an all-ASCII `text` is longer than four characters.
fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    // An AsciiCodeEscape has 4 chars, example: `\xDD`
    if !text.is_ascii() {
        // FIXME: Give a more precise error message (say what the invalid character was)
        errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
    } else if text.chars().count() < 4 {
        errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
    } else {
        assert_eq!(
            text.chars().count(),
            4,
            "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
            text,
        );

        let digits = &text[2..];
        // `u8::from_str_radix` accepts a leading `+`, so `\x+5` would parse
        // successfully; require real hexadecimal digits before parsing.
        if !digits.chars().all(|c| c.is_digit(16)) {
            errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range));
            return;
        }

        match u8::from_str_radix(digits, 16) {
            Ok(code) if code < 128 => { /* Escape code is valid */ }
            Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
            Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
        }
    }
}
/// Validates a `\u{…}` escape; `text` includes the leading `\u`.
///
/// Reports a missing `{`, a missing `}`, characters that are neither
/// hexadecimal digits nor `_`, a leading `_`, more than six digits, no digits
/// at all, and values above `0x10FFFF`.
///
/// # Panics
///
/// Panics if `text` does not start with `\u`, or if any character follows
/// the closing `}`.
fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
    assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");

    match text.len() {
        // No starting `{`
        2 => {
            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
            return;
        }
        // Only starting `{`
        3 => {
            errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
            return;
        }
        _ => {}
    }

    // Collected hexadecimal digits; underscores are skipped.
    let mut code = ArrayString::<[_; 6]>::new();
    let mut closed = false;
    for c in text[3..].chars() {
        assert!(!closed, "no characters after escape is closed");

        match c {
            '}' => closed = true,
            '_' => {
                // Reject leading _
                if code.len() == 0 {
                    errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
                    return;
                }
            }
            digit if digit.is_digit(16) => {
                if code.len() == 6 {
                    errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
                    return;
                }
                code.push(digit);
            }
            _ => {
                errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
                return;
            }
        }
    }

    // A missing `}` does not stop validation: the digits are still checked.
    if !closed {
        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
    }

    if code.len() == 0 {
        errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
        return;
    }

    match u32::from_str_radix(&code, 16) {
        Err(_) => errors.push(SyntaxError::new(MalformedUnicodeEscape, range)),
        Ok(code_u32) => {
            if code_u32 > 0x10FFFF {
                errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
            }
        }
    }
}
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!("const C: char = '{}';", literal);
SourceFile::parse(&src)
}
fn assert_valid_char(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
fn assert_invalid_char(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
#[test]
fn test_ansi_codepoints() {
for byte in 0..=255u8 {
match byte {
b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
_ => assert_valid_char(&(byte as char).to_string()),
}
}
}
#[test]
fn test_unicode_codepoints() {
let valid = ["Ƒ", "", "", ""];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_unicode_multiple_codepoints() {
let invalid = ["नी", "👨‍👨‍"];
for c in &invalid {
assert_invalid_char(c);
}
}
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_char(c);
}
}
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55"];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7", r"\xF0"];
for c in &invalid {
assert_invalid_char(c);
}
}
#[test]
fn test_valid_unicode_escape() {
let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &valid {
assert_valid_char(c);
}
}
#[test]
fn test_invalid_unicode_escape() {
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_char(c);
}
}
}

View File

@ -1,154 +0,0 @@
use crate::{
string_lexing,
SyntaxError,
SyntaxErrorKind::*,
SyntaxToken,
};
use super::char;
/// Validates a string literal token (`"…"`) and appends every problem found
/// to `errors`.
///
/// Components are checked with the char-literal rules, except that bare tab /
/// newline / carriage-return characters are accepted. A missing closing quote
/// and a trailing suffix are reported too.
pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
    let literal_text = node.text();
    let literal_range = node.range();
    let offset = literal_range.start();

    let mut lexer = string_lexing::parse_quoted_literal(None, '"', literal_text);
    while let Some(component) = lexer.next() {
        let range = component.range + offset;

        // Chars must escape \t, \n and \r codepoints, but strings don't
        let text = &literal_text[component.range];
        let is_bare_whitespace = text == "\t" || text == "\n" || text == "\r";
        if !is_bare_whitespace {
            char::validate_char_component(text, component.kind, range, errors);
        }
    }

    if !lexer.has_closing_quote {
        errors.push(SyntaxError::new(UnclosedString, literal_range));
    }
    if let Some(suffix_range) = lexer.suffix {
        errors.push(SyntaxError::new(InvalidSuffix, suffix_range + offset));
    }
}
// Tests for string literal validation. Every case splices a literal body into a
// `const` item, parses the resulting source and inspects the reported errors.
#[cfg(test)]
mod test {
use crate::{SourceFile, TreeArc};
/// Parses `const S: &'static str = "<literal>";` and returns the parsed file.
/// The generated source is printed so it shows up next to a failing assertion.
fn build_file(literal: &str) -> TreeArc<SourceFile> {
let src = format!(r#"const S: &'static str = "{}";"#, literal);
println!("Source: {}", src);
SourceFile::parse(&src)
}
/// Asserts that parsing a string literal with the given body reports no errors.
fn assert_valid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
}
/// Asserts that parsing a string literal with the given body reports at least one error.
fn assert_invalid_str(literal: &str) {
let file = build_file(literal);
assert!(file.errors().len() > 0);
}
// Every byte value converted to a char is accepted, except the two that would
// change the structure of the literal itself.
#[test]
fn test_ansi_codepoints() {
for byte in 0..=255u8 {
match byte {
b'\"' | b'\\' => { /* Ignore string close and backslash */ }
_ => assert_valid_str(&(byte as char).to_string()),
}
}
}
// Literal bodies written directly as (non-ASCII) text are accepted.
#[test]
fn test_unicode_codepoints() {
let valid = ["Ƒ", "", "", ""];
for c in &valid {
assert_valid_str(c);
}
}
// Unlike a char literal, a string may hold several code points.
#[test]
fn test_unicode_multiple_codepoints() {
let valid = ["नी", "👨‍👨‍"];
for c in &valid {
assert_valid_str(c);
}
}
// Simple backslash escapes and plain letters are accepted.
#[test]
fn test_valid_ascii_escape() {
let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
for c in &valid {
assert_valid_str(c);
}
}
// Unknown escape letters and a trailing backslash are rejected.
#[test]
fn test_invalid_ascii_escape() {
let invalid = [r"\a", r"\?", r"\"];
for c in &invalid {
assert_invalid_str(c);
}
}
// Two-digit `\x` escapes up to 0x7F are accepted.
#[test]
fn test_valid_ascii_code_escape() {
let valid = [r"\x00", r"\x7F", r"\x55"];
for c in &valid {
assert_valid_str(c);
}
}
// Truncated `\x` escapes and values above 0x7F are rejected.
#[test]
fn test_invalid_ascii_code_escape() {
let invalid = [r"\x", r"\x7", r"\xF0"];
for c in &invalid {
assert_invalid_str(c);
}
}
// Well-formed `\u{...}` escapes, including one padded with underscores.
#[test]
fn test_valid_unicode_escape() {
let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
for c in &valid {
assert_valid_str(c);
}
}
// Malformed `\u` escapes: missing or unbalanced braces, no digits, a leading
// underscore, too many digits, or a value past the last code point.
#[test]
fn test_invalid_unicode_escape() {
let invalid = [
r"\u",
r"\u{}",
r"\u{",
r"\u{FF",
r"\u{FFFFFF}",
r"\u{_F}",
r"\u{00FFFFF}",
r"\u{110000}",
];
for c in &invalid {
assert_invalid_str(c);
}
}
// A longer literal mixing raw newlines, emoji and a unicode escape.
#[test]
fn test_mixed() {
assert_valid_str(
r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ",
);
}
// A backslash directly followed by a line break is accepted inside a string.
#[test]
fn test_ignore_newline() {
assert_valid_str(
"Hello \
World",
);
}
}

View File

@ -0,0 +1,521 @@
//! Utilities for validating string and char literals and turning them into
//! values they represent.
//!
//! This file is copy-pasted from the compiler
//!
//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
//!
//! Hopefully, we'll share this code in a proper way some day
use std::str::Chars;
use std::ops::Range;
/// Errors that can occur while unescaping the contents of a char, byte,
/// string or byte string literal.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub enum EscapeError {
/// A char or byte literal has no contents at all.
ZeroChars,
/// A char or byte literal has further input after its first (possibly escaped) character.
MoreThanOneChar,
/// A backslash is the last character of the input.
LoneSlash,
/// The character after a backslash does not start any known escape.
InvalidEscape,
/// A raw `\r` that is not immediately followed by `\n`.
BareCarriageReturn,
/// A character that has to be written as an escape appeared literally,
/// e.g. a raw tab or newline, or the quote that delimits the literal.
EscapeOnlyChar,
/// The input ended before both digits of a `\x` escape were read.
TooShortHexEscape,
/// A `\x` escape contains a character that is not a hexadecimal digit.
InvalidCharInHexEscape,
/// A `\x` escape in a char or string literal encodes a value above `0x7F`.
OutOfRangeHexEscape,
/// `\u` is not followed by `{`.
NoBraceInUnicodeEscape,
/// A `\u{...}` escape contains a character that is not a hexadecimal digit.
InvalidCharInUnicodeEscape,
/// A `\u{}` escape without any digits.
EmptyUnicodeEscape,
/// The input ended before the closing `}` of a `\u{...}` escape.
UnclosedUnicodeEscape,
/// The first character inside `\u{...}` is an underscore.
LeadingUnderscoreUnicodeEscape,
/// A `\u{...}` escape has more than six hexadecimal digits.
OverlongUnicodeEscape,
/// A `\u{...}` escape encodes a value that is not above `0x10FFFF` yet is
/// not a valid `char`.
LoneSurrogateUnicodeEscape,
/// A `\u{...}` escape encodes a value above `0x10FFFF`.
OutOfRangeUnicodeEscape,
/// A `\u{...}` escape was used in a byte or byte string literal.
UnicodeEscapeInByte,
/// A non-ASCII character was written literally in a byte or byte string literal.
NonAsciiCharInByte,
}
/// Takes the contents of a char literal (without quotes) and returns the
/// unescaped char.
///
/// On failure the error is paired with the number of bytes of `literal_text`
/// that had been consumed when the error was detected.
pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
    let mut rest = literal_text.chars();
    match unescape_char_or_byte(&mut rest, Mode::Char) {
        Ok(unescaped) => Ok(unescaped),
        Err(err) => {
            let consumed = literal_text.len() - rest.as_str().len();
            Err((consumed, err))
        }
    }
}
/// Takes the contents of a string literal (without quotes) and reports every
/// unescaped character, or the error found in its place, to `callback`.
///
/// Each report carries the byte range of `literal_text` it was produced from.
pub(crate) fn unescape_str<F: FnMut(Range<usize>, Result<char, EscapeError>)>(
    literal_text: &str,
    callback: &mut F,
) {
    unescape_str_or_byte_str(literal_text, Mode::Str, callback)
}
pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Byte)
.map(byte_from_char)
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}
/// Takes the contents of a byte string literal (without quotes) and reports
/// every unescaped byte, or the error found in its place, to `callback`.
///
/// Each report carries the byte range of `literal_text` it was produced from.
pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
    // The shared scanner yields chars; narrow every successful one to a byte
    // before handing it to the caller.
    let mut forward = |range: Range<usize>, unescaped: Result<char, EscapeError>| {
        callback(range, unescaped.map(byte_from_char))
    };
    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut forward)
}
/// The kind of literal whose contents are being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// True for the literal kinds delimited by `'`: chars and bytes.
    fn in_single_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => false,
            Mode::Char | Mode::Byte => true,
        }
    }

    /// True for the literal kinds delimited by `"`: strings and byte strings.
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// True for byte and byte string literals.
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
/// Unescapes one character of a literal.
///
/// `first_char` has already been taken from `chars`. When it is a backslash the
/// rest of the escape sequence is consumed from `chars`; otherwise the
/// character is only checked for being allowed to appear literally in `mode`.
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    // Anything but a backslash is decided right here.
    match first_char {
        '\\' => {}
        '\t' | '\n' => return Err(EscapeError::EscapeOnlyChar),
        '\r' => {
            // Peek without consuming: only a `\r` with no `\n` after it is "bare".
            let followed_by_newline = chars.clone().next() == Some('\n');
            return Err(if followed_by_newline {
                EscapeError::EscapeOnlyChar
            } else {
                EscapeError::BareCarriageReturn
            });
        }
        '\'' if mode.in_single_quotes() => return Err(EscapeError::EscapeOnlyChar),
        '"' if mode.in_double_quotes() => return Err(EscapeError::EscapeOnlyChar),
        literal => {
            return if mode.is_bytes() && !literal.is_ascii() {
                Err(EscapeError::NonAsciiCharInByte)
            } else {
                Ok(literal)
            };
        }
    }

    let escape_kind = match chars.next() {
        Some(c) => c,
        None => return Err(EscapeError::LoneSlash),
    };

    match escape_kind {
        '"' => Ok('"'),
        'n' => Ok('\n'),
        'r' => Ok('\r'),
        't' => Ok('\t'),
        '\\' => Ok('\\'),
        '\'' => Ok('\''),
        '0' => Ok('\0'),
        'x' => {
            // Exactly two hex digits; the first is validated before the second is read.
            let hi = chars
                .next()
                .ok_or(EscapeError::TooShortHexEscape)?
                .to_digit(16)
                .ok_or(EscapeError::InvalidCharInHexEscape)?;
            let lo = chars
                .next()
                .ok_or(EscapeError::TooShortHexEscape)?
                .to_digit(16)
                .ok_or(EscapeError::InvalidCharInHexEscape)?;
            let value = hi * 16 + lo;

            // Only byte literals may use the upper half of the byte range.
            if !mode.is_bytes() && !is_ascii(value) {
                return Err(EscapeError::OutOfRangeHexEscape);
            }
            Ok(value as u8 as char)
        }
        'u' => {
            if chars.next() != Some('{') {
                return Err(EscapeError::NoBraceInUnicodeEscape);
            }

            // The first character after `{` has to be a hex digit.
            let leading = chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)?;
            let mut value: u32 = match leading {
                '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
                '}' => return Err(EscapeError::EmptyUnicodeEscape),
                c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
            };
            let mut n_digits = 1;

            loop {
                let c = match chars.next() {
                    Some(c) => c,
                    None => return Err(EscapeError::UnclosedUnicodeEscape),
                };
                match c {
                    // Underscores are separators and do not count as digits.
                    '_' => {}
                    '}' => {
                        if n_digits > 6 {
                            return Err(EscapeError::OverlongUnicodeEscape);
                        }
                        if mode.is_bytes() {
                            return Err(EscapeError::UnicodeEscapeInByte);
                        }
                        return match std::char::from_u32(value) {
                            Some(unescaped) => Ok(unescaped),
                            None if value > 0x10FFFF => Err(EscapeError::OutOfRangeUnicodeEscape),
                            None => Err(EscapeError::LoneSurrogateUnicodeEscape),
                        };
                    }
                    _ => {
                        let digit =
                            c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
                        n_digits += 1;
                        // Digits past the sixth are still scanned, so the whole
                        // escape is consumed, but no longer folded into `value`.
                        if n_digits <= 6 {
                            value = value * 16 + digit;
                        }
                    }
                }
            }
        }
        _ => Err(EscapeError::InvalidEscape),
    }
}
/// Unescapes the contents of a literal that must hold exactly one character.
///
/// Fails with `ZeroChars` when `chars` is empty and with `MoreThanOneChar` when
/// input remains after the first (possibly escaped) character.
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    let first_char = match chars.next() {
        Some(c) => c,
        None => return Err(EscapeError::ZeroChars),
    };
    let unescaped = scan_escape(first_char, chars, mode)?;
    match chars.next() {
        Some(_) => Err(EscapeError::MoreThanOneChar),
        None => Ok(unescaped),
    }
}
/// Takes the contents of a string or byte string literal (without quotes) and
/// reports every unescaped character, or the error found in its place, to
/// `callback` together with the byte range of `src` it came from.
///
/// Panics when `mode` is not one of the double-quoted modes.
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    /// Advances `chars` past any leading spaces, tabs, `\n` and `\r`.
    fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
        let rest = chars
            .as_str()
            .trim_start_matches(|c: char| c == ' ' || c == '\t' || c == '\n' || c == '\r');
        *chars = rest.chars()
    }

    assert!(mode.in_double_quotes());
    let total_len = src.len();
    let mut chars = src.chars();

    loop {
        // Byte offset of the character that is about to be read.
        let start = total_len - chars.as_str().len();
        let first_char = match chars.next() {
            Some(c) => c,
            None => break,
        };

        let unescaped_char = match first_char {
            '\\' => {
                // A backslash right before a line break swallows the break and
                // the whitespace after it; nothing is reported for it.
                let mut lookahead = chars.clone();
                let is_line_continuation = match (lookahead.next(), lookahead.next()) {
                    (Some('\n'), _) | (Some('\r'), Some('\n')) => true,
                    _ => false,
                };
                if is_line_continuation {
                    skip_ascii_whitespace(&mut chars);
                    continue;
                }
                scan_escape(first_char, &mut chars, mode)
            }
            // `\r\n` collapses into a single `\n`.
            '\r' if chars.clone().next() == Some('\n') => {
                chars.next();
                Ok('\n')
            }
            '\n' => Ok('\n'),
            '\t' => Ok('\t'),
            _ => scan_escape(first_char, &mut chars, mode),
        };

        let end = total_len - chars.as_str().len();
        callback(start..end, unescaped_char);
    }
}
/// Narrows a char to a byte.
///
/// Panics when the code point of `c` does not fit into a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code_point as u8
}
/// Returns true when `x` lies in the ASCII range `0x00..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every listed char literal body must fail with exactly the listed error.
    #[test]
    fn test_unescape_char_bad() {
        let cases: &[(&str, EscapeError)] = &[
            ("", EscapeError::ZeroChars),
            (r"\", EscapeError::LoneSlash),
            ("\n", EscapeError::EscapeOnlyChar),
            ("\r\n", EscapeError::EscapeOnlyChar),
            ("\t", EscapeError::EscapeOnlyChar),
            ("'", EscapeError::EscapeOnlyChar),
            ("\r", EscapeError::BareCarriageReturn),
            ("spam", EscapeError::MoreThanOneChar),
            (r"\x0ff", EscapeError::MoreThanOneChar),
            (r#"\"a"#, EscapeError::MoreThanOneChar),
            (r"\na", EscapeError::MoreThanOneChar),
            (r"\ra", EscapeError::MoreThanOneChar),
            (r"\ta", EscapeError::MoreThanOneChar),
            (r"\\a", EscapeError::MoreThanOneChar),
            (r"\'a", EscapeError::MoreThanOneChar),
            (r"\0a", EscapeError::MoreThanOneChar),
            (r"\u{0}x", EscapeError::MoreThanOneChar),
            (r"\u{1F63b}}", EscapeError::MoreThanOneChar),
            (r"\v", EscapeError::InvalidEscape),
            (r"\💩", EscapeError::InvalidEscape),
            (r"\●", EscapeError::InvalidEscape),
            (r"\x", EscapeError::TooShortHexEscape),
            (r"\x0", EscapeError::TooShortHexEscape),
            (r"\xf", EscapeError::TooShortHexEscape),
            (r"\xa", EscapeError::TooShortHexEscape),
            (r"\xx", EscapeError::InvalidCharInHexEscape),
            (r"\xы", EscapeError::InvalidCharInHexEscape),
            (r"\x🦀", EscapeError::InvalidCharInHexEscape),
            (r"\xtt", EscapeError::InvalidCharInHexEscape),
            (r"\xff", EscapeError::OutOfRangeHexEscape),
            (r"\xFF", EscapeError::OutOfRangeHexEscape),
            (r"\x80", EscapeError::OutOfRangeHexEscape),
            (r"\u", EscapeError::NoBraceInUnicodeEscape),
            (r"\u[0123]", EscapeError::NoBraceInUnicodeEscape),
            (r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape),
            (r"\u{", EscapeError::UnclosedUnicodeEscape),
            (r"\u{0000", EscapeError::UnclosedUnicodeEscape),
            (r"\u{}", EscapeError::EmptyUnicodeEscape),
            (r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape),
            (r"\u{0000000}", EscapeError::OverlongUnicodeEscape),
            (r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape),
            (r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape),
            (r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape),
            (r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape),
        ];
        for (literal_text, expected_error) in cases {
            // Only the error kind is compared; the reported offset is dropped.
            let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
            assert_eq!(actual_result, Err(expected_error.clone()));
        }
    }

    /// Every listed char literal body must unescape to the listed char.
    #[test]
    fn test_unescape_char_good() {
        let cases: &[(&str, char)] = &[
            ("a", 'a'),
            ("ы", 'ы'),
            ("🦀", '🦀'),
            (r#"\""#, '"'),
            (r"\n", '\n'),
            (r"\r", '\r'),
            (r"\t", '\t'),
            (r"\\", '\\'),
            (r"\'", '\''),
            (r"\0", '\0'),
            (r"\x00", '\0'),
            (r"\x5a", 'Z'),
            (r"\x5A", 'Z'),
            (r"\x7f", 127 as char),
            (r"\u{0}", '\0'),
            (r"\u{000000}", '\0'),
            (r"\u{41}", 'A'),
            (r"\u{0041}", 'A'),
            (r"\u{00_41}", 'A'),
            (r"\u{4__1__}", 'A'),
            (r"\u{1F63b}", '😻'),
        ];
        for &(literal_text, expected_char) in cases {
            let actual_result = unescape_char(literal_text);
            assert_eq!(actual_result, Ok(expected_char));
        }
    }

    /// String literal bodies that must unescape without any error.
    #[test]
    fn test_unescape_str_good() {
        fn check(literal_text: &str, expected: &str) {
            let mut unescaped = String::with_capacity(literal_text.len());
            let mut first_error = None;
            unescape_str(literal_text, &mut |range, c| {
                // After the first error nothing more is recorded.
                if first_error.is_some() {
                    return;
                }
                match c {
                    Ok(c) => unescaped.push(c),
                    Err(e) => first_error = Some((range, e)),
                }
            });
            let outcome = match first_error {
                None => Ok(unescaped.as_str()),
                Some(failure) => Err(failure),
            };
            assert_eq!(outcome, Ok(expected))
        }

        check("foo", "foo");
        check("", "");
        check(" \t\n\r\n", " \t\n\n");

        check("hello \\\n     world", "hello world");
        check("hello \\\r\n     world", "hello world");
        check("thread's", "thread's")
    }

    /// Every listed byte literal body must fail with exactly the listed error.
    #[test]
    fn test_unescape_byte_bad() {
        let cases: &[(&str, EscapeError)] = &[
            ("", EscapeError::ZeroChars),
            (r"\", EscapeError::LoneSlash),
            ("\n", EscapeError::EscapeOnlyChar),
            ("\r\n", EscapeError::EscapeOnlyChar),
            ("\t", EscapeError::EscapeOnlyChar),
            ("'", EscapeError::EscapeOnlyChar),
            ("\r", EscapeError::BareCarriageReturn),
            ("spam", EscapeError::MoreThanOneChar),
            (r"\x0ff", EscapeError::MoreThanOneChar),
            (r#"\"a"#, EscapeError::MoreThanOneChar),
            (r"\na", EscapeError::MoreThanOneChar),
            (r"\ra", EscapeError::MoreThanOneChar),
            (r"\ta", EscapeError::MoreThanOneChar),
            (r"\\a", EscapeError::MoreThanOneChar),
            (r"\'a", EscapeError::MoreThanOneChar),
            (r"\0a", EscapeError::MoreThanOneChar),
            (r"\v", EscapeError::InvalidEscape),
            (r"\💩", EscapeError::InvalidEscape),
            (r"\●", EscapeError::InvalidEscape),
            (r"\x", EscapeError::TooShortHexEscape),
            (r"\x0", EscapeError::TooShortHexEscape),
            (r"\xa", EscapeError::TooShortHexEscape),
            (r"\xf", EscapeError::TooShortHexEscape),
            (r"\xx", EscapeError::InvalidCharInHexEscape),
            (r"\xы", EscapeError::InvalidCharInHexEscape),
            (r"\x🦀", EscapeError::InvalidCharInHexEscape),
            (r"\xtt", EscapeError::InvalidCharInHexEscape),
            (r"\u", EscapeError::NoBraceInUnicodeEscape),
            (r"\u[0123]", EscapeError::NoBraceInUnicodeEscape),
            (r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape),
            (r"\u{", EscapeError::UnclosedUnicodeEscape),
            (r"\u{0000", EscapeError::UnclosedUnicodeEscape),
            (r"\u{}", EscapeError::EmptyUnicodeEscape),
            (r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape),
            (r"\u{0000000}", EscapeError::OverlongUnicodeEscape),
            ("ы", EscapeError::NonAsciiCharInByte),
            ("🦀", EscapeError::NonAsciiCharInByte),
            (r"\u{0}", EscapeError::UnicodeEscapeInByte),
            (r"\u{000000}", EscapeError::UnicodeEscapeInByte),
            (r"\u{41}", EscapeError::UnicodeEscapeInByte),
            (r"\u{0041}", EscapeError::UnicodeEscapeInByte),
            (r"\u{00_41}", EscapeError::UnicodeEscapeInByte),
            (r"\u{4__1__}", EscapeError::UnicodeEscapeInByte),
            (r"\u{1F63b}", EscapeError::UnicodeEscapeInByte),
            (r"\u{0}x", EscapeError::UnicodeEscapeInByte),
            (r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte),
            (r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte),
            (r"\u{ffffff}", EscapeError::UnicodeEscapeInByte),
            (r"\u{ffffff}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DC00}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DDDD}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DFFF}", EscapeError::UnicodeEscapeInByte),
            (r"\u{D800}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DAAA}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DBFF}", EscapeError::UnicodeEscapeInByte),
        ];
        for (literal_text, expected_error) in cases {
            // Only the error kind is compared; the reported offset is dropped.
            let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
            assert_eq!(actual_result, Err(expected_error.clone()));
        }
    }

    /// Every listed byte literal body must unescape to the listed byte.
    #[test]
    fn test_unescape_byte_good() {
        let cases: &[(&str, u8)] = &[
            ("a", b'a'),
            (r#"\""#, b'"'),
            (r"\n", b'\n'),
            (r"\r", b'\r'),
            (r"\t", b'\t'),
            (r"\\", b'\\'),
            (r"\'", b'\''),
            (r"\0", b'\0'),
            (r"\x00", b'\0'),
            (r"\x5a", b'Z'),
            (r"\x5A", b'Z'),
            (r"\x7f", 127),
            (r"\x80", 128),
            (r"\xff", 255),
            (r"\xFF", 255),
        ];
        for &(literal_text, expected_byte) in cases {
            let actual_result = unescape_byte(literal_text);
            assert_eq!(actual_result, Ok(expected_byte));
        }
    }

    /// Byte string literal bodies that must unescape without any error.
    #[test]
    fn test_unescape_byte_str_good() {
        fn check(literal_text: &str, expected: &[u8]) {
            let mut unescaped = Vec::with_capacity(literal_text.len());
            let mut first_error = None;
            unescape_byte_str(literal_text, &mut |range, c| {
                // After the first error nothing more is recorded.
                if first_error.is_some() {
                    return;
                }
                match c {
                    Ok(c) => unescaped.push(c),
                    Err(e) => first_error = Some((range, e)),
                }
            });
            let outcome = match first_error {
                None => Ok(unescaped.as_slice()),
                Some(failure) => Err(failure),
            };
            assert_eq!(outcome, Ok(expected))
        }

        check("foo", b"foo");
        check("", b"");
        check(" \t\n\r\n", b" \t\n\n");

        check("hello \\\n     world", b"hello world");
        check("hello \\\r\n     world", b"hello world");
        check("thread's", b"thread's")
    }
}

View File

@ -40,7 +40,6 @@ SOURCE_FILE@[0; 112)
WHITESPACE@[43; 44) " " WHITESPACE@[43; 44) " "
LITERAL@[44; 59) LITERAL@[44; 59)
STRING@[44; 59) "\"string\"invalid" STRING@[44; 59) "\"string\"invalid"
err: `Invalid literal suffix`
SEMI@[59; 60) ";" SEMI@[59; 60) ";"
WHITESPACE@[60; 65) "\n " WHITESPACE@[60; 65) "\n "
LET_STMT@[65; 83) LET_STMT@[65; 83)
@ -53,7 +52,6 @@ SOURCE_FILE@[0; 112)
WHITESPACE@[72; 73) " " WHITESPACE@[72; 73) " "
LITERAL@[73; 82) LITERAL@[73; 82)
BYTE@[73; 82) "b\'b\'_suff" BYTE@[73; 82) "b\'b\'_suff"
err: `Invalid literal suffix`
SEMI@[82; 83) ";" SEMI@[82; 83) ";"
WHITESPACE@[83; 88) "\n " WHITESPACE@[83; 88) "\n "
LET_STMT@[88; 109) LET_STMT@[88; 109)
@ -66,7 +64,6 @@ SOURCE_FILE@[0; 112)
WHITESPACE@[95; 96) " " WHITESPACE@[95; 96) " "
LITERAL@[96; 108) LITERAL@[96; 108)
BYTE_STRING@[96; 108) "b\"bs\"invalid" BYTE_STRING@[96; 108) "b\"bs\"invalid"
err: `Invalid literal suffix`
SEMI@[108; 109) ";" SEMI@[108; 109) ";"
WHITESPACE@[109; 110) "\n" WHITESPACE@[109; 110) "\n"
R_CURLY@[110; 111) "}" R_CURLY@[110; 111) "}"