From cf1caf518122b84b9516e1b9f65ba778f1900bf3 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Tue, 2 Apr 2019 18:18:00 +0300 Subject: [PATCH] simplify --- crates/ra_syntax/src/string_lexing.rs | 338 +++++++++++++++++- crates/ra_syntax/src/string_lexing/parser.rs | 168 --------- crates/ra_syntax/src/string_lexing/string.rs | 222 ------------ crates/ra_syntax/src/validation/byte.rs | 2 +- .../ra_syntax/src/validation/byte_string.rs | 2 +- crates/ra_syntax/src/validation/char.rs | 2 +- crates/ra_syntax/src/validation/string.rs | 2 +- 7 files changed, 336 insertions(+), 400 deletions(-) delete mode 100644 crates/ra_syntax/src/string_lexing/parser.rs delete mode 100644 crates/ra_syntax/src/string_lexing/string.rs diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs index 349733f3fbf..4c3eea3d2e9 100644 --- a/crates/ra_syntax/src/string_lexing.rs +++ b/crates/ra_syntax/src/string_lexing.rs @@ -1,7 +1,333 @@ -mod parser; -mod string; +use crate::{TextRange, TextUnit}; +use self::StringComponentKind::*; -pub use self::{ - parser::{StringComponent, StringComponentKind}, - string::{parse_string_literal, parse_char_literal, parse_byte_literal, parse_byte_string_literal}, -}; +#[derive(Debug, Eq, PartialEq, Clone)] +pub(crate) struct StringComponent { + pub(crate) range: TextRange, + pub(crate) kind: StringComponentKind, +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub(crate) enum StringComponentKind { + IgnoreNewline, + CodePoint, + AsciiEscape, + AsciiCodeEscape, + UnicodeEscape, +} + +pub(crate) fn parse_quoted_literal( + prefix: Option, + quote: char, + src: &str, +) -> StringComponentIter { + let prefix = prefix.map(|p| match p { + 'b' => b'b', + _ => panic!("invalid prefix"), + }); + let quote = match quote { + '\'' => b'\'', + '"' => b'"', + _ => panic!("invalid quote"), + }; + StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None } +} + +pub(crate) struct StringComponentIter<'a> { + src: &'a str, + prefix: Option, + quote: u8, + pos: usize, + pub(crate) has_closing_quote: bool, + pub(crate) suffix: Option, +} + +impl<'a> Iterator for StringComponentIter<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.pos == 0 { + if let Some(prefix) = self.prefix { + assert!( + self.advance() == prefix as char, + "literal should start with a {:?}", + prefix as char, + ); + } + assert!( + self.advance() == self.quote as char, + "literal should start with a {:?}", + self.quote as char, + ); + } + + if let Some(component) = self.parse_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.peek() == Some(self.quote as char) { + self.advance(); + self.has_closing_quote = true; + if let Some(range) = self.parse_suffix() { + self.suffix = Some(range); + } + } + + assert!( + self.peek() == None, + "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}", + self.src, + self.pos, + self.src.len() + ); + + None + } +} + +impl<'a> StringComponentIter<'a> { + fn peek(&self) -> Option { + if self.pos == self.src.len() { + return None; + } + + self.src[self.pos..].chars().next() + } + + fn advance(&mut self) -> char { + let next = self.peek().expect("cannot advance if end of input is reached"); + self.pos += next.len_utf8(); + next + } + + fn parse_component(&mut self) -> Option { + let next = self.peek()?; + + // Ignore string close + if next == self.quote as char { + return None; + } + + let start = self.start_range(); + self.advance(); + + if next == '\\' { + // Strings can use `\` to ignore newlines, so we first try to parse one of those + // before falling back to parsing char escapes + if self.quote == b'"' { + if let Some(component) = self.parse_ignore_newline(start) { + return Some(component); + } + } + + Some(self.parse_escape(start)) + } else { + Some(self.finish_component(start, CodePoint)) + } + } + + fn parse_ignore_newline(&mut self, start: TextUnit) -> Option { + // In string literals, when a `\` occurs immediately before the newline, the `\`, + // the newline, and all whitespace at the beginning of the next line are ignored + match self.peek() { + Some('\n') | Some('\r') => { + self.skip_whitespace(); + Some(self.finish_component(start, IgnoreNewline)) + } + _ => None, + } + } + + fn skip_whitespace(&mut self) { + while self.peek().map(|c| c.is_whitespace()) == Some(true) { + self.advance(); + } + } + + fn parse_escape(&mut self, start: TextUnit) -> StringComponent { + if self.peek().is_none() { + return self.finish_component(start, AsciiEscape); + } + + let next = self.advance(); + match next { + 'x' => self.parse_ascii_code_escape(start), + 'u' => self.parse_unicode_escape(start), + _ => self.finish_component(start, AsciiEscape), + } + } + + fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent { + match self.peek() { + Some('{') => { + self.advance(); + + // Parse anything until we reach `}` + while let Some(next) = self.peek() { + self.advance(); + if next == '}' { + break; + } + } + + self.finish_component(start, UnicodeEscape) + } + Some(_) | None => self.finish_component(start, UnicodeEscape), + } + } + + fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent { + let code_start = self.pos; + while let Some(next) = self.peek() { + if next == '\'' || (self.pos - code_start == 2) { + break; + } + + self.advance(); + } + self.finish_component(start, AsciiCodeEscape) + } + + fn parse_suffix(&mut self) -> Option { + let start = self.start_range(); + let _ = self.peek()?; + while let Some(_) = self.peek() { + self.advance(); + } + Some(self.finish_range(start)) + } + + fn start_range(&self) -> TextUnit { + TextUnit::from_usize(self.pos) + } + + fn finish_range(&self, start: TextUnit) -> TextRange { + TextRange::from_to(start, TextUnit::from_usize(self.pos)) + } + + fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent { + let range = self.finish_range(start); + StringComponent { range, kind } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(src: &str) -> (bool, Vec) { + let component_iterator = &mut parse_quoted_literal(None, '\'', src); + let components: Vec<_> = component_iterator.collect(); + (component_iterator.has_closing_quote, components) + } + + fn unclosed_char_component(src: &str) -> StringComponent { + let (has_closing_quote, components) = parse(src); + assert!(!has_closing_quote, "char should not have closing quote"); + assert!(components.len() == 1); + components[0].clone() + } + + fn closed_char_component(src: &str) -> StringComponent { + let (has_closing_quote, components) = parse(src); + assert!(has_closing_quote, "char should have closing quote"); + assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components); + components[0].clone() + } + + fn closed_char_components(src: &str) -> Vec { + let (has_closing_quote, components) = parse(src); + assert!(has_closing_quote, "char should have closing quote"); + components + } + + fn range_closed(src: &str) -> TextRange { + TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) + } + + fn range_unclosed(src: &str) -> TextRange { + TextRange::from_to(1.into(), (src.len() as u32).into()) + } + + #[test] + fn test_unicode_escapes() { + let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; + for escape in unicode_escapes { + let escape_sequence = format!(r"'\u{}'", escape); + let component = closed_char_component(&escape_sequence); + let expected_range = range_closed(&escape_sequence); + assert_eq!(component.kind, UnicodeEscape); + assert_eq!(component.range, expected_range); + } + } + + #[test] + fn test_unicode_escapes_unclosed() { + let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; + for escape in unicode_escapes { + let escape_sequence = format!(r"'\u{}'", escape); + let component = unclosed_char_component(&escape_sequence); + let expected_range = range_unclosed(&escape_sequence); + assert_eq!(component.kind, UnicodeEscape); + assert_eq!(component.range, expected_range); + } + } + + #[test] + fn test_empty_char() { + let (has_closing_quote, components) = parse("''"); + assert!(has_closing_quote, "char should have closing quote"); + assert!(components.len() == 0); + } + + #[test] + fn test_unclosed_char() { + let component = unclosed_char_component("'a"); + assert!(component.kind == CodePoint); + assert!(component.range == TextRange::from_to(1.into(), 2.into())); + } + + #[test] + fn test_digit_escapes() { + let literals = &[r"", r"5", r"55"]; + + for literal in literals { + let lit_text = format!(r"'\x{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == AsciiCodeEscape); + assert!(component.range == range_closed(&lit_text)); + } + + // More than 2 digits starts a new codepoint + let components = closed_char_components(r"'\x555'"); + assert!(components.len() == 2); + assert!(components[1].kind == CodePoint); + } + + #[test] + fn test_ascii_escapes() { + let literals = &[ + r"\'", "\\\"", // equivalent to \" + r"\n", r"\r", r"\t", r"\\", r"\0", + ]; + + for literal in literals { + let lit_text = format!("'{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == AsciiEscape); + assert!(component.range == range_closed(&lit_text)); + } + } + + #[test] + fn test_no_escapes() { + let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; + + for &literal in literals { + let lit_text = format!("'{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == CodePoint); + assert!(component.range == range_closed(&lit_text)); + } + } +} diff --git a/crates/ra_syntax/src/string_lexing/parser.rs b/crates/ra_syntax/src/string_lexing/parser.rs deleted file mode 100644 index 7469eb903ed..00000000000 --- a/crates/ra_syntax/src/string_lexing/parser.rs +++ /dev/null @@ -1,168 +0,0 @@ -use rowan::{TextRange, TextUnit}; - -use self::StringComponentKind::*; - -pub struct Parser<'a> { - pub(super) quote: u8, - pub(super) src: &'a str, - pub(super) pos: usize, -} - -impl<'a> Parser<'a> { - pub fn new(src: &'a str, quote: u8) -> Parser<'a> { - Parser { quote, src, pos: 0 } - } - - // Utility methods - - pub fn peek(&self) -> Option { - if self.pos == self.src.len() { - return None; - } - - self.src[self.pos..].chars().next() - } - - pub fn advance(&mut self) -> char { - let next = self.peek().expect("cannot advance if end of input is reached"); - self.pos += next.len_utf8(); - next - } - - pub fn skip_whitespace(&mut self) { - while self.peek().map(|c| c.is_whitespace()) == Some(true) { - self.advance(); - } - } - - pub fn get_pos(&self) -> TextUnit { - (self.pos as u32).into() - } - - // Char parsing methods - - fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent { - match self.peek() { - Some('{') => { - self.advance(); - - // Parse anything until we reach `}` - while let Some(next) = self.peek() { - self.advance(); - if next == '}' { - break; - } - } - - let end = self.get_pos(); - StringComponent::new(TextRange::from_to(start, end), UnicodeEscape) - } - Some(_) | None => { - let end = self.get_pos(); - StringComponent::new(TextRange::from_to(start, end), UnicodeEscape) - } - } - } - - fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent { - let code_start = self.get_pos(); - while let Some(next) = self.peek() { - if next == '\'' || (self.get_pos() - code_start == 2.into()) { - break; - } - - self.advance(); - } - - let end = self.get_pos(); - StringComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) - } - - fn parse_escape(&mut self, start: TextUnit) -> StringComponent { - if self.peek().is_none() { - return StringComponent::new(TextRange::from_to(start, self.get_pos()), AsciiEscape); - } - - let next = self.advance(); - let end = self.get_pos(); - let range = TextRange::from_to(start, end); - match next { - 'x' => self.parse_ascii_code_escape(start), - 'u' => self.parse_unicode_escape(start), - _ => StringComponent::new(range, AsciiEscape), - } - } - - pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option { - // In string literals, when a `\` occurs immediately before the newline, the `\`, - // the newline, and all whitespace at the beginning of the next line are ignored - match self.peek() { - Some('\n') | Some('\r') => { - self.skip_whitespace(); - Some(StringComponent::new( - TextRange::from_to(start, self.get_pos()), - StringComponentKind::IgnoreNewline, - )) - } - _ => None, - } - } - - pub fn parse_component(&mut self) -> Option { - let next = self.peek()?; - - // Ignore string close - if next == self.quote as char { - return None; - } - - let start = self.get_pos(); - self.advance(); - - if next == '\\' { - // Strings can use `\` to ignore newlines, so we first try to parse one of those - // before falling back to parsing char escapes - if self.quote == b'"' { - if let Some(component) = self.parse_ignore_newline(start) { - return Some(component); - } - } - - Some(self.parse_escape(start)) - } else { - let end = self.get_pos(); - Some(StringComponent::new(TextRange::from_to(start, end), CodePoint)) - } - } - - pub fn parse_suffix(&mut self) -> Option { - let start = self.get_pos(); - let _ = self.peek()?; - while let Some(_) = self.peek() { - self.advance(); - } - let end = self.get_pos(); - Some(TextRange::from_to(start, end)) - } -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct StringComponent { - pub range: TextRange, - pub kind: StringComponentKind, -} - -impl StringComponent { - fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { - StringComponent { range, kind } - } -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub enum StringComponentKind { - IgnoreNewline, - CodePoint, - AsciiEscape, - AsciiCodeEscape, - UnicodeEscape, -} diff --git a/crates/ra_syntax/src/string_lexing/string.rs b/crates/ra_syntax/src/string_lexing/string.rs deleted file mode 100644 index a4742a0d1f8..00000000000 --- a/crates/ra_syntax/src/string_lexing/string.rs +++ /dev/null @@ -1,222 +0,0 @@ -use crate::{ - TextRange, - string_lexing::{ - parser::Parser, - StringComponent, -}}; - -pub fn parse_string_literal(src: &str) -> StringComponentIterator { - StringComponentIterator { - parser: Parser::new(src, b'"'), - has_closing_quote: false, - suffix: None, - prefix: None, - quote: b'"', - } -} - -pub fn parse_byte_string_literal(src: &str) -> StringComponentIterator { - StringComponentIterator { - parser: Parser::new(src, b'"'), - has_closing_quote: false, - suffix: None, - prefix: Some(b'b'), - quote: b'"', - } -} - -pub fn parse_char_literal(src: &str) -> StringComponentIterator { - StringComponentIterator { - parser: Parser::new(src, b'\''), - has_closing_quote: false, - suffix: None, - prefix: None, - quote: b'\'', - } -} - -pub fn parse_byte_literal(src: &str) -> StringComponentIterator { - StringComponentIterator { - parser: Parser::new(src, b'\''), - has_closing_quote: false, - suffix: None, - prefix: Some(b'b'), - quote: b'\'', - } -} - -pub struct StringComponentIterator<'a> { - parser: Parser<'a>, - pub has_closing_quote: bool, - pub suffix: Option, - prefix: Option, - quote: u8, -} - -impl<'a> Iterator for StringComponentIterator<'a> { - type Item = StringComponent; - fn next(&mut self) -> Option { - if self.parser.pos == 0 { - if let Some(prefix) = self.prefix { - assert!( - self.parser.advance() == prefix as char, - "literal should start with a {:?}", - prefix as char, - ); - } - assert!( - self.parser.advance() == self.quote as char, - "literal should start with a {:?}", - self.quote as char, - ); - } - - if let Some(component) = self.parser.parse_component() { - return Some(component); - } - - // We get here when there are no char components left to parse - if self.parser.peek() == Some(self.quote as char) { - self.parser.advance(); - self.has_closing_quote = true; - if let Some(range) = self.parser.parse_suffix() { - self.suffix = Some(range); - } - } - - assert!( - self.parser.peek() == None, - "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}", - self.parser.src, - self.parser.pos, - self.parser.src.len() - ); - - None - } -} - -#[cfg(test)] -mod tests { - use rowan::TextRange; - use crate::string_lexing::{ - StringComponent, - StringComponentKind::*, -}; - - fn parse(src: &str) -> (bool, Vec) { - let component_iterator = &mut super::parse_char_literal(src); - let components: Vec<_> = component_iterator.collect(); - (component_iterator.has_closing_quote, components) - } - - fn unclosed_char_component(src: &str) -> StringComponent { - let (has_closing_quote, components) = parse(src); - assert!(!has_closing_quote, "char should not have closing quote"); - assert!(components.len() == 1); - components[0].clone() - } - - fn closed_char_component(src: &str) -> StringComponent { - let (has_closing_quote, components) = parse(src); - assert!(has_closing_quote, "char should have closing quote"); - assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components); - components[0].clone() - } - - fn closed_char_components(src: &str) -> Vec { - let (has_closing_quote, components) = parse(src); - assert!(has_closing_quote, "char should have closing quote"); - components - } - - fn range_closed(src: &str) -> TextRange { - TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) - } - - fn range_unclosed(src: &str) -> TextRange { - TextRange::from_to(1.into(), (src.len() as u32).into()) - } - - #[test] - fn test_unicode_escapes() { - let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; - for escape in unicode_escapes { - let escape_sequence = format!(r"'\u{}'", escape); - let component = closed_char_component(&escape_sequence); - let expected_range = range_closed(&escape_sequence); - assert_eq!(component.kind, UnicodeEscape); - assert_eq!(component.range, expected_range); - } - } - - #[test] - fn test_unicode_escapes_unclosed() { - let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; - for escape in unicode_escapes { - let escape_sequence = format!(r"'\u{}'", escape); - let component = unclosed_char_component(&escape_sequence); - let expected_range = range_unclosed(&escape_sequence); - assert_eq!(component.kind, UnicodeEscape); - assert_eq!(component.range, expected_range); - } - } - - #[test] - fn test_empty_char() { - let (has_closing_quote, components) = parse("''"); - assert!(has_closing_quote, "char should have closing quote"); - assert!(components.len() == 0); - } - - #[test] - fn test_unclosed_char() { - let component = unclosed_char_component("'a"); - assert!(component.kind == CodePoint); - assert!(component.range == TextRange::from_to(1.into(), 2.into())); - } - - #[test] - fn test_digit_escapes() { - let literals = &[r"", r"5", r"55"]; - - for literal in literals { - let lit_text = format!(r"'\x{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == AsciiCodeEscape); - assert!(component.range == range_closed(&lit_text)); - } - - // More than 2 digits starts a new codepoint - let components = closed_char_components(r"'\x555'"); - assert!(components.len() == 2); - assert!(components[1].kind == CodePoint); - } - - #[test] - fn test_ascii_escapes() { - let literals = &[ - r"\'", "\\\"", // equivalent to \" - r"\n", r"\r", r"\t", r"\\", r"\0", - ]; - - for literal in literals { - let lit_text = format!("'{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == AsciiEscape); - assert!(component.range == range_closed(&lit_text)); - } - } - - #[test] - fn test_no_escapes() { - let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; - - for &literal in literals { - let lit_text = format!("'{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == CodePoint); - assert!(component.range == range_closed(&lit_text)); - } - } -} diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs index 290f80fc651..f653e65d033 100644 --- a/crates/ra_syntax/src/validation/byte.rs +++ b/crates/ra_syntax/src/validation/byte.rs @@ -12,7 +12,7 @@ use crate::{ pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec) { let literal_text = node.text(); let literal_range = node.range(); - let mut components = string_lexing::parse_byte_literal(literal_text); + let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text); let mut len = 0; for component in &mut components { len += 1; diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs index eae395e9d8b..1d48c2d9b16 100644 --- a/crates/ra_syntax/src/validation/byte_string.rs +++ b/crates/ra_syntax/src/validation/byte_string.rs @@ -10,7 +10,7 @@ use super::byte; pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec) { let literal_text = node.text(); let literal_range = node.range(); - let mut components = string_lexing::parse_byte_string_literal(literal_text); + let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text); for component in &mut components { let range = component.range + literal_range.start(); diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index a385accddf0..0f1885873b7 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs @@ -15,7 +15,7 @@ use crate::{ pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec) { let literal_text = node.text(); let literal_range = node.range(); - let mut components = string_lexing::parse_char_literal(literal_text); + let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text); let mut len = 0; for component in &mut components { len += 1; diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs index f7f5c02c07d..fc2f1b99212 100644 --- a/crates/ra_syntax/src/validation/string.rs +++ b/crates/ra_syntax/src/validation/string.rs @@ -10,7 +10,7 @@ use super::char; pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec) { let literal_text = node.text(); let literal_range = node.range(); - let mut components = string_lexing::parse_string_literal(literal_text); + let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text); for component in &mut components { let range = component.range + literal_range.start();