mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-28 01:34:21 +00:00
Auto merge of #62948 - matklad:failable-file-loading, r=petrochenkov
Normalize newlines when loading files Fixes #62865
This commit is contained in:
commit
ef1ecbefb8
@ -352,7 +352,6 @@ impl Cursor<'_> {
|
||||
loop {
|
||||
match self.nth_char(0) {
|
||||
'\n' => break,
|
||||
'\r' if self.nth_char(1) == '\n' => break,
|
||||
EOF_CHAR if self.is_eof() => break,
|
||||
_ => {
|
||||
self.bump();
|
||||
@ -525,7 +524,6 @@ impl Cursor<'_> {
|
||||
match self.nth_char(0) {
|
||||
'/' if !first => break,
|
||||
'\n' if self.nth_char(1) != '\'' => break,
|
||||
'\r' if self.nth_char(1) == '\n' => break,
|
||||
EOF_CHAR if self.is_eof() => break,
|
||||
'\'' => {
|
||||
self.bump();
|
||||
|
@ -128,11 +128,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
|
||||
if first_char != '\\' {
|
||||
return match first_char {
|
||||
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
|
||||
'\r' => Err(if chars.clone().next() == Some('\n') {
|
||||
EscapeError::EscapeOnlyChar
|
||||
} else {
|
||||
EscapeError::BareCarriageReturn
|
||||
}),
|
||||
'\r' => Err(EscapeError::BareCarriageReturn),
|
||||
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
|
||||
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
|
||||
_ => {
|
||||
@ -244,27 +240,15 @@ where
|
||||
|
||||
let unescaped_char = match first_char {
|
||||
'\\' => {
|
||||
let (second_char, third_char) = {
|
||||
let mut chars = chars.clone();
|
||||
(chars.next(), chars.next())
|
||||
};
|
||||
match (second_char, third_char) {
|
||||
(Some('\n'), _) | (Some('\r'), Some('\n')) => {
|
||||
let second_char = chars.clone().next();
|
||||
match second_char {
|
||||
Some('\n') => {
|
||||
skip_ascii_whitespace(&mut chars);
|
||||
continue;
|
||||
}
|
||||
_ => scan_escape(first_char, &mut chars, mode),
|
||||
}
|
||||
}
|
||||
'\r' => {
|
||||
let second_char = chars.clone().next();
|
||||
if second_char == Some('\n') {
|
||||
chars.next();
|
||||
Ok('\n')
|
||||
} else {
|
||||
scan_escape(first_char, &mut chars, mode)
|
||||
}
|
||||
}
|
||||
'\n' => Ok('\n'),
|
||||
'\t' => Ok('\t'),
|
||||
_ => scan_escape(first_char, &mut chars, mode),
|
||||
@ -298,15 +282,11 @@ where
|
||||
while let Some(curr) = chars.next() {
|
||||
let start = initial_len - chars.as_str().len() - curr.len_utf8();
|
||||
|
||||
let result = match (curr, chars.clone().next()) {
|
||||
('\r', Some('\n')) => {
|
||||
chars.next();
|
||||
Ok('\n')
|
||||
},
|
||||
('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
|
||||
(c, _) if mode.is_bytes() && !c.is_ascii() =>
|
||||
let result = match curr {
|
||||
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
|
||||
c if mode.is_bytes() && !c.is_ascii() =>
|
||||
Err(EscapeError::NonAsciiCharInByteString),
|
||||
(c, _) => Ok(c),
|
||||
c => Ok(c),
|
||||
};
|
||||
let end = initial_len - chars.as_str().len();
|
||||
|
||||
|
@ -11,7 +11,6 @@ fn test_unescape_char_bad() {
|
||||
check(r"\", EscapeError::LoneSlash);
|
||||
|
||||
check("\n", EscapeError::EscapeOnlyChar);
|
||||
check("\r\n", EscapeError::EscapeOnlyChar);
|
||||
check("\t", EscapeError::EscapeOnlyChar);
|
||||
check("'", EscapeError::EscapeOnlyChar);
|
||||
check("\r", EscapeError::BareCarriageReturn);
|
||||
@ -31,6 +30,7 @@ fn test_unescape_char_bad() {
|
||||
check(r"\v", EscapeError::InvalidEscape);
|
||||
check(r"\💩", EscapeError::InvalidEscape);
|
||||
check(r"\●", EscapeError::InvalidEscape);
|
||||
check("\\\r", EscapeError::InvalidEscape);
|
||||
|
||||
check(r"\x", EscapeError::TooShortHexEscape);
|
||||
check(r"\x0", EscapeError::TooShortHexEscape);
|
||||
@ -116,10 +116,9 @@ fn test_unescape_str_good() {
|
||||
|
||||
check("foo", "foo");
|
||||
check("", "");
|
||||
check(" \t\n\r\n", " \t\n\n");
|
||||
check(" \t\n", " \t\n");
|
||||
|
||||
check("hello \\\n world", "hello world");
|
||||
check("hello \\\r\n world", "hello world");
|
||||
check("thread's", "thread's")
|
||||
}
|
||||
|
||||
@ -134,7 +133,6 @@ fn test_unescape_byte_bad() {
|
||||
check(r"\", EscapeError::LoneSlash);
|
||||
|
||||
check("\n", EscapeError::EscapeOnlyChar);
|
||||
check("\r\n", EscapeError::EscapeOnlyChar);
|
||||
check("\t", EscapeError::EscapeOnlyChar);
|
||||
check("'", EscapeError::EscapeOnlyChar);
|
||||
check("\r", EscapeError::BareCarriageReturn);
|
||||
@ -238,10 +236,9 @@ fn test_unescape_byte_str_good() {
|
||||
|
||||
check("foo", b"foo");
|
||||
check("", b"");
|
||||
check(" \t\n\r\n", b" \t\n\n");
|
||||
check(" \t\n", b" \t\n");
|
||||
|
||||
check("hello \\\n world", b"hello world");
|
||||
check("hello \\\r\n world", b"hello world");
|
||||
check("thread's", b"thread's")
|
||||
}
|
||||
|
||||
@ -253,7 +250,6 @@ fn test_unescape_raw_str() {
|
||||
assert_eq!(unescaped, expected);
|
||||
}
|
||||
|
||||
check("\r\n", &[(0..2, Ok('\n'))]);
|
||||
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
|
||||
check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
|
||||
}
|
||||
@ -266,7 +262,6 @@ fn test_unescape_raw_byte_str() {
|
||||
assert_eq!(unescaped, expected);
|
||||
}
|
||||
|
||||
check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
|
||||
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
|
||||
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
|
||||
check(
|
||||
|
@ -8,9 +8,7 @@ use syntax_pos::{BytePos, Pos, Span};
|
||||
use rustc_lexer::Base;
|
||||
use rustc_lexer::unescape;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::char;
|
||||
use std::iter;
|
||||
use std::convert::TryInto;
|
||||
use rustc_data_structures::sync::Lrc;
|
||||
use log::debug;
|
||||
@ -181,18 +179,7 @@ impl<'a> StringReader<'a> {
|
||||
let string = self.str_from(start);
|
||||
// comments with only more "/"s are not doc comments
|
||||
let tok = if is_doc_comment(string) {
|
||||
let mut idx = 0;
|
||||
loop {
|
||||
idx = match string[idx..].find('\r') {
|
||||
None => break,
|
||||
Some(it) => idx + it + 1
|
||||
};
|
||||
if string[idx..].chars().next() != Some('\n') {
|
||||
self.err_span_(start + BytePos(idx as u32 - 1),
|
||||
start + BytePos(idx as u32),
|
||||
"bare CR not allowed in doc-comment");
|
||||
}
|
||||
}
|
||||
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
|
||||
token::DocComment(Symbol::intern(string))
|
||||
} else {
|
||||
token::Comment
|
||||
@ -217,15 +204,10 @@ impl<'a> StringReader<'a> {
|
||||
}
|
||||
|
||||
let tok = if is_doc_comment {
|
||||
let has_cr = string.contains('\r');
|
||||
let string = if has_cr {
|
||||
self.translate_crlf(start,
|
||||
string,
|
||||
"bare CR not allowed in block doc-comment")
|
||||
} else {
|
||||
string.into()
|
||||
};
|
||||
token::DocComment(Symbol::intern(&string[..]))
|
||||
self.forbid_bare_cr(start,
|
||||
string,
|
||||
"bare CR not allowed in block doc-comment");
|
||||
token::DocComment(Symbol::intern(string))
|
||||
} else {
|
||||
token::Comment
|
||||
};
|
||||
@ -516,49 +498,16 @@ impl<'a> StringReader<'a> {
|
||||
&self.src[self.src_index(start)..self.src_index(end)]
|
||||
}
|
||||
|
||||
/// Converts CRLF to LF in the given string, raising an error on bare CR.
|
||||
fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
|
||||
let mut chars = s.char_indices().peekable();
|
||||
while let Some((i, ch)) = chars.next() {
|
||||
if ch == '\r' {
|
||||
if let Some((lf_idx, '\n')) = chars.peek() {
|
||||
return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
|
||||
}
|
||||
let pos = start + BytePos(i as u32);
|
||||
let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
|
||||
self.err_span_(pos, end_pos, errmsg);
|
||||
}
|
||||
}
|
||||
return s.into();
|
||||
|
||||
fn translate_crlf_(rdr: &StringReader<'_>,
|
||||
start: BytePos,
|
||||
s: &str,
|
||||
mut j: usize,
|
||||
mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
|
||||
errmsg: &str)
|
||||
-> String {
|
||||
let mut buf = String::with_capacity(s.len());
|
||||
// Skip first CR
|
||||
buf.push_str(&s[.. j - 1]);
|
||||
while let Some((i, ch)) = chars.next() {
|
||||
if ch == '\r' {
|
||||
if j < i {
|
||||
buf.push_str(&s[j..i]);
|
||||
}
|
||||
let next = i + ch.len_utf8();
|
||||
j = next;
|
||||
if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
|
||||
let pos = start + BytePos(i as u32);
|
||||
let end_pos = start + BytePos(next as u32);
|
||||
rdr.err_span_(pos, end_pos, errmsg);
|
||||
}
|
||||
}
|
||||
}
|
||||
if j < s.len() {
|
||||
buf.push_str(&s[j..]);
|
||||
}
|
||||
buf
|
||||
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
|
||||
let mut idx = 0;
|
||||
loop {
|
||||
idx = match s[idx..].find('\r') {
|
||||
None => break,
|
||||
Some(it) => idx + it + 1
|
||||
};
|
||||
self.err_span_(start + BytePos(idx as u32 - 1),
|
||||
start + BytePos(idx as u32),
|
||||
errmsg);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1043,6 +1043,7 @@ impl SourceFile {
|
||||
mut src: String,
|
||||
start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
|
||||
remove_bom(&mut src);
|
||||
normalize_newlines(&mut src);
|
||||
|
||||
let src_hash = {
|
||||
let mut hasher: StableHasher<u128> = StableHasher::new();
|
||||
@ -1210,6 +1211,61 @@ fn remove_bom(src: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Replaces every `\r\n` sequence in `src` with a single `\n`, in place.
///
/// A `\r` that is not immediately followed by `\n` is left untouched. The
/// function returns nothing and never reports an error for such lone
/// carriage returns.
fn normalize_newlines(src: &mut String) {
    // Fast path: without any `\r` there is nothing to rewrite.
    if !src.as_bytes().contains(&b'\r') {
        return;
    }

    // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
    // While we *can* call `as_mut_vec` and do surgery on the live string
    // directly, let's rather steal the contents of `src`. This makes the code
    // safe even if a panic occurs: `src` is then simply left empty.
    let mut buf = std::mem::replace(src, String::new()).into_bytes();

    // `gap_len` is the number of `\r` bytes dropped so far. `tail` always
    // starts at the current write position, so `tail[..gap_len]` is stale
    // space and the not-yet-processed bytes begin at `tail[gap_len..]`.
    let mut gap_len = 0;
    let mut tail = buf.as_mut_slice();
    loop {
        // Index (within `tail`) of the next `\r` that starts a `\r\n` pair,
        // or the end of the buffer if there is none left.
        let idx = match find_crlf(&tail[gap_len..]) {
            None => tail.len(),
            Some(idx) => idx + gap_len,
        };
        // Slide the chunk before that `\r` left over the gap.
        tail.copy_within(gap_len..idx, 0);
        tail = &mut tail[idx - gap_len..];
        // Only stale gap bytes remain: everything has been processed.
        if tail.len() == gap_len {
            break;
        }
        // The `\r` just reached joins the gap; scanning resumes at its `\n`.
        gap_len += 1;
    }

    // Account for removed `\r`: drop the stale bytes left at the end.
    let new_len = buf.len() - gap_len;
    buf.truncate(new_len);

    // SAFETY: `buf` held valid utf-8 on entry, and the only change made was
    // deleting whole `\r` bytes. `\r` is ASCII, so it is never part of a
    // multi-byte sequence and removing it cannot split or corrupt one.
    *src = unsafe { String::from_utf8_unchecked(buf) };

    /// Returns the index of the first `\r` in `src` that is directly followed
    /// by `\n`, skipping over any lone `\r` on the way.
    fn find_crlf(src: &[u8]) -> Option<usize> {
        let mut search_idx = 0;
        while let Some(idx) = find_cr(&src[search_idx..]) {
            if src[search_idx..].get(idx + 1) != Some(&b'\n') {
                // Lone `\r`: keep searching right after it.
                search_idx += idx + 1;
                continue;
            }
            return Some(search_idx + idx);
        }
        None
    }

    /// Returns the index of the first `\r` byte in `src`, if any.
    fn find_cr(src: &[u8]) -> Option<usize> {
        src.iter().position(|&b| b == b'\r')
    }
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
// Pos, BytePos, CharPos
|
||||
//
|
||||
|
@ -16,3 +16,23 @@ fn test_lookup_line() {
|
||||
assert_eq!(lookup_line(lines, BytePos(28)), 2);
|
||||
assert_eq!(lookup_line(lines, BytePos(29)), 2);
|
||||
}
|
||||
|
||||
/// Table-driven check of `normalize_newlines`: each pair is the input text
/// and the text expected after normalization. Only `\r\n` pairs collapse to
/// `\n`; a `\r` without a following `\n` is expected to survive unchanged.
#[test]
fn test_normalize_newlines() {
    const CASES: &[(&str, &str)] = &[
        ("", ""),
        ("\n", "\n"),
        ("\r", "\r"),
        ("\r\r", "\r\r"),
        ("\r\n", "\n"),
        ("hello world", "hello world"),
        ("hello\nworld", "hello\nworld"),
        ("hello\r\nworld", "hello\nworld"),
        ("\r\nhello\r\nworld\r\n", "\nhello\nworld\n"),
        ("\r\r\n", "\r\n"),
        ("hello\rworld", "hello\rworld"),
    ];

    for &(before, after) in CASES {
        let mut actual = String::from(before);
        normalize_newlines(&mut actual);
        assert_eq!(actual.as_str(), after);
    }
}
|
||||
|
Loading…
Reference in New Issue
Block a user