auto merge of #12086 : huonw/rust/safe-json, r=kballard

The lexer and the JSON parser were using `transmute(-1): char` as a sentinel value for EOF, which is invalid since a `char` must always be a valid Unicode code point.

Fixing this allows range asserts to be emitted when loading a `char`, since valid values always lie between 0 and 0x10FFFF.
This commit is contained in:
bors 2014-02-08 00:26:30 -08:00
commit 95483e30a2
4 changed files with 298 additions and 253 deletions

View File

@ -229,7 +229,6 @@ fn main() {
*/
use std::char;
use std::cast::transmute;
use std::f64;
use std::hashmap::HashMap;
use std::io;
@ -718,7 +717,7 @@ impl Json {
pub struct Parser<T> {
priv rdr: T,
priv ch: char,
priv ch: Option<char>,
priv line: uint,
priv col: uint,
}
@ -728,7 +727,7 @@ impl<T: Iterator<char>> Parser<T> {
pub fn new(rdr: T) -> Parser<T> {
let mut p = Parser {
rdr: rdr,
ch: '\x00',
ch: Some('\x00'),
line: 1,
col: 0,
};
@ -756,16 +755,12 @@ impl<T: Iterator<char>> Parser<T> {
}
impl<T : Iterator<char>> Parser<T> {
// FIXME: #8971: unsound
fn eof(&self) -> bool { self.ch == unsafe { transmute(-1u32) } }
/// Reports whether the parser has run out of input, i.e. whether the
/// current character slot `ch` holds `None`.
fn eof(&self) -> bool {
    match self.ch {
        None => true,
        Some(_) => false
    }
}
/// Returns the current character, substituting `'\x00'` when `ch` is
/// `None`, so callers can `match` on a plain `char`.
fn ch_or_null(&self) -> char {
    match self.ch {
        Some(current) => current,
        None => '\x00'
    }
}
fn bump(&mut self) {
match self.rdr.next() {
Some(ch) => self.ch = ch,
None() => self.ch = unsafe { transmute(-1u32) }, // FIXME: #8971: unsound
}
self.ch = self.rdr.next();
if self.ch == '\n' {
if self.ch_is('\n') {
self.line += 1u;
self.col = 1u;
} else {
@ -773,10 +768,13 @@ impl<T : Iterator<char>> Parser<T> {
}
}
fn next_char(&mut self) -> char {
/// Advances the parser by one character via `bump` and returns the
/// character slot as it stands afterwards.
fn next_char(&mut self) -> Option<char> {
    self.bump();
    let current = self.ch;
    current
}
/// Reports whether the current character is exactly `c`.
/// Always false when `ch` is `None`.
fn ch_is(&self, c: char) -> bool {
    match self.ch {
        Some(current) => current == c,
        None => false
    }
}
fn error<T>(&self, msg: ~str) -> Result<T, Error> {
Err(Error { line: self.line, col: self.col, msg: msg })
@ -787,31 +785,32 @@ impl<T : Iterator<char>> Parser<T> {
if self.eof() { return self.error(~"EOF while parsing value"); }
match self.ch {
'n' => self.parse_ident("ull", Null),
't' => self.parse_ident("rue", Boolean(true)),
'f' => self.parse_ident("alse", Boolean(false)),
'0' .. '9' | '-' => self.parse_number(),
'"' =>
match self.parse_str() {
Ok(s) => Ok(String(s)),
Err(e) => Err(e),
match self.ch_or_null() {
'n' => self.parse_ident("ull", Null),
't' => self.parse_ident("rue", Boolean(true)),
'f' => self.parse_ident("alse", Boolean(false)),
'0' .. '9' | '-' => self.parse_number(),
'"' => {
match self.parse_str() {
Ok(s) => Ok(String(s)),
Err(e) => Err(e),
}
},
'[' => self.parse_list(),
'{' => self.parse_object(),
_ => self.error(~"invalid syntax")
'[' => self.parse_list(),
'{' => self.parse_object(),
_ => self.error(~"invalid syntax"),
}
}
fn parse_whitespace(&mut self) {
while self.ch == ' ' ||
self.ch == '\n' ||
self.ch == '\t' ||
self.ch == '\r' { self.bump(); }
while self.ch_is(' ') ||
self.ch_is('\n') ||
self.ch_is('\t') ||
self.ch_is('\r') { self.bump(); }
}
fn parse_ident(&mut self, ident: &str, value: Json) -> Result<Json, Error> {
if ident.chars().all(|c| c == self.next_char()) {
if ident.chars().all(|c| Some(c) == self.next_char()) {
self.bump();
Ok(value)
} else {
@ -822,7 +821,7 @@ impl<T : Iterator<char>> Parser<T> {
fn parse_number(&mut self) -> Result<Json, Error> {
let mut neg = 1.0;
if self.ch == '-' {
if self.ch_is('-') {
self.bump();
neg = -1.0;
}
@ -832,14 +831,14 @@ impl<T : Iterator<char>> Parser<T> {
Err(e) => return Err(e)
};
if self.ch == '.' {
if self.ch_is('.') {
match self.parse_decimal(res) {
Ok(r) => res = r,
Err(e) => return Err(e)
}
}
if self.ch == 'e' || self.ch == 'E' {
if self.ch_is('e') || self.ch_is('E') {
match self.parse_exponent(res) {
Ok(r) => res = r,
Err(e) => return Err(e)
@ -852,32 +851,31 @@ impl<T : Iterator<char>> Parser<T> {
fn parse_integer(&mut self) -> Result<f64, Error> {
let mut res = 0.0;
match self.ch {
'0' => {
self.bump();
match self.ch_or_null() {
'0' => {
self.bump();
// There can be only one leading '0'.
match self.ch {
'0' .. '9' => return self.error(~"invalid number"),
_ => ()
}
}
'1' .. '9' => {
while !self.eof() {
match self.ch {
'0' .. '9' => {
res *= 10.0;
res += ((self.ch as int) - ('0' as int)) as f64;
// There can be only one leading '0'.
match self.ch_or_null() {
'0' .. '9' => return self.error(~"invalid number"),
_ => ()
}
},
'1' .. '9' => {
while !self.eof() {
match self.ch_or_null() {
c @ '0' .. '9' => {
res *= 10.0;
res += ((c as int) - ('0' as int)) as f64;
self.bump();
}
_ => break
self.bump();
}
_ => break,
}
}
}
}
_ => return self.error(~"invalid number")
_ => return self.error(~"invalid number"),
}
Ok(res)
}
@ -885,22 +883,22 @@ impl<T : Iterator<char>> Parser<T> {
self.bump();
// Make sure a digit follows the decimal place.
match self.ch {
'0' .. '9' => (),
_ => return self.error(~"invalid number")
match self.ch_or_null() {
'0' .. '9' => (),
_ => return self.error(~"invalid number")
}
let mut res = res;
let mut dec = 1.0;
while !self.eof() {
match self.ch {
'0' .. '9' => {
dec /= 10.0;
res += (((self.ch as int) - ('0' as int)) as f64) * dec;
match self.ch_or_null() {
c @ '0' .. '9' => {
dec /= 10.0;
res += (((c as int) - ('0' as int)) as f64) * dec;
self.bump();
}
_ => break
self.bump();
}
_ => break,
}
}
@ -913,27 +911,27 @@ impl<T : Iterator<char>> Parser<T> {
let mut exp = 0u;
let mut neg_exp = false;
match self.ch {
'+' => self.bump(),
'-' => { self.bump(); neg_exp = true; }
_ => ()
if self.ch_is('+') {
self.bump();
} else if self.ch_is('-') {
self.bump();
neg_exp = true;
}
// Make sure a digit follows the exponent place.
match self.ch {
'0' .. '9' => (),
_ => return self.error(~"invalid number")
match self.ch_or_null() {
'0' .. '9' => (),
_ => return self.error(~"invalid number")
}
while !self.eof() {
match self.ch {
'0' .. '9' => {
exp *= 10u;
exp += (self.ch as uint) - ('0' as uint);
match self.ch_or_null() {
c @ '0' .. '9' => {
exp *= 10;
exp += (c as uint) - ('0' as uint);
self.bump();
}
_ => break
self.bump();
}
_ => break
}
}
@ -958,56 +956,55 @@ impl<T : Iterator<char>> Parser<T> {
}
if escape {
match self.ch {
'"' => res.push_char('"'),
'\\' => res.push_char('\\'),
'/' => res.push_char('/'),
'b' => res.push_char('\x08'),
'f' => res.push_char('\x0c'),
'n' => res.push_char('\n'),
'r' => res.push_char('\r'),
't' => res.push_char('\t'),
'u' => {
// Parse \u1234.
let mut i = 0u;
let mut n = 0u;
while i < 4u {
match self.next_char() {
'0' .. '9' => {
n = n * 16u + (self.ch as uint)
- ('0' as uint);
},
'a' | 'A' => n = n * 16u + 10u,
'b' | 'B' => n = n * 16u + 11u,
'c' | 'C' => n = n * 16u + 12u,
'd' | 'D' => n = n * 16u + 13u,
'e' | 'E' => n = n * 16u + 14u,
'f' | 'F' => n = n * 16u + 15u,
_ => return self.error(
~"invalid \\u escape (unrecognized hex)")
}
i += 1u;
}
match self.ch_or_null() {
'"' => res.push_char('"'),
'\\' => res.push_char('\\'),
'/' => res.push_char('/'),
'b' => res.push_char('\x08'),
'f' => res.push_char('\x0c'),
'n' => res.push_char('\n'),
'r' => res.push_char('\r'),
't' => res.push_char('\t'),
'u' => {
// Parse \u1234.
let mut i = 0u;
let mut n = 0u;
while i < 4u && !self.eof() {
self.bump();
n = match self.ch_or_null() {
c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint),
'a' | 'A' => n * 16u + 10u,
'b' | 'B' => n * 16u + 11u,
'c' | 'C' => n * 16u + 12u,
'd' | 'D' => n * 16u + 13u,
'e' | 'E' => n * 16u + 14u,
'f' | 'F' => n * 16u + 15u,
_ => return self.error(
~"invalid \\u escape (unrecognized hex)")
};
// Error out if we didn't parse 4 digits.
if i != 4u {
return self.error(
~"invalid \\u escape (not four digits)");
}
i += 1u;
}
res.push_char(char::from_u32(n as u32).unwrap());
}
_ => return self.error(~"invalid escape")
// Error out if we didn't parse 4 digits.
if i != 4u {
return self.error(
~"invalid \\u escape (not four digits)");
}
res.push_char(char::from_u32(n as u32).unwrap());
}
_ => return self.error(~"invalid escape"),
}
escape = false;
} else if self.ch == '\\' {
} else if self.ch_is('\\') {
escape = true;
} else {
if self.ch == '"' {
self.bump();
return Ok(res);
match self.ch {
Some('"') => { self.bump(); return Ok(res); },
Some(c) => res.push_char(c),
None => unreachable!()
}
res.push_char(self.ch);
}
}
}
@ -1018,7 +1015,7 @@ impl<T : Iterator<char>> Parser<T> {
let mut values = ~[];
if self.ch == ']' {
if self.ch_is(']') {
self.bump();
return Ok(List(values));
}
@ -1034,10 +1031,13 @@ impl<T : Iterator<char>> Parser<T> {
return self.error(~"EOF while parsing list");
}
match self.ch {
',' => self.bump(),
']' => { self.bump(); return Ok(List(values)); }
_ => return self.error(~"expected `,` or `]`")
if self.ch_is(',') {
self.bump();
} else if self.ch_is(']') {
self.bump();
return Ok(List(values));
} else {
return self.error(~"expected `,` or `]`")
}
};
}
@ -1048,7 +1048,7 @@ impl<T : Iterator<char>> Parser<T> {
let mut values = ~TreeMap::new();
if self.ch == '}' {
if self.ch_is('}') {
self.bump();
return Ok(Object(values));
}
@ -1056,7 +1056,7 @@ impl<T : Iterator<char>> Parser<T> {
while !self.eof() {
self.parse_whitespace();
if self.ch != '"' {
if !self.ch_is('"') {
return self.error(~"key must be a string");
}
@ -1067,7 +1067,7 @@ impl<T : Iterator<char>> Parser<T> {
self.parse_whitespace();
if self.ch != ':' {
if !self.ch_is(':') {
if self.eof() { break; }
return self.error(~"expected `:`");
}
@ -1079,13 +1079,13 @@ impl<T : Iterator<char>> Parser<T> {
}
self.parse_whitespace();
match self.ch {
',' => self.bump(),
'}' => { self.bump(); return Ok(Object(values)); }
_ => {
if self.eof() { break; }
return self.error(~"expected `,` or `}`");
}
match self.ch_or_null() {
',' => self.bump(),
'}' => { self.bump(); return Ok(Object(values)); },
_ => {
if self.eof() { break; }
return self.error(~"expected `,` or `}`");
}
}
}

View File

@ -545,7 +545,11 @@ fn load<'a>(bcx: &'a Block<'a>, llptr: ValueRef, ty: ty::t) -> ValueRef {
if type_is_zero_size(bcx.ccx(), ty) {
C_undef(type_of::type_of(bcx.ccx(), ty))
} else if ty::type_is_bool(ty) {
LoadRangeAssert(bcx, llptr, 0, 2, lib::llvm::True)
LoadRangeAssert(bcx, llptr, 0, 2, lib::llvm::False)
} else if ty::type_is_char(ty) {
// a char is a unicode codepoint, and so takes values from 0
// to 0x10FFFF inclusive only.
LoadRangeAssert(bcx, llptr, 0, 0x10FFFF + 1, lib::llvm::False)
} else {
Load(bcx, llptr)
}

View File

@ -12,7 +12,7 @@ use ast;
use codemap::{BytePos, CharPos, CodeMap, Pos};
use diagnostic;
use parse::lexer::{is_whitespace, with_str_from, Reader};
use parse::lexer::{StringReader, bump, is_eof, nextch, TokenAndSpan};
use parse::lexer::{StringReader, bump, is_eof, nextch_is, TokenAndSpan};
use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment};
use parse::lexer;
use parse::token;
@ -136,11 +136,11 @@ pub fn strip_doc_comment_decoration(comment: &str) -> ~str {
fn read_to_eol(rdr: &StringReader) -> ~str {
let mut val = ~"";
while rdr.curr.get() != '\n' && !is_eof(rdr) {
val.push_char(rdr.curr.get());
while !rdr.curr_is('\n') && !is_eof(rdr) {
val.push_char(rdr.curr.get().unwrap());
bump(rdr);
}
if rdr.curr.get() == '\n' { bump(rdr); }
if rdr.curr_is('\n') { bump(rdr); }
return val;
}
@ -152,7 +152,7 @@ fn read_one_line_comment(rdr: &StringReader) -> ~str {
}
fn consume_non_eol_whitespace(rdr: &StringReader) {
while is_whitespace(rdr.curr.get()) && rdr.curr.get() != '\n' &&
while is_whitespace(rdr.curr.get()) && !rdr.curr_is('\n') &&
!is_eof(rdr) {
bump(rdr);
}
@ -171,7 +171,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut ~[Comment]) {
fn consume_whitespace_counting_blank_lines(rdr: &StringReader,
comments: &mut ~[Comment]) {
while is_whitespace(rdr.curr.get()) && !is_eof(rdr) {
if rdr.col.get() == CharPos(0u) && rdr.curr.get() == '\n' {
if rdr.col.get() == CharPos(0u) && rdr.curr_is('\n') {
push_blank_line_comment(rdr, &mut *comments);
}
bump(rdr);
@ -196,7 +196,7 @@ fn read_line_comments(rdr: &StringReader, code_to_the_left: bool,
debug!(">>> line comments");
let p = rdr.last_pos.get();
let mut lines: ~[~str] = ~[];
while rdr.curr.get() == '/' && nextch(rdr) == '/' {
while rdr.curr_is('/') && nextch_is(rdr, '/') {
let line = read_one_line_comment(rdr);
debug!("{}", line);
if is_doc_comment(line) { // doc-comments are not put in comments
@ -261,9 +261,9 @@ fn read_block_comment(rdr: &StringReader,
let mut curr_line = ~"/*";
// doc-comments are not really comments, they are attributes
if rdr.curr.get() == '*' || rdr.curr.get() == '!' {
while !(rdr.curr.get() == '*' && nextch(rdr) == '/') && !is_eof(rdr) {
curr_line.push_char(rdr.curr.get());
if rdr.curr_is('*') || rdr.curr_is('!') {
while !(rdr.curr_is('*') && nextch_is(rdr, '/')) && !is_eof(rdr) {
curr_line.push_char(rdr.curr.get().unwrap());
bump(rdr);
}
if !is_eof(rdr) {
@ -281,20 +281,20 @@ fn read_block_comment(rdr: &StringReader,
if is_eof(rdr) {
rdr.fatal(~"unterminated block comment");
}
if rdr.curr.get() == '\n' {
if rdr.curr_is('\n') {
trim_whitespace_prefix_and_push_line(&mut lines, curr_line,
col);
curr_line = ~"";
bump(rdr);
} else {
curr_line.push_char(rdr.curr.get());
if rdr.curr.get() == '/' && nextch(rdr) == '*' {
curr_line.push_char(rdr.curr.get().unwrap());
if rdr.curr_is('/') && nextch_is(rdr, '*') {
bump(rdr);
bump(rdr);
curr_line.push_char('*');
level += 1;
} else {
if rdr.curr.get() == '*' && nextch(rdr) == '/' {
if rdr.curr_is('*') && nextch_is(rdr, '/') {
bump(rdr);
bump(rdr);
curr_line.push_char('/');
@ -310,7 +310,7 @@ fn read_block_comment(rdr: &StringReader,
let mut style = if code_to_the_left { Trailing } else { Isolated };
consume_non_eol_whitespace(rdr);
if !is_eof(rdr) && rdr.curr.get() != '\n' && lines.len() == 1u {
if !is_eof(rdr) && !rdr.curr_is('\n') && lines.len() == 1u {
style = Mixed;
}
debug!("<<< block comment");
@ -318,20 +318,20 @@ fn read_block_comment(rdr: &StringReader,
}
fn peeking_at_comment(rdr: &StringReader) -> bool {
return ((rdr.curr.get() == '/' && nextch(rdr) == '/') ||
(rdr.curr.get() == '/' && nextch(rdr) == '*')) ||
(rdr.curr.get() == '#' && nextch(rdr) == '!');
return (rdr.curr_is('/') && nextch_is(rdr, '/')) ||
(rdr.curr_is('/') && nextch_is(rdr, '*')) ||
(rdr.curr_is('#') && nextch_is(rdr, '!'));
}
fn consume_comment(rdr: &StringReader,
code_to_the_left: bool,
comments: &mut ~[Comment]) {
debug!(">>> consume comment");
if rdr.curr.get() == '/' && nextch(rdr) == '/' {
if rdr.curr_is('/') && nextch_is(rdr, '/') {
read_line_comments(rdr, code_to_the_left, comments);
} else if rdr.curr.get() == '/' && nextch(rdr) == '*' {
} else if rdr.curr_is('/') && nextch_is(rdr, '*') {
read_block_comment(rdr, code_to_the_left, comments);
} else if rdr.curr.get() == '#' && nextch(rdr) == '!' {
} else if rdr.curr_is('#') && nextch_is(rdr, '!') {
read_shebang_comment(rdr, code_to_the_left, comments);
} else { fail!(); }
debug!("<<< consume comment");
@ -363,7 +363,7 @@ pub fn gather_comments_and_literals(span_diagnostic:
loop {
let mut code_to_the_left = !first_read;
consume_non_eol_whitespace(&rdr);
if rdr.curr.get() == '\n' {
if rdr.curr_is('\n') {
code_to_the_left = false;
consume_whitespace_counting_blank_lines(&rdr, &mut comments);
}

View File

@ -16,7 +16,6 @@ use ext::tt::transcribe::{dup_tt_reader, tt_next_token};
use parse::token;
use parse::token::{str_to_ident};
use std::cast::transmute;
use std::cell::{Cell, RefCell};
use std::char;
use std::num::from_str_radix;
@ -48,13 +47,19 @@ pub struct StringReader {
// The column of the next character to read
col: Cell<CharPos>,
// The last character to be read
curr: Cell<char>,
curr: Cell<Option<char>>,
filemap: @codemap::FileMap,
/* cached: */
peek_tok: RefCell<token::Token>,
peek_span: RefCell<Span>,
}
impl StringReader {
    /// Reports whether the reader's current character (`curr`) is exactly
    /// `c`. Always false when `curr` holds `None`.
    pub fn curr_is(&self, c: char) -> bool {
        match self.curr.get() {
            Some(current) => current == c,
            None => false
        }
    }
}
pub fn new_string_reader(span_diagnostic: @SpanHandler,
filemap: @codemap::FileMap)
-> StringReader {
@ -74,7 +79,7 @@ pub fn new_low_level_string_reader(span_diagnostic: @SpanHandler,
pos: Cell::new(filemap.start_pos),
last_pos: Cell::new(filemap.start_pos),
col: Cell::new(CharPos(0)),
curr: Cell::new(initial_char),
curr: Cell::new(Some(initial_char)),
filemap: filemap,
/* dummy values; not read */
peek_tok: RefCell::new(token::EOF),
@ -246,14 +251,12 @@ pub fn bump(rdr: &StringReader) {
rdr.last_pos.set(rdr.pos.get());
let current_byte_offset = byte_offset(rdr, rdr.pos.get()).to_uint();
if current_byte_offset < (rdr.filemap.src).len() {
assert!(rdr.curr.get() != unsafe {
transmute(-1u32)
}); // FIXME: #8971: unsound
let last_char = rdr.curr.get();
assert!(rdr.curr.get().is_some());
let last_char = rdr.curr.get().unwrap();
let next = rdr.filemap.src.char_range_at(current_byte_offset);
let byte_offset_diff = next.next - current_byte_offset;
rdr.pos.set(rdr.pos.get() + Pos::from_uint(byte_offset_diff));
rdr.curr.set(next.ch);
rdr.curr.set(Some(next.ch));
rdr.col.set(rdr.col.get() + CharPos(1u));
if last_char == '\n' {
rdr.filemap.next_line(rdr.last_pos.get());
@ -265,37 +268,50 @@ pub fn bump(rdr: &StringReader) {
Pos::from_uint(current_byte_offset), byte_offset_diff);
}
} else {
rdr.curr.set(unsafe { transmute(-1u32) }); // FIXME: #8971: unsound
rdr.curr.set(None);
}
}
pub fn is_eof(rdr: &StringReader) -> bool {
rdr.curr.get() == unsafe { transmute(-1u32) } // FIXME: #8971: unsound
rdr.curr.get().is_none()
}
pub fn nextch(rdr: &StringReader) -> char {
pub fn nextch(rdr: &StringReader) -> Option<char> {
let offset = byte_offset(rdr, rdr.pos.get()).to_uint();
if offset < (rdr.filemap.src).len() {
return rdr.filemap.src.char_at(offset);
} else { return unsafe { transmute(-1u32) }; } // FIXME: #8971: unsound
Some(rdr.filemap.src.char_at(offset))
} else {
None
}
}
/// Reports whether the character returned by `nextch(rdr)` is exactly `c`.
/// A `None` result from `nextch` never matches.
pub fn nextch_is(rdr: &StringReader, c: char) -> bool {
    match nextch(rdr) {
        Some(upcoming) => upcoming == c,
        None => false
    }
}
fn hex_digit_val(c: char) -> int {
if in_range(c, '0', '9') { return (c as int) - ('0' as int); }
if in_range(c, 'a', 'f') { return (c as int) - ('a' as int) + 10; }
if in_range(c, 'A', 'F') { return (c as int) - ('A' as int) + 10; }
/// Converts a hexadecimal digit (`0-9`, `a-f`, `A-F`) to its numeric value.
///
/// Fails if `c` is `None` or is not a hexadecimal digit.
fn hex_digit_val(c: Option<char>) -> int {
    // `None` maps to NUL, which matches none of the ranges below.
    let digit = match c {
        Some(ch) => ch,
        None => '\x00'
    };
    let code = digit as int;
    if in_range(c, '0', '9') {
        code - ('0' as int)
    } else if in_range(c, 'a', 'f') {
        code - ('a' as int) + 10
    } else if in_range(c, 'A', 'F') {
        code - ('A' as int) + 10
    } else {
        fail!()
    }
}
pub fn is_whitespace(c: char) -> bool {
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
/// Reports whether `c` is a space, tab, carriage return or newline.
pub fn is_whitespace(c: Option<char>) -> bool {
    match c {
        Some(' ') | Some('\t') | Some('\r') | Some('\n') => true,
        // `None` is never whitespace, and neither is any other character.
        _ => false
    }
}
fn in_range(c: char, lo: char, hi: char) -> bool {
return lo <= c && c <= hi
/// Reports whether `c` is a character within the inclusive range
/// `lo ..= hi`. `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    match c {
        None => false,
        Some(ch) => ch >= lo && ch <= hi
    }
}
fn is_dec_digit(c: char) -> bool { return in_range(c, '0', '9'); }
/// Reports whether `c` is a decimal digit (`'0'` through `'9'`).
fn is_dec_digit(c: Option<char>) -> bool {
    in_range(c, '0', '9')
}
fn is_hex_digit(c: char) -> bool {
/// Reports whether `c` is a hexadecimal digit: `0-9`, `a-f` or `A-F`.
fn is_hex_digit(c: Option<char>) -> bool {
    let decimal = in_range(c, '0', '9');
    decimal || in_range(c, 'a', 'f') || in_range(c, 'A', 'F')
}
@ -317,15 +333,15 @@ pub fn is_line_non_doc_comment(s: &str) -> bool {
// returns a Some(sugared-doc-attr) if one exists, None otherwise
fn consume_any_line_comment(rdr: &StringReader)
-> Option<TokenAndSpan> {
if rdr.curr.get() == '/' {
if rdr.curr_is('/') {
match nextch(rdr) {
'/' => {
Some('/') => {
bump(rdr);
bump(rdr);
// line comments starting with "///" or "//!" are doc-comments
if rdr.curr.get() == '/' || rdr.curr.get() == '!' {
if rdr.curr_is('/') || rdr.curr_is('!') {
let start_bpos = rdr.pos.get() - BytePos(3);
while rdr.curr.get() != '\n' && !is_eof(rdr) {
while !rdr.curr_is('\n') && !is_eof(rdr) {
bump(rdr);
}
let ret = with_str_from(rdr, start_bpos, |string| {
@ -344,16 +360,16 @@ fn consume_any_line_comment(rdr: &StringReader)
return ret;
}
} else {
while rdr.curr.get() != '\n' && !is_eof(rdr) { bump(rdr); }
while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); }
}
// Restart whitespace munch.
return consume_whitespace_and_comments(rdr);
}
'*' => { bump(rdr); bump(rdr); return consume_block_comment(rdr); }
Some('*') => { bump(rdr); bump(rdr); return consume_block_comment(rdr); }
_ => ()
}
} else if rdr.curr.get() == '#' {
if nextch(rdr) == '!' {
} else if rdr.curr_is('#') {
if nextch_is(rdr, '!') {
// I guess this is the only way to figure out if
// we're at the beginning of the file...
let cmap = @CodeMap::new();
@ -363,7 +379,7 @@ fn consume_any_line_comment(rdr: &StringReader)
}
let loc = cmap.lookup_char_pos_adj(rdr.last_pos.get());
if loc.line == 1u && loc.col == CharPos(0u) {
while rdr.curr.get() != '\n' && !is_eof(rdr) { bump(rdr); }
while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); }
return consume_whitespace_and_comments(rdr);
}
}
@ -378,7 +394,7 @@ pub fn is_block_non_doc_comment(s: &str) -> bool {
// might return a sugared-doc-attr
fn consume_block_comment(rdr: &StringReader) -> Option<TokenAndSpan> {
// block comments starting with "/**" or "/*!" are doc-comments
let is_doc_comment = rdr.curr.get() == '*' || rdr.curr.get() == '!';
let is_doc_comment = rdr.curr_is('*') || rdr.curr_is('!');
let start_bpos = rdr.pos.get() - BytePos(if is_doc_comment {3} else {2});
let mut level: int = 1;
@ -390,11 +406,11 @@ fn consume_block_comment(rdr: &StringReader) -> Option<TokenAndSpan> {
~"unterminated block comment"
};
fatal_span(rdr, start_bpos, rdr.last_pos.get(), msg);
} else if rdr.curr.get() == '/' && nextch(rdr) == '*' {
} else if rdr.curr_is('/') && nextch_is(rdr, '*') {
level += 1;
bump(rdr);
bump(rdr);
} else if rdr.curr.get() == '*' && nextch(rdr) == '/' {
} else if rdr.curr_is('*') && nextch_is(rdr, '/') {
level -= 1;
bump(rdr);
bump(rdr);
@ -424,12 +440,13 @@ fn consume_block_comment(rdr: &StringReader) -> Option<TokenAndSpan> {
}
fn scan_exponent(rdr: &StringReader, start_bpos: BytePos) -> Option<~str> {
let mut c = rdr.curr.get();
// \x00 hits the `return None` case immediately, so this is fine.
let mut c = rdr.curr.get().unwrap_or('\x00');
let mut rslt = ~"";
if c == 'e' || c == 'E' {
rslt.push_char(c);
bump(rdr);
c = rdr.curr.get();
c = rdr.curr.get().unwrap_or('\x00');
if c == '-' || c == '+' {
rslt.push_char(c);
bump(rdr);
@ -448,10 +465,10 @@ fn scan_digits(rdr: &StringReader, radix: uint) -> ~str {
let mut rslt = ~"";
loop {
let c = rdr.curr.get();
if c == '_' { bump(rdr); continue; }
match char::to_digit(c, radix) {
if c == Some('_') { bump(rdr); continue; }
match c.and_then(|cc| char::to_digit(cc, radix)) {
Some(_) => {
rslt.push_char(c);
rslt.push_char(c.unwrap());
bump(rdr);
}
_ => return rslt
@ -476,7 +493,7 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token {
let mut num_str;
let mut base = 10u;
let mut c = c;
let mut n = nextch(rdr);
let mut n = nextch(rdr).unwrap_or('\x00');
let start_bpos = rdr.last_pos.get();
if c == '0' && n == 'x' {
bump(rdr);
@ -492,7 +509,7 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token {
base = 2u;
}
num_str = scan_digits(rdr, base);
c = rdr.curr.get();
c = rdr.curr.get().unwrap_or('\x00');
nextch(rdr);
if c == 'u' || c == 'i' {
enum Result { Signed(ast::IntTy), Unsigned(ast::UintTy) }
@ -502,13 +519,13 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token {
else { Unsigned(ast::TyU) }
};
bump(rdr);
c = rdr.curr.get();
c = rdr.curr.get().unwrap_or('\x00');
if c == '8' {
bump(rdr);
tp = if signed { Signed(ast::TyI8) }
else { Unsigned(ast::TyU8) };
}
n = nextch(rdr);
n = nextch(rdr).unwrap_or('\x00');
if c == '1' && n == '6' {
bump(rdr);
bump(rdr);
@ -541,8 +558,7 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token {
}
}
let mut is_float = false;
if rdr.curr.get() == '.' && !(ident_start(nextch(rdr)) || nextch(rdr) ==
'.') {
if rdr.curr_is('.') && !(ident_start(nextch(rdr)) || nextch_is(rdr, '.')) {
is_float = true;
bump(rdr);
let dec_part = scan_digits(rdr, 10u);
@ -557,10 +573,10 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token {
None => ()
}
if rdr.curr.get() == 'f' {
if rdr.curr_is('f') {
bump(rdr);
c = rdr.curr.get();
n = nextch(rdr);
c = rdr.curr.get().unwrap_or('\x00');
n = nextch(rdr).unwrap_or('\x00');
if c == '3' && n == '2' {
bump(rdr);
bump(rdr);
@ -602,18 +618,23 @@ fn scan_numeric_escape(rdr: &StringReader, n_hex_digits: uint) -> char {
let mut accum_int = 0;
let mut i = n_hex_digits;
let start_bpos = rdr.last_pos.get();
while i != 0u {
while i != 0u && !is_eof(rdr) {
let n = rdr.curr.get();
if !is_hex_digit(n) {
fatal_span_char(rdr, rdr.last_pos.get(), rdr.pos.get(),
~"illegal character in numeric character escape",
n);
n.unwrap());
}
bump(rdr);
accum_int *= 16;
accum_int += hex_digit_val(n);
i -= 1u;
}
if i != 0 && is_eof(rdr) {
fatal_span(rdr, start_bpos, rdr.last_pos.get(),
~"unterminated numeric character escape");
}
match char::from_u32(accum_int as u32) {
Some(x) => x,
None => fatal_span(rdr, start_bpos, rdr.last_pos.get(),
@ -621,14 +642,18 @@ fn scan_numeric_escape(rdr: &StringReader, n_hex_digits: uint) -> char {
}
}
fn ident_start(c: char) -> bool {
/// Reports whether `c` may begin an identifier: an ASCII letter, `_`, or a
/// non-ASCII character accepted by `char::is_XID_start`.
fn ident_start(c: Option<char>) -> bool {
    match c {
        // `None` cannot start an identifier.
        None => false,
        Some(ch) => {
            ch == '_'
                || ('a' <= ch && ch <= 'z')
                || ('A' <= ch && ch <= 'Z')
                || (ch > '\x7f' && char::is_XID_start(ch))
        }
    }
}
fn ident_continue(c: char) -> bool {
fn ident_continue(c: Option<char>) -> bool {
let c = match c { Some(c) => c, None => return false };
(c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9')
@ -641,7 +666,7 @@ fn ident_continue(c: char) -> bool {
// EFFECT: updates the interner
fn next_token_inner(rdr: &StringReader) -> token::Token {
let c = rdr.curr.get();
if ident_start(c) && nextch(rdr) != '"' && nextch(rdr) != '#' {
if ident_start(c) && !nextch_is(rdr, '"') && !nextch_is(rdr, '#') {
// Note: r as in r" or r#" is part of a raw string literal,
// not an identifier, and is handled further down.
@ -654,7 +679,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
if string == "_" {
token::UNDERSCORE
} else {
let is_mod_name = rdr.curr.get() == ':' && nextch(rdr) == ':';
let is_mod_name = rdr.curr_is(':') && nextch_is(rdr, ':');
// FIXME: perform NFKC normalization here. (Issue #2253)
token::IDENT(str_to_ident(string), is_mod_name)
@ -662,16 +687,16 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
})
}
if is_dec_digit(c) {
return scan_number(c, rdr);
return scan_number(c.unwrap(), rdr);
}
fn binop(rdr: &StringReader, op: token::BinOp) -> token::Token {
bump(rdr);
if rdr.curr.get() == '=' {
if rdr.curr_is('=') {
bump(rdr);
return token::BINOPEQ(op);
} else { return token::BINOP(op); }
}
match c {
match c.expect("next_token_inner called at EOF") {
@ -682,9 +707,9 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
',' => { bump(rdr); return token::COMMA; }
'.' => {
bump(rdr);
return if rdr.curr.get() == '.' {
return if rdr.curr_is('.') {
bump(rdr);
if rdr.curr.get() == '.' {
if rdr.curr_is('.') {
bump(rdr);
token::DOTDOTDOT
} else {
@ -705,7 +730,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
'~' => { bump(rdr); return token::TILDE; }
':' => {
bump(rdr);
if rdr.curr.get() == ':' {
if rdr.curr_is(':') {
bump(rdr);
return token::MOD_SEP;
} else { return token::COLON; }
@ -720,10 +745,10 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
// Multi-byte tokens.
'=' => {
bump(rdr);
if rdr.curr.get() == '=' {
if rdr.curr_is('=') {
bump(rdr);
return token::EQEQ;
} else if rdr.curr.get() == '>' {
} else if rdr.curr_is('>') {
bump(rdr);
return token::FAT_ARROW;
} else {
@ -732,19 +757,19 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
}
'!' => {
bump(rdr);
if rdr.curr.get() == '=' {
if rdr.curr_is('=') {
bump(rdr);
return token::NE;
} else { return token::NOT; }
}
'<' => {
bump(rdr);
match rdr.curr.get() {
match rdr.curr.get().unwrap_or('\x00') {
'=' => { bump(rdr); return token::LE; }
'<' => { return binop(rdr, token::SHL); }
'-' => {
bump(rdr);
match rdr.curr.get() {
match rdr.curr.get().unwrap_or('\x00') {
'>' => { bump(rdr); return token::DARROW; }
_ => { return token::LARROW; }
}
@ -754,7 +779,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
}
'>' => {
bump(rdr);
match rdr.curr.get() {
match rdr.curr.get().unwrap_or('\x00') {
'=' => { bump(rdr); return token::GE; }
'>' => { return binop(rdr, token::SHR); }
_ => { return token::GT; }
@ -764,12 +789,14 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
// Either a character constant 'a' OR a lifetime name 'abc
bump(rdr);
let start = rdr.last_pos.get();
let mut c2 = rdr.curr.get();
// the eof will be picked up by the final `'` check below
let mut c2 = rdr.curr.get().unwrap_or('\x00');
bump(rdr);
// If the character is an ident start not followed by another single
// quote, then this is a lifetime name:
if ident_start(c2) && rdr.curr.get() != '\'' {
if ident_start(Some(c2)) && !rdr.curr_is('\'') {
while ident_continue(rdr.curr.get()) {
bump(rdr);
}
@ -798,19 +825,24 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
let escaped_pos = rdr.last_pos.get();
bump(rdr);
match escaped {
'n' => { c2 = '\n'; }
'r' => { c2 = '\r'; }
't' => { c2 = '\t'; }
'\\' => { c2 = '\\'; }
'\'' => { c2 = '\''; }
'"' => { c2 = '"'; }
'0' => { c2 = '\x00'; }
'x' => { c2 = scan_numeric_escape(rdr, 2u); }
'u' => { c2 = scan_numeric_escape(rdr, 4u); }
'U' => { c2 = scan_numeric_escape(rdr, 8u); }
c2 => {
fatal_span_char(rdr, escaped_pos, rdr.last_pos.get(),
~"unknown character escape", c2);
None => {}
Some(e) => {
c2 = match e {
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
'\'' => '\'',
'"' => '"',
'0' => '\x00',
'x' => scan_numeric_escape(rdr, 2u),
'u' => scan_numeric_escape(rdr, 4u),
'U' => scan_numeric_escape(rdr, 8u),
c2 => {
fatal_span_char(rdr, escaped_pos, rdr.last_pos.get(),
~"unknown character escape", c2)
}
}
}
}
}
@ -820,7 +852,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
}
_ => {}
}
if rdr.curr.get() != '\'' {
if !rdr.curr_is('\'') {
fatal_span_verbose(rdr,
// Byte offsetting here is okay because the
// character before position `start` is an
@ -836,17 +868,22 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
let mut accum_str = ~"";
let start_bpos = rdr.last_pos.get();
bump(rdr);
while rdr.curr.get() != '"' {
while !rdr.curr_is('"') {
if is_eof(rdr) {
fatal_span(rdr, start_bpos, rdr.last_pos.get(),
~"unterminated double quote string");
}
let ch = rdr.curr.get();
let ch = rdr.curr.get().unwrap();
bump(rdr);
match ch {
'\\' => {
let escaped = rdr.curr.get();
if is_eof(rdr) {
fatal_span(rdr, start_bpos, rdr.last_pos.get(),
~"unterminated double quote string");
}
let escaped = rdr.curr.get().unwrap();
let escaped_pos = rdr.last_pos.get();
bump(rdr);
match escaped {
@ -883,15 +920,19 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
let start_bpos = rdr.last_pos.get();
bump(rdr);
let mut hash_count = 0u;
while rdr.curr.get() == '#' {
while rdr.curr_is('#') {
bump(rdr);
hash_count += 1;
}
if rdr.curr.get() != '"' {
if is_eof(rdr) {
fatal_span(rdr, start_bpos, rdr.last_pos.get(),
~"unterminated raw string");
} else if !rdr.curr_is('"') {
fatal_span_char(rdr, start_bpos, rdr.last_pos.get(),
~"only `#` is allowed in raw string delimitation; \
found illegal character",
rdr.curr.get());
rdr.curr.get().unwrap());
}
bump(rdr);
let content_start_bpos = rdr.last_pos.get();
@ -901,11 +942,11 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
fatal_span(rdr, start_bpos, rdr.last_pos.get(),
~"unterminated raw string");
}
if rdr.curr.get() == '"' {
if rdr.curr_is('"') {
content_end_bpos = rdr.last_pos.get();
for _ in range(0, hash_count) {
bump(rdr);
if rdr.curr.get() != '#' {
if !rdr.curr_is('#') {
continue 'outer;
}
}
@ -921,14 +962,14 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
return token::LIT_STR_RAW(str_content, hash_count);
}
'-' => {
if nextch(rdr) == '>' {
if nextch_is(rdr, '>') {
bump(rdr);
bump(rdr);
return token::RARROW;
} else { return binop(rdr, token::MINUS); }
}
'&' => {
if nextch(rdr) == '&' {
if nextch_is(rdr, '&') {
bump(rdr);
bump(rdr);
return token::ANDAND;
@ -936,7 +977,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token {
}
'|' => {
match nextch(rdr) {
'|' => { bump(rdr); bump(rdr); return token::OROR; }
Some('|') => { bump(rdr); bump(rdr); return token::OROR; }
_ => { return binop(rdr, token::OR); }
}
}