diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs index 6c980f2f834..02afdb713ff 100644 --- a/src/libserialize/json.rs +++ b/src/libserialize/json.rs @@ -239,6 +239,7 @@ use std::io::MemWriter; use std::io; use std::num; use std::str; +use std::str::ScalarValue; use std::strbuf::StrBuf; use Encodable; @@ -1129,6 +1130,35 @@ impl> Parser { Ok(res) } + fn decode_hex_escape(&mut self) -> DecodeResult { + let mut i = 0u; + let mut n = 0u16; + while i < 4u && !self.eof() { + self.bump(); + n = match self.ch_or_null() { + c @ '0' .. '9' => n * 16_u16 + ((c as u16) - ('0' as u16)), + 'a' | 'A' => n * 16_u16 + 10_u16, + 'b' | 'B' => n * 16_u16 + 11_u16, + 'c' | 'C' => n * 16_u16 + 12_u16, + 'd' | 'D' => n * 16_u16 + 13_u16, + 'e' | 'E' => n * 16_u16 + 14_u16, + 'f' | 'F' => n * 16_u16 + 15_u16, + _ => return self.error( + ~"invalid \\u escape (unrecognized hex)") + }; + + i += 1u; + } + + // Error out if we didn't parse 4 digits. + if i != 4u { + return self.error( + ~"invalid \\u escape (not four digits)"); + } + + Ok(n) + } + fn parse_str(&mut self) -> DecodeResult<~str> { let mut escape = false; let mut res = StrBuf::new(); @@ -1149,35 +1179,35 @@ impl> Parser { 'n' => res.push_char('\n'), 'r' => res.push_char('\r'), 't' => res.push_char('\t'), - 'u' => { - // Parse \u1234. - let mut i = 0u; - let mut n = 0u; - while i < 4u && !self.eof() { - self.bump(); - n = match self.ch_or_null() { - c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint), - 'a' | 'A' => n * 16u + 10u, - 'b' | 'B' => n * 16u + 11u, - 'c' | 'C' => n * 16u + 12u, - 'd' | 'D' => n * 16u + 13u, - 'e' | 'E' => n * 16u + 14u, - 'f' | 'F' => n * 16u + 15u, + 'u' => match try!(self.decode_hex_escape()) { + 0xDC00 .. 0xDFFF => return self.error( + ~"lone trailing surrogate in hex escape"), + + // Non-BMP characters are encoded as a sequence of + // two hex escapes, representing UTF-16 surrogates. + n1 @ 0xD800 .. 0xDBFF => { + let c1 = self.next_char(); + let c2 = self.next_char(); + match (c1, c2) { + (Some('\\'), Some('u')) => (), _ => return self.error( - ~"invalid \\u escape (unrecognized hex)") - }; + ~"unexpected end of non-BMP hex escape"), + } - i += 1u; + let buf = [n1, try!(self.decode_hex_escape())]; + match str::utf16_items(buf.as_slice()).next() { + Some(ScalarValue(c)) => res.push_char(c), + _ => return self.error( + ~"lone leading surrogate in hex escape"), + } } - // Error out if we didn't parse 4 digits. - if i != 4u { - return self.error( - ~"invalid \\u escape (not four digits)"); - } - - res.push_char(char::from_u32(n as u32).unwrap()); - } + n => match char::from_u32(n as u32) { + Some(c) => res.push_char(c), + None => return self.error( + format!("invalid Unicode codepoint {:u}", n)), + }, + }, _ => return self.error(~"invalid escape"), } escape = false; @@ -2139,6 +2169,16 @@ mod tests { assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo"))); assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab"))); assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12"))); + + // Non-BMP escapes. The exact error messages and positions are kind of + // arbitrary. + assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9"))); + assert!(from_str("\"\\ud83d\"").is_err()); + assert!(from_str("\"\\udca9\"").is_err()); + assert!(from_str("\"\\ud83d\\ud83d\"").is_err()); + assert!(from_str("\"\\ud83dx\"").is_err()); + assert!(from_str("\"\\udca9\\udca9\"").is_err()); + assert!(from_str("\"\\udca9x\"").is_err()); } #[test] diff --git a/src/libstd/char.rs b/src/libstd/char.rs index 702dbcca8be..67c046986d3 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -32,6 +32,7 @@ use unicode::{derived_property, property, general_category, decompose, conversio #[cfg(test)] use str::Str; #[cfg(test)] use strbuf::StrBuf; +#[cfg(test)] use slice::ImmutableVector; #[cfg(not(test))] use cmp::{Eq, Ord}; #[cfg(not(test))] use default::Default; @@ -560,11 +561,19 @@ pub trait Char { /// Encodes this character as UTF-8 into the provided byte buffer. /// - /// The buffer must be at least 4 bytes long or a runtime failure will + /// The buffer must be at least 4 bytes long or a runtime failure may /// occur. /// - /// This will then return the number of characters written to the slice. + /// This will then return the number of bytes written to the slice. fn encode_utf8(&self, dst: &mut [u8]) -> uint; + + /// Encodes this character as UTF-16 into the provided `u16` buffer. + /// + /// The buffer must be at least 2 elements long or a runtime failure may + /// occur. + /// + /// This will then return the number of `u16`s written to the slice. + fn encode_utf16(&self, dst: &mut [u16]) -> uint; } impl Char for char { @@ -602,7 +611,7 @@ impl Char for char { fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) } - fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint { + fn encode_utf8(&self, dst: &mut [u8]) -> uint { let code = *self as uint; if code < MAX_ONE_B { dst[0] = code as u8; @@ -624,6 +633,24 @@ impl Char for char { return 4; } } + + fn encode_utf16(&self, dst: &mut [u16]) -> uint { + let mut ch = *self as uint; + if (ch & 0xFFFF_u) == ch { + // The BMP falls through (assuming non-surrogate, as it + // should) + assert!(ch <= 0xD7FF_u || ch >= 0xE000_u); + dst[0] = ch as u16; + 1 + } else { + // Supplementary planes break into surrogates. + assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u); + ch -= 0x1_0000_u; + dst[0] = 0xD800_u16 | ((ch >> 10) as u16); + dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); + 2 + } + } } #[cfg(not(test))] @@ -788,3 +815,31 @@ fn test_to_str() { let s = 't'.to_str(); assert_eq!(s, ~"t"); } + +#[test] +fn test_encode_utf8() { + fn check(input: char, expect: &[u8]) { + let mut buf = [0u8, ..4]; + let n = input.encode_utf8(buf /* as mut slice! */); + assert_eq!(buf.slice_to(n), expect); + } + + check('x', [0x78]); + check('\u00e9', [0xc3, 0xa9]); + check('\ua66e', [0xea, 0x99, 0xae]); + check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]); +} + +#[test] +fn test_encode_utf16() { + fn check(input: char, expect: &[u16]) { + let mut buf = [0u16, ..2]; + let n = input.encode_utf16(buf /* as mut slice! */); + assert_eq!(buf.slice_to(n), expect); + } + + check('x', [0x0078]); + check('\u00e9', [0x00e9]); + check('\ua66e', [0xa66e]); + check('\U0001f4a9', [0xd83d, 0xdca9]); +} diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 904c19b40ed..449329ce63a 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -2555,22 +2555,9 @@ impl<'a> StrSlice<'a> for &'a str { fn to_utf16(&self) -> ~[u16] { let mut u = ~[]; for ch in self.chars() { - // Arithmetic with u32 literals is easier on the eyes than chars. - let mut ch = ch as u32; - - if (ch & 0xFFFF_u32) == ch { - // The BMP falls through (assuming non-surrogate, as it - // should) - assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32); - u.push(ch as u16) - } else { - // Supplementary planes break into surrogates. - assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32); - ch -= 0x1_0000_u32; - let w1 = 0xD800_u16 | ((ch >> 10) as u16); - let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); - u.push_all([w1, w2]) - } + let mut buf = [0u16, ..2]; + let n = ch.encode_utf16(buf /* as mut slice! */); + u.push_all(buf.slice_to(n)); } u }