auto merge of #13469 : kmcallister/rust/utf16, r=huonw

This fixes two separate issues related to character encoding.

* Add `encode_utf16` to the `Char` trait, analogous to `encode_utf8`.  `&str` already supports UTF-16 encoding but only with a heap allocation.  Also fix `encode_utf8` docs and add tests.

* Correctly decode non-BMP hex escapes in JSON (#13064).
This commit is contained in:
bors 2014-04-13 05:51:52 -07:00
commit 4c62ab109b
3 changed files with 126 additions and 44 deletions

View File

@ -239,6 +239,7 @@ use std::io::MemWriter;
use std::io; use std::io;
use std::num; use std::num;
use std::str; use std::str;
use std::str::ScalarValue;
use std::strbuf::StrBuf; use std::strbuf::StrBuf;
use Encodable; use Encodable;
@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
Ok(res) Ok(res)
} }
/// Reads the four hex digits of a `\u` escape and returns their value.
///
/// Each iteration calls `bump()` and then inspects `ch_or_null()`, so the
/// parser is expected to be positioned just before the first digit.
/// Both lowercase and uppercase hex letters are accepted.
///
/// Returns an error if a character is not a hex digit, or if `eof()`
/// becomes true before four digits have been consumed.
fn decode_hex_escape(&mut self) -> DecodeResult<u16> {
    let mut digits = 0u;
    let mut value = 0u16;

    while digits < 4u && !self.eof() {
        self.bump();
        // Numeric value (0-15) of the hex digit under the cursor.
        let nibble = match self.ch_or_null() {
            c @ '0' .. '9' => (c as u16) - ('0' as u16),
            c @ 'a' .. 'f' => (c as u16) - ('a' as u16) + 10_u16,
            c @ 'A' .. 'F' => (c as u16) - ('A' as u16) + 10_u16,
            _ => return self.error(
                ~"invalid \\u escape (unrecognized hex)")
        };
        // Shift the accumulated value one hex digit left and append.
        // Four digits top out at 0xFFFF, so this cannot overflow a u16.
        value = value * 16_u16 + nibble;
        digits += 1u;
    }

    // The loop can stop early at end of input; that is a malformed escape.
    if digits != 4u {
        return self.error(
            ~"invalid \\u escape (not four digits)");
    }

    Ok(value)
}
fn parse_str(&mut self) -> DecodeResult<~str> { fn parse_str(&mut self) -> DecodeResult<~str> {
let mut escape = false; let mut escape = false;
let mut res = StrBuf::new(); let mut res = StrBuf::new();
@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
'n' => res.push_char('\n'), 'n' => res.push_char('\n'),
'r' => res.push_char('\r'), 'r' => res.push_char('\r'),
't' => res.push_char('\t'), 't' => res.push_char('\t'),
'u' => { 'u' => match try!(self.decode_hex_escape()) {
// Parse \u1234. 0xDC00 .. 0xDFFF => return self.error(
let mut i = 0u; ~"lone trailing surrogate in hex escape"),
let mut n = 0u;
while i < 4u && !self.eof() { // Non-BMP characters are encoded as a sequence of
self.bump(); // two hex escapes, representing UTF-16 surrogates.
n = match self.ch_or_null() { n1 @ 0xD800 .. 0xDBFF => {
c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint), let c1 = self.next_char();
'a' | 'A' => n * 16u + 10u, let c2 = self.next_char();
'b' | 'B' => n * 16u + 11u, match (c1, c2) {
'c' | 'C' => n * 16u + 12u, (Some('\\'), Some('u')) => (),
'd' | 'D' => n * 16u + 13u,
'e' | 'E' => n * 16u + 14u,
'f' | 'F' => n * 16u + 15u,
_ => return self.error( _ => return self.error(
~"invalid \\u escape (unrecognized hex)") ~"unexpected end of non-BMP hex escape"),
};
i += 1u;
} }
// Error out if we didn't parse 4 digits. let buf = [n1, try!(self.decode_hex_escape())];
if i != 4u { match str::utf16_items(buf.as_slice()).next() {
return self.error( Some(ScalarValue(c)) => res.push_char(c),
~"invalid \\u escape (not four digits)"); _ => return self.error(
~"lone leading surrogate in hex escape"),
}
} }
res.push_char(char::from_u32(n as u32).unwrap()); n => match char::from_u32(n as u32) {
} Some(c) => res.push_char(c),
None => return self.error(
format!("invalid Unicode codepoint {:u}", n)),
},
},
_ => return self.error(~"invalid escape"), _ => return self.error(~"invalid escape"),
} }
escape = false; escape = false;
@ -2139,6 +2169,16 @@ mod tests {
assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo"))); assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab"))); assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12"))); assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));
// Non-BMP escapes. The exact error messages and positions are kind of
// arbitrary.
assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9")));
assert!(from_str("\"\\ud83d\"").is_err());
assert!(from_str("\"\\udca9\"").is_err());
assert!(from_str("\"\\ud83d\\ud83d\"").is_err());
assert!(from_str("\"\\ud83dx\"").is_err());
assert!(from_str("\"\\udca9\\udca9\"").is_err());
assert!(from_str("\"\\udca9x\"").is_err());
} }
#[test] #[test]

View File

@ -32,6 +32,7 @@ use unicode::{derived_property, property, general_category, decompose, conversio
#[cfg(test)] use str::Str; #[cfg(test)] use str::Str;
#[cfg(test)] use strbuf::StrBuf; #[cfg(test)] use strbuf::StrBuf;
#[cfg(test)] use slice::ImmutableVector;
#[cfg(not(test))] use cmp::{Eq, Ord}; #[cfg(not(test))] use cmp::{Eq, Ord};
#[cfg(not(test))] use default::Default; #[cfg(not(test))] use default::Default;
@ -560,11 +561,19 @@ pub trait Char {
/// Encodes this character as UTF-8 into the provided byte buffer. /// Encodes this character as UTF-8 into the provided byte buffer.
/// ///
/// The buffer must be at least 4 bytes long or a runtime failure will /// The buffer must be at least 4 bytes long or a runtime failure may
/// occur. /// occur.
/// ///
/// This will then return the number of characters written to the slice. /// This will then return the number of bytes written to the slice.
fn encode_utf8(&self, dst: &mut [u8]) -> uint; fn encode_utf8(&self, dst: &mut [u8]) -> uint;
/// Encodes this character as UTF-16 into the provided `u16` buffer.
///
/// The buffer must be at least 2 elements long or a runtime failure may
/// occur.
///
/// This will then return the number of `u16`s written to the slice.
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
} }
impl Char for char { impl Char for char {
@ -602,7 +611,7 @@ impl Char for char {
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) } fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint { fn encode_utf8(&self, dst: &mut [u8]) -> uint {
let code = *self as uint; let code = *self as uint;
if code < MAX_ONE_B { if code < MAX_ONE_B {
dst[0] = code as u8; dst[0] = code as u8;
@ -624,6 +633,24 @@ impl Char for char {
return 4; return 4;
} }
} }
/// Writes the UTF-16 encoding of this character into `dst` and returns
/// the number of `u16` units written (1 or 2).
///
/// Indexes `dst[0]`, and also `dst[1]` for characters above U+FFFF, so
/// `dst` must be long enough for those writes.
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
    let code = *self as uint;

    if code > 0xFFFF_u {
        // Supplementary planes: subtract 0x10000 and split the remaining
        // 20 bits into a leading and a trailing surrogate.
        assert!(code >= 0x1_0000_u && code <= 0x10_FFFF_u);
        let offset = code - 0x1_0000_u;
        dst[0] = 0xD800_u16 | ((offset >> 10) as u16);
        dst[1] = 0xDC00_u16 | ((offset as u16) & 0x3FF_u16);
        return 2;
    }

    // BMP: the code point is its own single code unit. It must not lie in
    // the surrogate range, which the assertion enforces.
    assert!(code <= 0xD7FF_u || code >= 0xE000_u);
    dst[0] = code as u16;
    1
}
} }
#[cfg(not(test))] #[cfg(not(test))]
@ -788,3 +815,31 @@ fn test_to_str() {
let s = 't'.to_str(); let s = 't'.to_str();
assert_eq!(s, ~"t"); assert_eq!(s, ~"t");
} }
#[test]
fn test_encode_utf8() {
    // Encodes `input` into a 4-byte scratch buffer and compares only the
    // bytes reported as written against `expect`.
    fn assert_utf8(input: char, expect: &[u8]) {
        let mut scratch = [0u8, ..4];
        let written = input.encode_utf8(scratch /* as mut slice! */);
        assert_eq!(scratch.slice_to(written), expect);
    }

    // One sample per encoded length: 1, 2, 3 and 4 bytes.
    assert_utf8('x', [0x78]);
    assert_utf8('\u00e9', [0xc3, 0xa9]);
    assert_utf8('\ua66e', [0xea, 0x99, 0xae]);
    assert_utf8('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);
}
#[test]
fn test_encode_utf16() {
    // Encodes `input` into a 2-unit scratch buffer and compares only the
    // units reported as written against `expect`.
    fn assert_utf16(input: char, expect: &[u16]) {
        let mut scratch = [0u16, ..2];
        let written = input.encode_utf16(scratch /* as mut slice! */);
        assert_eq!(scratch.slice_to(written), expect);
    }

    // Three BMP characters (one unit each), then a supplementary-plane
    // character that needs a surrogate pair.
    assert_utf16('x', [0x0078]);
    assert_utf16('\u00e9', [0x00e9]);
    assert_utf16('\ua66e', [0xa66e]);
    assert_utf16('\U0001f4a9', [0xd83d, 0xdca9]);
}

View File

@ -2555,22 +2555,9 @@ impl<'a> StrSlice<'a> for &'a str {
fn to_utf16(&self) -> ~[u16] { fn to_utf16(&self) -> ~[u16] {
let mut u = ~[]; let mut u = ~[];
for ch in self.chars() { for ch in self.chars() {
// Arithmetic with u32 literals is easier on the eyes than chars. let mut buf = [0u16, ..2];
let mut ch = ch as u32; let n = ch.encode_utf16(buf /* as mut slice! */);
u.push_all(buf.slice_to(n));
if (ch & 0xFFFF_u32) == ch {
// The BMP falls through (assuming non-surrogate, as it
// should)
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
u.push(ch as u16)
} else {
// Supplementary planes break into surrogates.
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
ch -= 0x1_0000_u32;
let w1 = 0xD800_u16 | ((ch >> 10) as u16);
let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
u.push_all([w1, w2])
}
} }
u u
} }