mirror of
https://github.com/rust-lang/rust.git
synced 2025-02-25 13:24:22 +00:00
auto merge of #13469 : kmcallister/rust/utf16, r=huonw
This fixes two separate issues related to character encoding.

* Add `encode_utf16` to the `Char` trait, analogous to `encode_utf8`. `&str` already supports UTF-16 encoding, but only with a heap allocation. Also fix the `encode_utf8` docs and add tests.
* Correctly decode non-BMP hex escapes in JSON (#13064).
This commit is contained in:
commit
4c62ab109b
@ -239,6 +239,7 @@ use std::io::MemWriter;
|
|||||||
use std::io;
|
use std::io;
|
||||||
use std::num;
|
use std::num;
|
||||||
use std::str;
|
use std::str;
|
||||||
|
use std::str::ScalarValue;
|
||||||
use std::strbuf::StrBuf;
|
use std::strbuf::StrBuf;
|
||||||
|
|
||||||
use Encodable;
|
use Encodable;
|
||||||
@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
|
|||||||
Ok(res)
|
Ok(res)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads up to four characters and folds them, as hexadecimal digits
/// (either case), into a single `u16`.
///
/// `bump()` is called before each digit is examined. The function bails
/// out through `self.error` when a character is not a hex digit
/// ("unrecognized hex") or when the input ends before four digits have
/// been consumed ("not four digits").
///
/// The accumulated value is returned as-is; no surrogate check is done
/// in this function.
fn decode_hex_escape(&mut self) -> DecodeResult<u16> {
|
||||||
|
// `i` counts the digits consumed so far; `n` accumulates their value.
let mut i = 0u;
|
||||||
|
let mut n = 0u16;
|
||||||
|
// Stop early at end of input; the length check after the loop turns
// that case into an error.
while i < 4u && !self.eof() {
|
||||||
|
self.bump();
|
||||||
|
// Shift the accumulated value left by one hex digit and add the new
// digit. At most four digits are folded in, so `n` never exceeds
// 0xFFFF.
n = match self.ch_or_null() {
|
||||||
|
c @ '0' .. '9' => n * 16_u16 + ((c as u16) - ('0' as u16)),
|
||||||
|
'a' | 'A' => n * 16_u16 + 10_u16,
|
||||||
|
'b' | 'B' => n * 16_u16 + 11_u16,
|
||||||
|
'c' | 'C' => n * 16_u16 + 12_u16,
|
||||||
|
'd' | 'D' => n * 16_u16 + 13_u16,
|
||||||
|
'e' | 'E' => n * 16_u16 + 14_u16,
|
||||||
|
'f' | 'F' => n * 16_u16 + 15_u16,
|
||||||
|
_ => return self.error(
|
||||||
|
~"invalid \\u escape (unrecognized hex)")
|
||||||
|
};
|
||||||
|
|
||||||
|
i += 1u;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error out if we didn't parse 4 digits.
|
||||||
|
if i != 4u {
|
||||||
|
return self.error(
|
||||||
|
~"invalid \\u escape (not four digits)");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(n)
|
||||||
|
}
|
||||||
|
|
||||||
fn parse_str(&mut self) -> DecodeResult<~str> {
|
fn parse_str(&mut self) -> DecodeResult<~str> {
|
||||||
let mut escape = false;
|
let mut escape = false;
|
||||||
let mut res = StrBuf::new();
|
let mut res = StrBuf::new();
|
||||||
@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
|
|||||||
'n' => res.push_char('\n'),
|
'n' => res.push_char('\n'),
|
||||||
'r' => res.push_char('\r'),
|
'r' => res.push_char('\r'),
|
||||||
't' => res.push_char('\t'),
|
't' => res.push_char('\t'),
|
||||||
'u' => {
|
'u' => match try!(self.decode_hex_escape()) {
|
||||||
// Parse \u1234.
|
0xDC00 .. 0xDFFF => return self.error(
|
||||||
let mut i = 0u;
|
~"lone trailing surrogate in hex escape"),
|
||||||
let mut n = 0u;
|
|
||||||
while i < 4u && !self.eof() {
|
// Non-BMP characters are encoded as a sequence of
|
||||||
self.bump();
|
// two hex escapes, representing UTF-16 surrogates.
|
||||||
n = match self.ch_or_null() {
|
n1 @ 0xD800 .. 0xDBFF => {
|
||||||
c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint),
|
let c1 = self.next_char();
|
||||||
'a' | 'A' => n * 16u + 10u,
|
let c2 = self.next_char();
|
||||||
'b' | 'B' => n * 16u + 11u,
|
match (c1, c2) {
|
||||||
'c' | 'C' => n * 16u + 12u,
|
(Some('\\'), Some('u')) => (),
|
||||||
'd' | 'D' => n * 16u + 13u,
|
|
||||||
'e' | 'E' => n * 16u + 14u,
|
|
||||||
'f' | 'F' => n * 16u + 15u,
|
|
||||||
_ => return self.error(
|
_ => return self.error(
|
||||||
~"invalid \\u escape (unrecognized hex)")
|
~"unexpected end of non-BMP hex escape"),
|
||||||
};
|
}
|
||||||
|
|
||||||
i += 1u;
|
let buf = [n1, try!(self.decode_hex_escape())];
|
||||||
|
match str::utf16_items(buf.as_slice()).next() {
|
||||||
|
Some(ScalarValue(c)) => res.push_char(c),
|
||||||
|
_ => return self.error(
|
||||||
|
~"lone leading surrogate in hex escape"),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Error out if we didn't parse 4 digits.
|
n => match char::from_u32(n as u32) {
|
||||||
if i != 4u {
|
Some(c) => res.push_char(c),
|
||||||
return self.error(
|
None => return self.error(
|
||||||
~"invalid \\u escape (not four digits)");
|
format!("invalid Unicode codepoint {:u}", n)),
|
||||||
}
|
},
|
||||||
|
},
|
||||||
res.push_char(char::from_u32(n as u32).unwrap());
|
|
||||||
}
|
|
||||||
_ => return self.error(~"invalid escape"),
|
_ => return self.error(~"invalid escape"),
|
||||||
}
|
}
|
||||||
escape = false;
|
escape = false;
|
||||||
@ -2139,6 +2169,16 @@ mod tests {
|
|||||||
assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
|
assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
|
||||||
assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
|
assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
|
||||||
assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));
|
assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));
|
||||||
|
|
||||||
|
// Non-BMP escapes. The exact error messages and positions are kind of
|
||||||
|
// arbitrary.
|
||||||
|
assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9")));
|
||||||
|
assert!(from_str("\"\\ud83d\"").is_err());
|
||||||
|
assert!(from_str("\"\\udca9\"").is_err());
|
||||||
|
assert!(from_str("\"\\ud83d\\ud83d\"").is_err());
|
||||||
|
assert!(from_str("\"\\ud83dx\"").is_err());
|
||||||
|
assert!(from_str("\"\\udca9\\udca9\"").is_err());
|
||||||
|
assert!(from_str("\"\\udca9x\"").is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -32,6 +32,7 @@ use unicode::{derived_property, property, general_category, decompose, conversio
|
|||||||
|
|
||||||
#[cfg(test)] use str::Str;
|
#[cfg(test)] use str::Str;
|
||||||
#[cfg(test)] use strbuf::StrBuf;
|
#[cfg(test)] use strbuf::StrBuf;
|
||||||
|
#[cfg(test)] use slice::ImmutableVector;
|
||||||
|
|
||||||
#[cfg(not(test))] use cmp::{Eq, Ord};
|
#[cfg(not(test))] use cmp::{Eq, Ord};
|
||||||
#[cfg(not(test))] use default::Default;
|
#[cfg(not(test))] use default::Default;
|
||||||
@ -560,11 +561,19 @@ pub trait Char {
|
|||||||
|
|
||||||
/// Encodes this character as UTF-8 into the provided byte buffer.
|
/// Encodes this character as UTF-8 into the provided byte buffer.
|
||||||
///
|
///
|
||||||
/// The buffer must be at least 4 bytes long or a runtime failure will
|
/// The buffer must be at least 4 bytes long or a runtime failure may
|
||||||
/// occur.
|
/// occur.
|
||||||
///
|
///
|
||||||
/// This will then return the number of characters written to the slice.
|
/// This will then return the number of bytes written to the slice.
|
||||||
fn encode_utf8(&self, dst: &mut [u8]) -> uint;
|
fn encode_utf8(&self, dst: &mut [u8]) -> uint;
|
||||||
|
|
||||||
|
/// Encodes this character as UTF-16 into the provided `u16` buffer.
|
||||||
|
///
|
||||||
|
/// The buffer must be at least 2 elements long or a runtime failure may
|
||||||
|
/// occur.
|
||||||
|
///
|
||||||
|
/// This will then return the number of `u16`s written to the slice.
|
||||||
|
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Char for char {
|
impl Char for char {
|
||||||
@ -602,7 +611,7 @@ impl Char for char {
|
|||||||
|
|
||||||
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
|
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
|
||||||
|
|
||||||
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
|
fn encode_utf8(&self, dst: &mut [u8]) -> uint {
|
||||||
let code = *self as uint;
|
let code = *self as uint;
|
||||||
if code < MAX_ONE_B {
|
if code < MAX_ONE_B {
|
||||||
dst[0] = code as u8;
|
dst[0] = code as u8;
|
||||||
@ -624,6 +633,24 @@ impl Char for char {
|
|||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Writes the UTF-16 encoding of this character into `dst` and returns
/// the number of `u16`s written: 1 for a code point that fits in 16 bits,
/// 2 (a surrogate pair) otherwise.
///
/// `dst[0]` is always written and `dst[1]` is written in the two-unit
/// case, so `dst` needs at least that many elements. The `assert!`s check
/// that the code point is not itself a surrogate and lies within
/// 0x1_0000..=0x10_FFFF when it does not fit in 16 bits.
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
|
||||||
|
let mut ch = *self as uint;
|
||||||
|
// Masking leaves the value unchanged exactly when it fits in 16 bits.
if (ch & 0xFFFF_u) == ch {
|
||||||
|
// The BMP falls through (assuming non-surrogate, as it
|
||||||
|
// should)
|
||||||
|
assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
|
||||||
|
dst[0] = ch as u16;
|
||||||
|
1
|
||||||
|
} else {
|
||||||
|
// Supplementary planes break into surrogates.
|
||||||
|
assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
|
||||||
|
// Rebase to an offset of at most 20 bits (0xF_FFFF given the assert
// above).
ch -= 0x1_0000_u;
|
||||||
|
// The bits above the low ten go into the leading unit (0xD800 base).
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
|
||||||
|
// The low ten bits go into the trailing unit (0xDC00 base).
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
|
||||||
|
2
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(test))]
|
#[cfg(not(test))]
|
||||||
@ -788,3 +815,31 @@ fn test_to_str() {
|
|||||||
let s = 't'.to_str();
|
let s = 't'.to_str();
|
||||||
assert_eq!(s, ~"t");
|
assert_eq!(s, ~"t");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exercises `encode_utf8` on four characters whose expected encodings
// are 1, 2, 3 and 4 bytes long.
#[test]
|
||||||
|
fn test_encode_utf8() {
|
||||||
|
// Encodes `input` into a 4-byte scratch buffer and compares the first
// `n` bytes (`n` being the count returned by `encode_utf8`) against
// `expect`.
fn check(input: char, expect: &[u8]) {
|
||||||
|
let mut buf = [0u8, ..4];
|
||||||
|
let n = input.encode_utf8(buf /* as mut slice! */);
|
||||||
|
assert_eq!(buf.slice_to(n), expect);
|
||||||
|
}
|
||||||
|
|
||||||
|
check('x', [0x78]);
|
||||||
|
check('\u00e9', [0xc3, 0xa9]);
|
||||||
|
check('\ua66e', [0xea, 0x99, 0xae]);
|
||||||
|
check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exercises `encode_utf16` on three characters expected to produce a
// single `u16` and one (U+1F4A9) expected to produce two.
#[test]
|
||||||
|
fn test_encode_utf16() {
|
||||||
|
// Encodes `input` into a 2-element scratch buffer and compares the
// first `n` elements (`n` being the count returned by `encode_utf16`)
// against `expect`.
fn check(input: char, expect: &[u16]) {
|
||||||
|
let mut buf = [0u16, ..2];
|
||||||
|
let n = input.encode_utf16(buf /* as mut slice! */);
|
||||||
|
assert_eq!(buf.slice_to(n), expect);
|
||||||
|
}
|
||||||
|
|
||||||
|
check('x', [0x0078]);
|
||||||
|
check('\u00e9', [0x00e9]);
|
||||||
|
check('\ua66e', [0xa66e]);
|
||||||
|
check('\U0001f4a9', [0xd83d, 0xdca9]);
|
||||||
|
}
|
||||||
|
@ -2555,22 +2555,9 @@ impl<'a> StrSlice<'a> for &'a str {
|
|||||||
fn to_utf16(&self) -> ~[u16] {
|
fn to_utf16(&self) -> ~[u16] {
|
||||||
let mut u = ~[];
|
let mut u = ~[];
|
||||||
for ch in self.chars() {
|
for ch in self.chars() {
|
||||||
// Arithmetic with u32 literals is easier on the eyes than chars.
|
let mut buf = [0u16, ..2];
|
||||||
let mut ch = ch as u32;
|
let n = ch.encode_utf16(buf /* as mut slice! */);
|
||||||
|
u.push_all(buf.slice_to(n));
|
||||||
if (ch & 0xFFFF_u32) == ch {
|
|
||||||
// The BMP falls through (assuming non-surrogate, as it
|
|
||||||
// should)
|
|
||||||
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
|
|
||||||
u.push(ch as u16)
|
|
||||||
} else {
|
|
||||||
// Supplementary planes break into surrogates.
|
|
||||||
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
|
|
||||||
ch -= 0x1_0000_u32;
|
|
||||||
let w1 = 0xD800_u16 | ((ch >> 10) as u16);
|
|
||||||
let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
|
|
||||||
u.push_all([w1, w2])
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
u
|
u
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user