std: Deny overlong encodings in UTF-8

An 'overlong encoding' is a codepoint encoded non-minimally in the UTF-8 format. Denying these ensures that each codepoint has exactly one valid representation in UTF-8. An example is the byte sequence 0xE0 0x80 0x80, which could be interpreted as U+0000 but is an overlong encoding, since the canonical form is just 0x00. Another example is 0xE0 0x80 0xAF, which was previously accepted and is an overlong encoding of the solidus "/" (U+002F). Directory traversal characters like / and . form the most compelling argument for why this commit is security critical. Factor out common UTF-8 decoding expressions as macros. This commit partly duplicates the UTF-8 decoding logic, so it is now present in both fn is_utf8() and .char_range_at(); the latter assumes that its input is already a valid str.
2025-05-14 02:49:40 +00:00 · 2013-07-30 18:39:31 +02:00 · 2013-07-30 18:39:31 +02:00 · b4ff95599a
commit b4ff95599a
parent 6dd185930d
1 changed files with 45 additions and 8 deletions
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@ -564,6 +564,18 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
 Section: Misc
 */

+// Return the initial codepoint accumulator for the first byte.
+// The first byte is special, only want bottom 5 bits for width 2, 4 bits
+// for width 3, and 3 bits for width 4
+macro_rules! utf8_first_byte(
+    ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
+)
+
+// return the value of $ch updated with continuation byte $byte
+macro_rules! utf8_acc_cont_byte(
+    ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
+)
+
 /// Determines if a vector of bytes contains valid UTF-8
 pub fn is_utf8(v: &[u8]) -> bool {
    let mut i = 0u;
@ -577,11 +589,26 @@ pub fn is_utf8(v: &[u8]) -> bool {

            let nexti = i + w;
            if nexti > total { return false; }
+            // 1. Make sure the correct number of continuation bytes are present
+            // 2. Check codepoint ranges (deny overlong encodings)
+            //    2-byte encoding is for codepoints  \u0080 to  \u07ff
+            //    3-byte encoding is for codepoints  \u0800 to  \uffff
+            //    4-byte encoding is for codepoints \u10000 to \u10ffff

+            //    2-byte encodings are correct if the width and continuation match up
            if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
            if w > 2 {
+                let mut ch;
+                ch = utf8_first_byte!(v[i], w);
+                ch = utf8_acc_cont_byte!(ch, v[i + 1]);
                if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
-                if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
+                ch = utf8_acc_cont_byte!(ch, v[i + 2]);
+                if w == 3 && ch < MAX_TWO_B { return false; }
+                if w > 3 {
+                    if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; }
+                    ch = utf8_acc_cont_byte!(ch, v[i + 3]);
+                    if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; }
+                }
            }

            i = nexti;
@ -738,6 +765,7 @@ static MAX_TWO_B: uint = 2048u;
 static TAG_THREE_B: uint = 224u;
 static MAX_THREE_B: uint = 65536u;
 static TAG_FOUR_B: uint = 240u;
+static MAX_UNICODE: uint = 1114112u;

 /// Unsafe operations
 pub mod raw {
@ -1665,12 +1693,10 @@ impl<'self> StrSlice<'self> for &'self str {
            let w = UTF8_CHAR_WIDTH[val] as uint;
            assert!((w != 0));

-            // First byte is special, only want bottom 5 bits for width 2, 4 bits
-            // for width 3, and 3 bits for width 4
-            val &= 0x7Fu >> w;
-            val = (val << 6) | (s[i + 1] & 63u8) as uint;
-            if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
-            if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
+            val = utf8_first_byte!(val, w);
+            val = utf8_acc_cont_byte!(val, s[i + 1]);
+            if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
+            if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }

            return CharRange {ch: val as char, next: i + w};
        }
@ -2035,7 +2061,7 @@ impl OwnedStr for ~str {
    /// Appends a character to the back of a string
    #[inline]
    fn push_char(&mut self, c: char) {
-        assert!(c as uint <= 0x10ffff); // FIXME: #7609: should be enforced on all `char`
+        assert!((c as uint) < MAX_UNICODE); // FIXME: #7609: should be enforced on all `char`
        unsafe {
            let code = c as uint;
            let nb = if code < MAX_ONE_B { 1u }
@ -2802,6 +2828,17 @@ mod tests {
        assert_eq!(ss, from_bytes(bb));
    }

+    #[test]
+    fn test_is_utf8_deny_overlong() {
+        assert!(!is_utf8([0xc0, 0x80]));
+        assert!(!is_utf8([0xc0, 0xae]));
+        assert!(!is_utf8([0xe0, 0x80, 0x80]));
+        assert!(!is_utf8([0xe0, 0x80, 0xaf]));
+        assert!(!is_utf8([0xe0, 0x81, 0x81]));
+        assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
+    }
+
+
    #[test]
    #[ignore(cfg(windows))]
    fn test_from_bytes_fail() {