Auto merge of #45711 - tirr-c:unicode-span, r=estebank

Display spans correctly when there are zero-width or wide characters

Hopefully...

* fixes #45211
* fixes #8706

---

Before:

```
error: invalid width `7` for integer literal
  --> unicode_2.rs:12:25
   |
12 |     let _ = ("a̐éö̲", 0u7);
   |                         ^^^
   |
   = help: valid widths are 8, 16, 32, 64 and 128

error: invalid width `42` for integer literal
  --> unicode_2.rs:13:20
   |
13 |     let _ = ("아あ", 1i42);
   |                    ^^^^
   |
   = help: valid widths are 8, 16, 32, 64 and 128

error: aborting due to 2 previous errors
```

After:

```
error: invalid width `7` for integer literal
  --> unicode_2.rs:12:25
   |
12 |     let _ = ("a̐éö̲", 0u7);
   |                     ^^^
   |
   = help: valid widths are 8, 16, 32, 64 and 128

error: invalid width `42` for integer literal
  --> unicode_2.rs:13:20
   |
13 |     let _ = ("아あ", 1i42);
   |                      ^^^^
   |
   = help: valid widths are 8, 16, 32, 64 and 128

error: aborting due to 2 previous errors
```

Spans might display incorrectly in the browser.

r? @estebank
2024-10-30 05:51:58 +00:00 · 2017-11-04 23:09:19 +00:00 · 2017-11-04 23:09:19 +00:00 · 12e6b53744
commit 12e6b53744
parent d762b1d6c6 272c2faa1d
14 changed files with 231 additions and 16 deletions
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@ -2230,6 +2230,7 @@ version = "0.0.0"
 dependencies = [
 "rustc_data_structures 0.0.0",
 "serialize 0.0.0",
+ "unicode-width 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
--- a/src/librustc/ich/impls_syntax.rs
+++ b/src/librustc/ich/impls_syntax.rs
@ -364,6 +364,7 @@ impl<'gcx> HashStable<StableHashingContext<'gcx>> for FileMap {
            end_pos: _,
            ref lines,
            ref multibyte_chars,
+            ref non_narrow_chars,
        } = *self;

        name.hash_stable(hcx, hasher);
@ -389,6 +390,12 @@ impl<'gcx> HashStable<StableHashingContext<'gcx>> for FileMap {
        for &char_pos in multibyte_chars.iter() {
            stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher);
        }
+
+        let non_narrow_chars = non_narrow_chars.borrow();
+        non_narrow_chars.len().hash_stable(hcx, hasher);
+        for &char_pos in non_narrow_chars.iter() {
+            stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher);
+        }
    }
 }

@ -408,3 +415,12 @@ fn stable_multibyte_char(mbc: ::syntax_pos::MultiByteChar,

    (pos.0 - filemap_start.0, bytes as u32)
 }
+
+fn stable_non_narrow_char(swc: ::syntax_pos::NonNarrowChar,
+                          filemap_start: ::syntax_pos::BytePos)
+                          -> (u32, u32) {
+    let pos = swc.pos();
+    let width = swc.width();
+
+    (pos.0 - filemap_start.0, width as u32)
+}
--- a/src/librustc_errors/emitter.rs
+++ b/src/librustc_errors/emitter.rs
@ -10,7 +10,7 @@

 use self::Destination::*;

-use syntax_pos::{DUMMY_SP, FileMap, Span, MultiSpan, CharPos};
+use syntax_pos::{DUMMY_SP, FileMap, Span, MultiSpan};

 use {Level, CodeSuggestion, DiagnosticBuilder, SubDiagnostic, CodeMapper, DiagnosticId};
 use RenderSpan::*;
@ -201,8 +201,8 @@ impl EmitterWriter {
                // 6..7. This is degenerate input, but it's best to degrade
                // gracefully -- and the parser likes to supply a span like
                // that for EOF, in particular.
-                if lo.col == hi.col && lo.line == hi.line {
-                    hi.col = CharPos(lo.col.0 + 1);
+                if lo.col_display == hi.col_display && lo.line == hi.line {
+                    hi.col_display += 1;
                }

                let ann_type = if lo.line != hi.line {
@ -210,8 +210,8 @@ impl EmitterWriter {
                        depth: 1,
                        line_start: lo.line,
                        line_end: hi.line,
-                        start_col: lo.col.0,
-                        end_col: hi.col.0,
+                        start_col: lo.col_display,
+                        end_col: hi.col_display,
                        is_primary: span_label.is_primary,
                        label: span_label.label.clone(),
                    };
@ -221,8 +221,8 @@ impl EmitterWriter {
                    AnnotationType::Singleline
                };
                let ann = Annotation {
-                    start_col: lo.col.0,
-                    end_col: hi.col.0,
+                    start_col: lo.col_display,
+                    end_col: hi.col_display,
                    is_primary: span_label.is_primary,
                    label: span_label.label.clone(),
                    annotation_type: ann_type,
--- a/src/librustc_metadata/decoder.rs
+++ b/src/librustc_metadata/decoder.rs
@ -1189,6 +1189,7 @@ impl<'a, 'tcx> CrateMetadata {
                                      end_pos,
                                      lines,
                                      multibyte_chars,
+                                      non_narrow_chars,
                                      .. } = filemap_to_import;

            let source_length = (end_pos - start_pos).to_usize();
@ -1206,6 +1207,10 @@ impl<'a, 'tcx> CrateMetadata {
            for mbc in &mut multibyte_chars {
                mbc.pos = mbc.pos - start_pos;
            }
+            let mut non_narrow_chars = non_narrow_chars.into_inner();
+            for swc in &mut non_narrow_chars {
+                *swc = *swc - start_pos;
+            }

            let local_version = local_codemap.new_imported_filemap(name,
                                                                   name_was_remapped,
@ -1213,7 +1218,8 @@ impl<'a, 'tcx> CrateMetadata {
                                                                   src_hash,
                                                                   source_length,
                                                                   lines,
-                                                                   multibyte_chars);
+                                                                   multibyte_chars,
+                                                                   non_narrow_chars);
            debug!("CrateMetaData::imported_filemaps alloc \
                    filemap {:?} original (start_pos {:?} end_pos {:?}) \
                    translated (start_pos {:?} end_pos {:?})",
--- a/src/libsyntax/codemap.rs
+++ b/src/libsyntax/codemap.rs
@ -242,7 +242,8 @@ impl CodeMap {
                                src_hash: u128,
                                source_len: usize,
                                mut file_local_lines: Vec<BytePos>,
-                                mut file_local_multibyte_chars: Vec<MultiByteChar>)
+                                mut file_local_multibyte_chars: Vec<MultiByteChar>,
+                                mut file_local_non_narrow_chars: Vec<NonNarrowChar>)
                                -> Rc<FileMap> {
        let start_pos = self.next_start_pos();
        let mut files = self.files.borrow_mut();
@ -258,6 +259,10 @@ impl CodeMap {
            mbc.pos = mbc.pos + start_pos;
        }

+        for swc in &mut file_local_non_narrow_chars {
+            *swc = *swc + start_pos;
+        }
+
        let filemap = Rc::new(FileMap {
            name: filename,
            name_was_remapped,
@ -270,6 +275,7 @@ impl CodeMap {
            end_pos,
            lines: RefCell::new(file_local_lines),
            multibyte_chars: RefCell::new(file_local_multibyte_chars),
+            non_narrow_chars: RefCell::new(file_local_non_narrow_chars),
        });

        files.push(filemap.clone());
@ -297,6 +303,24 @@ impl CodeMap {
                let line = a + 1; // Line numbers start at 1
                let linebpos = (*f.lines.borrow())[a];
                let linechpos = self.bytepos_to_file_charpos(linebpos);
+                let col = chpos - linechpos;
+
+                let col_display = {
+                    let non_narrow_chars = f.non_narrow_chars.borrow();
+                    let start_width_idx = non_narrow_chars
+                        .binary_search_by_key(&linebpos, |x| x.pos())
+                        .unwrap_or_else(|x| x);
+                    let end_width_idx = non_narrow_chars
+                        .binary_search_by_key(&pos, |x| x.pos())
+                        .unwrap_or_else(|x| x);
+                    let special_chars = end_width_idx - start_width_idx;
+                    let non_narrow: usize =
+                        non_narrow_chars[start_width_idx..end_width_idx]
+                        .into_iter()
+                        .map(|x| x.width())
+                        .sum();
+                    col.0 - special_chars + non_narrow
+                };
                debug!("byte pos {:?} is on the line at byte pos {:?}",
                       pos, linebpos);
                debug!("char pos {:?} is on the line at char pos {:?}",
@ -306,14 +330,28 @@ impl CodeMap {
                Loc {
                    file: f,
                    line,
-                    col: chpos - linechpos,
+                    col,
+                    col_display,
                }
            }
            Err(f) => {
+                let col_display = {
+                    let non_narrow_chars = f.non_narrow_chars.borrow();
+                    let end_width_idx = non_narrow_chars
+                        .binary_search_by_key(&pos, |x| x.pos())
+                        .unwrap_or_else(|x| x);
+                    let non_narrow: usize =
+                        non_narrow_chars[0..end_width_idx]
+                        .into_iter()
+                        .map(|x| x.width())
+                        .sum();
+                    chpos.0 - end_width_idx + non_narrow
+                };
                Loc {
                    file: f,
                    line: 0,
                    col: chpos,
+                    col_display,
                }
            }
        }
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@ -433,6 +433,7 @@ impl<'a> StringReader<'a> {
                    self.filemap.record_multibyte_char(self.pos, new_ch_len);
                }
            }
+            self.filemap.record_width(self.pos, new_ch);
        } else {
            self.ch = None;
            self.pos = new_pos;
--- a/src/libsyntax_pos/Cargo.toml
+++ b/src/libsyntax_pos/Cargo.toml
@ -11,3 +11,4 @@ crate-type = ["dylib"]
 [dependencies]
 serialize = { path = "../libserialize" }
 rustc_data_structures = { path = "../librustc_data_structures" }
+unicode-width = "0.1.4"
--- a/src/libsyntax_pos/lib.rs
+++ b/src/libsyntax_pos/lib.rs
@ -44,6 +44,8 @@ use serialize::{Encodable, Decodable, Encoder, Decoder};
 extern crate serialize;
 extern crate serialize as rustc_serialize; // used by deriving

+extern crate unicode_width;
+
 pub mod hygiene;
 pub use hygiene::{SyntaxContext, ExpnInfo, ExpnFormat, NameAndSpan, CompilerDesugaringKind};

@ -494,6 +496,63 @@ pub struct MultiByteChar {
    pub bytes: usize,
 }

+/// Identifies an offset of a non-narrow character in a FileMap
+#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)]
+pub enum NonNarrowChar {
+    /// Represents a zero-width character
+    ZeroWidth(BytePos),
+    /// Represents a wide (fullwidth) character
+    Wide(BytePos),
+}
+
+impl NonNarrowChar {
+    fn new(pos: BytePos, width: usize) -> Self {
+        match width {
+            0 => NonNarrowChar::ZeroWidth(pos),
+            2 => NonNarrowChar::Wide(pos),
+            _ => panic!("width {} given for non-narrow character", width),
+        }
+    }
+
+    /// Returns the absolute offset of the character in the CodeMap
+    pub fn pos(&self) -> BytePos {
+        match *self {
+            NonNarrowChar::ZeroWidth(p) |
+            NonNarrowChar::Wide(p) => p,
+        }
+    }
+
+    /// Returns the width of the character, 0 (zero-width) or 2 (wide)
+    pub fn width(&self) -> usize {
+        match *self {
+            NonNarrowChar::ZeroWidth(_) => 0,
+            NonNarrowChar::Wide(_) => 2,
+        }
+    }
+}
+
+impl Add<BytePos> for NonNarrowChar {
+    type Output = Self;
+
+    fn add(self, rhs: BytePos) -> Self {
+        match self {
+            NonNarrowChar::ZeroWidth(pos) => NonNarrowChar::ZeroWidth(pos + rhs),
+            NonNarrowChar::Wide(pos) => NonNarrowChar::Wide(pos + rhs),
+        }
+    }
+}
+
+impl Sub<BytePos> for NonNarrowChar {
+    type Output = Self;
+
+    fn sub(self, rhs: BytePos) -> Self {
+        match self {
+            NonNarrowChar::ZeroWidth(pos) => NonNarrowChar::ZeroWidth(pos - rhs),
+            NonNarrowChar::Wide(pos) => NonNarrowChar::Wide(pos - rhs),
+        }
+    }
+}
+
 /// The state of the lazy external source loading mechanism of a FileMap.
 #[derive(PartialEq, Eq, Clone)]
 pub enum ExternalSource {
@ -552,11 +611,13 @@ pub struct FileMap {
    pub lines: RefCell<Vec<BytePos>>,
    /// Locations of multi-byte characters in the source code
    pub multibyte_chars: RefCell<Vec<MultiByteChar>>,
+    /// Width of characters that are not narrow in the source code
+    pub non_narrow_chars: RefCell<Vec<NonNarrowChar>>,
 }

 impl Encodable for FileMap {
    fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
-        s.emit_struct("FileMap", 7, |s| {
+        s.emit_struct("FileMap", 8, |s| {
            s.emit_struct_field("name", 0, |s| self.name.encode(s))?;
            s.emit_struct_field("name_was_remapped", 1, |s| self.name_was_remapped.encode(s))?;
            s.emit_struct_field("src_hash", 6, |s| self.src_hash.encode(s))?;
@ -610,6 +671,9 @@ impl Encodable for FileMap {
            })?;
            s.emit_struct_field("multibyte_chars", 5, |s| {
                (*self.multibyte_chars.borrow()).encode(s)
+            })?;
+            s.emit_struct_field("non_narrow_chars", 7, |s| {
+                (*self.non_narrow_chars.borrow()).encode(s)
            })
        })
    }
@ -618,7 +682,7 @@ impl Encodable for FileMap {
 impl Decodable for FileMap {
    fn decode<D: Decoder>(d: &mut D) -> Result<FileMap, D::Error> {

-        d.read_struct("FileMap", 6, |d| {
+        d.read_struct("FileMap", 8, |d| {
            let name: String = d.read_struct_field("name", 0, |d| Decodable::decode(d))?;
            let name_was_remapped: bool =
                d.read_struct_field("name_was_remapped", 1, |d| Decodable::decode(d))?;
@ -657,6 +721,8 @@ impl Decodable for FileMap {
            })?;
            let multibyte_chars: Vec<MultiByteChar> =
                d.read_struct_field("multibyte_chars", 5, |d| Decodable::decode(d))?;
+            let non_narrow_chars: Vec<NonNarrowChar> =
+                d.read_struct_field("non_narrow_chars", 7, |d| Decodable::decode(d))?;
            Ok(FileMap {
                name,
                name_was_remapped,
@ -671,7 +737,8 @@ impl Decodable for FileMap {
                src_hash,
                external_src: RefCell::new(ExternalSource::AbsentOk),
                lines: RefCell::new(lines),
-                multibyte_chars: RefCell::new(multibyte_chars)
+                multibyte_chars: RefCell::new(multibyte_chars),
+                non_narrow_chars: RefCell::new(non_narrow_chars)
            })
        })
    }
@ -709,6 +776,7 @@ impl FileMap {
            end_pos: Pos::from_usize(end_pos),
            lines: RefCell::new(Vec::new()),
            multibyte_chars: RefCell::new(Vec::new()),
+            non_narrow_chars: RefCell::new(Vec::new()),
        }
    }

@ -798,6 +866,23 @@ impl FileMap {
        self.multibyte_chars.borrow_mut().push(mbc);
    }

+    pub fn record_width(&self, pos: BytePos, ch: char) {
+        let width = match ch {
+            '\t' | '\n' =>
+                // Tabs will consume one column.
+                // Make newlines take one column so that displayed spans can point them.
+                1,
+            ch =>
+                // Assume control characters are zero width.
+                // FIXME: How can we decide between `width` and `width_cjk`?
+                unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0),
+        };
+        // Only record non-narrow characters.
+        if width != 1 {
+            self.non_narrow_chars.borrow_mut().push(NonNarrowChar::new(pos, width));
+        }
+    }
+
    pub fn is_real_file(&self) -> bool {
        !(self.name.starts_with("<") &&
          self.name.ends_with(">"))
@ -944,7 +1029,9 @@ pub struct Loc {
    /// The (1-based) line number
    pub line: usize,
    /// The (0-based) column offset
-    pub col: CharPos
+    pub col: CharPos,
+    /// The (0-based) column offset when displayed
+    pub col_display: usize,
 }

 /// A source code location used as the result of lookup_char_pos_adj
--- a/src/test/ui/codemap_tests/unicode.stderr
+++ b/src/test/ui/codemap_tests/unicode.stderr
@ -2,7 +2,7 @@ error: invalid ABI: expected one of [cdecl, stdcall, fastcall, vectorcall, thisc
  --> $DIR/unicode.rs:11:8
   |
 11 | extern "路濫狼á́́" fn foo() {}
-   |        ^^^^^^^^
+   |        ^^^^^^^^^

 error: aborting due to previous error

--- a/src/test/ui/codemap_tests/unicode_2.rs
+++ b/src/test/ui/codemap_tests/unicode_2.rs
@ -0,0 +1,17 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#![feature(non_ascii_idents)]
+
+fn main() {
+    let _ = ("a̐éö̲", 0u7);
+    let _ = ("아あ", 1i42);
+    let _ = a̐é;
+}
--- a/src/test/ui/codemap_tests/unicode_2.stderr
+++ b/src/test/ui/codemap_tests/unicode_2.stderr
@ -0,0 +1,24 @@
+error: invalid width `7` for integer literal
+  --> $DIR/unicode_2.rs:14:25
+   |
+14 |     let _ = ("a̐éö̲", 0u7);
+   |                     ^^^
+   |
+   = help: valid widths are 8, 16, 32, 64 and 128
+
+error: invalid width `42` for integer literal
+  --> $DIR/unicode_2.rs:15:20
+   |
+15 |     let _ = ("아あ", 1i42);
+   |                      ^^^^
+   |
+   = help: valid widths are 8, 16, 32, 64 and 128
+
+error[E0425]: cannot find value `a̐é` in this scope
+  --> $DIR/unicode_2.rs:16:13
+   |
+16 |     let _ = a̐é;
+   |             ^^ not found in this scope
+
+error: aborting due to 3 previous errors
+
--- a/src/test/ui/codemap_tests/unicode_3.rs
+++ b/src/test/ui/codemap_tests/unicode_3.rs
@ -0,0 +1,14 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+fn main() {
+    let s = "ZͨA͑ͦ͒͋ͤ͑̚L̄͑͋Ĝͨͥ̿͒̽̈́Oͥ͛ͭ!̏"; while true { break; }
+    println!("{}", s);
+}
--- a/src/test/ui/codemap_tests/unicode_3.stderr
+++ b/src/test/ui/codemap_tests/unicode_3.stderr
@ -0,0 +1,10 @@
+warning: denote infinite loops with `loop { ... }`
+  --> $DIR/unicode_3.rs:12:45
+   |
+12 |     let s = "ZͨA͑ͦ͒͋ͤ͑̚L̄͑͋Ĝͨͥ̿͒̽̈́Oͥ͛ͭ!̏"; while true { break; }
+   |                       ----------^^^^^^^^^^^
+   |                       |
+   |                       help: use `loop`
+   |
+   = note: #[warn(while_true)] on by default
+
--- a/src/test/ui/issue-44078.stderr
+++ b/src/test/ui/issue-44078.stderr
@ -2,7 +2,7 @@ error: unterminated double quote string
  --> $DIR/issue-44078.rs:12:8
   |
 12 |       "😊"";
-   |  ________^
+   |  _________^
 13 | | }
   | |__^