Improve markdown parsing for the doc lint

2024-11-25 08:13:41 +00:00 · 2016-05-26 22:53:38 +02:00 · 2016-05-26 22:53:38 +02:00 · ac0bb4126c
commit ac0bb4126c
parent 71b41b6e01
2 changed files with 128 additions and 77 deletions
--- a/clippy_lints/src/doc.rs
+++ b/clippy_lints/src/doc.rs
@ -51,6 +51,8 @@ impl EarlyLintPass for Doc {
 }

 pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [ast::Attribute]) {
+    let mut docs = vec![];
+
    let mut in_multiline = false;
    for attr in attrs {
        if attr.node.is_sugared_doc {
@ -66,37 +68,20 @@ pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [a
                        in_multiline = !in_multiline;
                    }
                    if !in_multiline {
-                        check_doc(cx, valid_idents, real_doc, span);
+                        docs.push((real_doc, span));
                    }
                }
            }
        }
    }
-}

-macro_rules! jump_to {
-    // Get the next character’s first byte UTF-8 friendlyly.
-    (@next_char, $chars: expr, $len: expr) => {{
-        if let Some(&(pos, _)) = $chars.peek() {
-            pos
-        } else {
-            $len
-        }
-    }};
-
-    // Jump to the next `$c`. If no such character is found, give up.
-    ($chars: expr, $c: expr, $len: expr) => {{
-        if $chars.find(|&(_, c)| c == $c).is_some() {
-            jump_to!(@next_char, $chars, $len)
-        }
-        else {
-            return;
-        }
-    }};
+    for (doc, span) in docs {
+        let _ = check_doc(cx, valid_idents, doc, span);
+    }
 }

 #[allow(while_let_loop)] // #362
-pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Span) {
+pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Span) -> Result<(), ()> {
    // In markdown, `_` can be used to emphasize something, or, is a raw `_` depending on context.
    // There really is no markdown specification that would disambiguate this properly. This is
    // what GitHub and Rustdoc do:
@ -108,8 +93,8 @@ pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Sp
    // (_baz_)             → (<em>baz</em>)
    // foo _ bar _ baz     → foo _ bar _ baz

-    /// Character that can appear in a word
-    fn is_word_char(c: char) -> bool {
+    /// Character that can appear in a path
+    fn is_path_char(c: char) -> bool {
        match c {
            t if t.is_alphanumeric() => true,
            ':' | '_' => true,
@ -117,81 +102,140 @@ pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Sp
        }
    }

-    #[allow(cast_possible_truncation)]
-    fn word_span(mut span: Span, begin: usize, end: usize) -> Span {
-        debug_assert_eq!(end as u32 as usize, end);
-        debug_assert_eq!(begin as u32 as usize, begin);
-        span.hi = span.lo + BytePos(end as u32);
-        span.lo = span.lo + BytePos(begin as u32);
-        span
+    #[derive(Clone, Debug)]
+    struct Parser<'a> {
+        link: bool,
+        line: &'a str,
+        span: Span,
+        current_word_begin: usize,
+        new_line: bool,
+        pos: usize,
    }

-    let mut new_line = true;
-    let len = doc.len();
-    let mut chars = doc.char_indices().peekable();
-    let mut current_word_begin = 0;
+    impl<'a> Parser<'a> {
+        fn advance_begin(&mut self) {
+            self.current_word_begin = self.pos;
+        }
+
+        fn peek(&self) -> Option<char> {
+            self.line[self.pos..].chars().next()
+        }
+
+        fn jump_to(&mut self, n: char) -> Result<(), ()> {
+            while let Some(c) = self.next() {
+                if c == n {
+                    self.advance_begin();
+                    return Ok(());
+                }
+            }
+
+            return Err(());
+        }
+
+        fn put_back(&mut self, c: char) {
+            self.pos -= c.len_utf8();
+        }
+
+        #[allow(cast_possible_truncation)]
+        fn word(&self) -> (&'a str, Span) {
+            let begin = self.current_word_begin;
+            let end = self.pos;
+
+            debug_assert_eq!(end as u32 as usize, end);
+            debug_assert_eq!(begin as u32 as usize, begin);
+
+            let mut span = self.span;
+            span.hi = span.lo + BytePos(end as u32);
+            span.lo = span.lo + BytePos(begin as u32);
+
+            (&self.line[begin..end], span)
+        }
+    }
+
+    impl<'a> Iterator for Parser<'a> {
+        type Item = char;
+
+        fn next(&mut self) -> Option<char> {
+            let mut chars = self.line[self.pos..].chars();
+            let c = chars.next();
+
+            if let Some(c) = c {
+                self.pos += c.len_utf8();
+            } else {
+                // TODO: new line
+            }
+
+            c
+        }
+    }
+
+    let mut parser = Parser {
+        link: false,
+        line: doc,
+        span: span,
+        current_word_begin: 0,
+        new_line: true,
+        pos: 0,
+    };
+
    loop {
-        match chars.next() {
-            Some((_, c)) => {
+        match parser.next() {
+            Some(c) => {
                match c {
                    '#' if new_line => { // don’t warn on titles
-                        current_word_begin = jump_to!(chars, '\n', len);
+                        try!(parser.jump_to('\n'));
                    }
                    '`' => {
-                        current_word_begin = jump_to!(chars, '`', len);
+                        try!(parser.jump_to('`'));
                    }
                    '[' => {
-                        let end = jump_to!(chars, ']', len);
-                        let link_text = &doc[current_word_begin + 1..end];
-                        let word_span = word_span(span, current_word_begin + 1, end + 1);
-
-                        match chars.peek() {
-                            Some(&(_, c)) => {
-                                // Trying to parse a link. Let’s ignore the link.
-
-                                // FIXME: how does markdown handles such link?
-                                // https://en.wikipedia.org/w/index.php?title=)
-                                match c {
-                                    '(' => { // inline link
-                                        current_word_begin = jump_to!(chars, ')', len);
-                                        check_doc(cx, valid_idents, link_text, word_span);
-                                    }
-                                    '[' => { // reference link
-                                        current_word_begin = jump_to!(chars, ']', len);
-                                        check_doc(cx, valid_idents, link_text, word_span);
-                                    }
-                                    ':' => { // reference link
-                                        current_word_begin = jump_to!(chars, '\n', len);
-                                    }
-                                    _ => { // automatic reference link
-                                        current_word_begin = jump_to!(@next_char, chars, len);
-                                        check_doc(cx, valid_idents, link_text, word_span);
-                                    }
+                        // Check for a reference definition `[foo]:` at the beginning of a line
+                        let mut link = true;
+                        if parser.new_line {
+                            let mut lookup_parser = parser.clone();
+                            if let Some(_) = lookup_parser.find(|&c| c == ']') {
+                                if let Some(':') = lookup_parser.next() {
+                                    try!(lookup_parser.jump_to(')'));
+                                    parser = lookup_parser;
+                                    link = false;
                                }
                            }
-                            None => return,
+                        }
+
+                        parser.advance_begin();
+                        parser.link = link;
+                    }
+                    ']' if parser.link => {
+                        parser.link = false;
+
+                        match parser.peek() {
+                            Some('(') => try!(parser.jump_to(')')),
+                            Some('[') => try!(parser.jump_to(']')),
+                            Some(_) => continue,
+                            None => return Err(()),
                        }
                    }
-                    // anything that’s neither alphanumeric nor '_' is not part of an ident anyway
-                    c if !c.is_alphanumeric() && c != '_' => {
-                        current_word_begin = jump_to!(@next_char, chars, len);
+                    c if !is_path_char(c) => {
+                        parser.advance_begin();
                    }
                    _ => {
-                        let end = match chars.find(|&(_, c)| !is_word_char(c)) {
-                            Some((end, _)) => end,
-                            None => len,
-                        };
-                        let word_span = word_span(span, current_word_begin, end);
-                        check_word(cx, valid_idents, &doc[current_word_begin..end], word_span);
-                        current_word_begin = jump_to!(@next_char, chars, len);
+                        if let Some(c) = parser.find(|&c| !is_path_char(c)) {
+                            parser.put_back(c);
+                        }
+
+                        let (word, span) = parser.word();
+                        check_word(cx, valid_idents, word, span);
+                        parser.advance_begin();
                    }
                }

-                new_line = c == '\n' || (new_line && c.is_whitespace());
+                parser.new_line = c == '\n' || (parser.new_line && c.is_whitespace());
            }
            None => break,
        }
    }
+
+    Ok(())
 }

 fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span) {
--- a/tests/compile-fail/doc.rs
+++ b/tests/compile-fail/doc.rs
@ -12,6 +12,8 @@
 /// Markdown is _weird_. I mean _really weird_.  This \_ is ok. So is `_`. But not Foo::some_fun
 //~^ ERROR: you should put `Foo::some_fun` between ticks
 /// which should be reported only once despite being __doubly bad__.
+/// Here be ::is::a::global:path.
+//~^ ERROR: you should put `is::a::global:path` between ticks
 /// be_sure_we_got_to_the_end_of_it
 //~^ ERROR: you should put `be_sure_we_got_to_the_end_of_it` between ticks
 fn foo_bar() {
@ -141,3 +143,8 @@ fn issue900() {
 //~^ ERROR: you should put `be_sure_we_got_to_the_end_of_it` between ticks
 fn issue883() {
 }
+
+/// `foo_bar
+/// baz_quz`
+fn multiline() {
+}