rustdoc: refactor rustdoc syntax highlighting for a more flexible API

Clients can now use the rustdoc syntax highlighter to classify tokens, then use that info to put together their own HTML (or whatever), rather than just having static HTML output.
2024-11-25 16:24:46 +00:00 · 2016-05-02 10:53:24 +12:00 · 2016-05-02 10:53:24 +12:00 · 25160af4b4
commit 25160af4b4
parent a4b0481d1c
1 changed files with 245 additions and 105 deletions
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@ -1,4 +1,4 @@
-// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@ -8,16 +8,26 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

-//! Basic html highlighting functionality
+//! Basic syntax highlighting functionality.
 //!
 //! This module uses libsyntax's lexer to provide token-based highlighting for
 //! the HTML documentation generated by rustdoc.
+//!
+//! If you just want syntax highlighting for a Rust program, then you can use
+//! the `render_inner_with_highlighting` or `render_with_highlighting`
+//! functions. For more advanced use cases (if you want to supply your own css
+//! classes or control how the HTML is generated, or even generate something
+//! other than HTML), then you should implement the `Writer` trait and use a
+//! `Classifier`.

 use html::escape::Escape;

+use std::fmt::Display;
 use std::io;
 use std::io::prelude::*;
-use syntax::parse::lexer::{self, Reader};
+
+use syntax::codemap::{CodeMap, Span};
+use syntax::parse::lexer::{self, Reader, TokenAndSpan};
 use syntax::parse::token;
 use syntax::parse;

@ -29,11 +39,13 @@ pub fn render_with_highlighting(src: &str, class: Option<&str>, id: Option<&str>

    let mut out = Vec::new();
    write_header(class, id, &mut out).unwrap();
-    if let Err(_) = write_source(&sess,
-                                 lexer::StringReader::new(&sess.span_diagnostic, fm),
-                                 &mut out) {
-        return format!("<pre>{}</pre>", src)
+
+    let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm),
+                                         sess.codemap());
+    if let Err(_) = classifier.write_source(&mut out) {
+        return format!("<pre>{}</pre>", src);
    }
+
    write_footer(&mut out).unwrap();
    String::from_utf8_lossy(&out[..]).into_owned()
 }
@ -46,84 +58,187 @@ pub fn render_inner_with_highlighting(src: &str) -> io::Result<String> {
    let fm = sess.codemap().new_filemap("<stdin>".to_string(), src.to_string());

    let mut out = Vec::new();
-    write_source(&sess,
-                 lexer::StringReader::new(&sess.span_diagnostic, fm),
-                 &mut out)?;
-    Ok(String::from_utf8_lossy(&out[..]).into_owned())
+    let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm),
+                                         sess.codemap());
+    classifier.write_source(&mut out)?;
+
+    Ok(String::from_utf8_lossy(&out).into_owned())
 }

-/// Exhausts the `lexer` writing the output into `out`.
+/// Processes a program (nested in the internal `lexer`), classifying strings of
+/// text by highlighting category (`Class`). Calls out to a `Writer` to write
+/// each span of text in sequence.
+pub struct Classifier<'a> {
+    lexer: lexer::StringReader<'a>,
+    codemap: &'a CodeMap,
+
+    // State of the classifier.
+    in_attribute: bool,
+    in_macro: bool,
+    in_macro_nonterminal: bool,
+}
+
+/// How a span of text is classified. Mostly corresponds to token kinds.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Class {
+    None,
+    Comment,
+    DocComment,
+    Attribute,
+    KeyWord,
+    // Keywords that do pointer/reference stuff.
+    RefKeyWord,
+    Self_,
+    Op,
+    Macro,
+    MacroNonTerminal,
+    String,
+    Number,
+    Bool,
+    Ident,
+    Lifetime,
+    PreludeTy,
+    PreludeVal,
+}
+
+/// Trait that controls writing the output of syntax highlighting. Users should
+/// implement this trait to customise writing output.
 ///
-/// The general structure for this method is to iterate over each token,
-/// possibly giving it an HTML span with a class specifying what flavor of token
-/// it's used. All source code emission is done as slices from the source map,
-/// not from the tokens themselves, in order to stay true to the original
-/// source.
-fn write_source(sess: &parse::ParseSess,
-                mut lexer: lexer::StringReader,
-                out: &mut Write)
-                -> io::Result<()> {
-    let mut is_attribute = false;
-    let mut is_macro = false;
-    let mut is_macro_nonterminal = false;
-    loop {
-        let next = match lexer.try_next_token() {
-            Ok(tok) => tok,
-            Err(_) => {
-                lexer.emit_fatal_errors();
-                lexer.span_diagnostic.struct_warn("Backing out of syntax highlighting")
-                                     .note("You probably did not intend to render this \
-                                            as a rust code-block")
-                                     .emit();
-                return Err(io::Error::new(io::ErrorKind::Other, ""))
-            },
-        };
+/// The classifier will call into the `Writer` implementation as it finds spans
+/// of text to highlight. Exactly how that text should be highlighted is up to
+/// the implementation.
+pub trait Writer {
+    /// Called when we start processing a span of text that should be highlighted.
+    /// The `Class` argument specifies how it should be highlighted.
+    fn enter_span(&mut self, Class) -> io::Result<()>;

-        let snip = |sp| sess.codemap().span_to_snippet(sp).unwrap();
+    /// Called at the end of a span of highlighted text.
+    fn exit_span(&mut self) -> io::Result<()>;

-        if next.tok == token::Eof { break }
+    /// Called for a span of text, usually, but not always, a single token. If
+    /// the string of text (`T`) does correspond to a token, then the token will
+    /// also be passed. If the text should be highlighted differently from the
+    /// surrounding text, then the `Class` argument will be a value other than
+    /// `None`.
+    /// The following sequences of callbacks are equivalent:
+    /// ```plain
+    ///     enter_span(Foo), string("text", None), exit_span()
+    ///     string("text", Foo)
+    /// ```
+    /// The latter can be thought of as a shorthand for the former, which is
+    /// more flexible.
+    fn string<T: Display>(&mut self, T, Class, Option<&TokenAndSpan>) -> io::Result<()>;
+}

-        let klass = match next.tok {
-            token::Whitespace => {
-                write!(out, "{}", Escape(&snip(next.sp)))?;
-                continue
-            },
-            token::Comment => {
-                write!(out, "<span class='comment'>{}</span>",
-                       Escape(&snip(next.sp)))?;
-                continue
-            },
+// Implement `Writer` for anything that can be written to; this just implements
+// the default rustdoc behaviour.
+impl<U: Write> Writer for U {
+    fn string<T: Display>(&mut self,
+                          text: T,
+                          klass: Class,
+                          _tas: Option<&TokenAndSpan>)
+                          -> io::Result<()> {
+        match klass {
+            Class::None => write!(self, "{}", text),
+            klass => write!(self, "<span class='{}'>{}</span>", klass.rustdoc_class(), text),
+        }
+    }
+
+    fn enter_span(&mut self, klass: Class) -> io::Result<()> {
+        write!(self, "<span class='{}'>", klass.rustdoc_class())
+    }
+
+    fn exit_span(&mut self) -> io::Result<()> {
+        write!(self, "</span>")
+    }
+}
+
+impl<'a> Classifier<'a> {
+    pub fn new(lexer: lexer::StringReader<'a>, codemap: &'a CodeMap) -> Classifier<'a> {
+        Classifier {
+            lexer: lexer,
+            codemap: codemap,
+            in_attribute: false,
+            in_macro: false,
+            in_macro_nonterminal: false,
+        }
+    }
+
+    /// Exhausts the `lexer` writing the output into `out`.
+    ///
+    /// The general structure for this method is to iterate over each token,
+    /// possibly giving it an HTML span with a class specifying what flavor of token
+    /// is used. All source code emission is done as slices from the source map,
+    /// not from the tokens themselves, in order to stay true to the original
+    /// source.
+    pub fn write_source<W: Writer>(&mut self,
+                                   out: &mut W)
+                                   -> io::Result<()> {
+        loop {
+            let next = match self.lexer.try_next_token() {
+                Ok(tas) => tas,
+                Err(_) => {
+                    self.lexer.emit_fatal_errors();
+                    self.lexer.span_diagnostic.struct_warn("Backing out of syntax highlighting")
+                                              .note("You probably did not intend to render this \
+                                                     as a rust code-block")
+                                              .emit();
+                    return Err(io::Error::new(io::ErrorKind::Other, ""));
+                }
+            };
+
+            if next.tok == token::Eof {
+                break;
+            }
+
+            self.write_token(out, next)?;
+        }
+
+        Ok(())
+    }
+
+    // Handles an individual token from the lexer.
+    fn write_token<W: Writer>(&mut self,
+                              out: &mut W,
+                              tas: TokenAndSpan)
+                              -> io::Result<()> {
+        let klass = match tas.tok {
            token::Shebang(s) => {
-                write!(out, "{}", Escape(&s.as_str()))?;
-                continue
+                out.string(Escape(&s.as_str()), Class::None, Some(&tas))?;
+                return Ok(());
            },
+
+            token::Whitespace => Class::None,
+            token::Comment => Class::Comment,
+            token::DocComment(..) => Class::DocComment,
+
            // If this '&' token is directly adjacent to another token, assume
            // that it's the address-of operator instead of the and-operator.
-            // This allows us to give all pointers their own class (`Box` and
-            // `@` are below).
-            token::BinOp(token::And) if lexer.peek().sp.lo == next.sp.hi => "kw-2",
-            token::At | token::Tilde => "kw-2",
+            token::BinOp(token::And) if self.lexer.peek().sp.lo == tas.sp.hi => Class::RefKeyWord,

-            // consider this as part of a macro invocation if there was a
-            // leading identifier
-            token::Not if is_macro => { is_macro = false; "macro" }
+            // Consider this as part of a macro invocation if there was a
+            // leading identifier.
+            token::Not if self.in_macro => {
+                self.in_macro = false;
+                Class::Macro
+            }

-            // operators
+            // Operators.
            token::Eq | token::Lt | token::Le | token::EqEq | token::Ne | token::Ge | token::Gt |
                token::AndAnd | token::OrOr | token::Not | token::BinOp(..) | token::RArrow |
-                token::BinOpEq(..) | token::FatArrow => "op",
+                token::BinOpEq(..) | token::FatArrow => Class::Op,

-            // miscellaneous, no highlighting
+            // Miscellaneous, no highlighting.
            token::Dot | token::DotDot | token::DotDotDot | token::Comma | token::Semi |
                token::Colon | token::ModSep | token::LArrow | token::OpenDelim(_) |
                token::CloseDelim(token::Brace) | token::CloseDelim(token::Paren) |
-                token::Question => "",
+                token::Question => Class::None,
            token::Dollar => {
-                if lexer.peek().tok.is_ident() {
-                    is_macro_nonterminal = true;
-                    "macro-nonterminal"
+                if self.lexer.peek().tok.is_ident() {
+                    self.in_macro_nonterminal = true;
+                    Class::MacroNonTerminal
                } else {
-                    ""
+                    Class::None
                }
            }

@ -132,78 +247,103 @@ fn write_source(sess: &parse::ParseSess,
            // seen, so skip out early. Down below we terminate the attribute
            // span when we see the ']'.
            token::Pound => {
-                is_attribute = true;
-                write!(out, r"<span class='attribute'>#")?;
-                continue
+                self.in_attribute = true;
+                out.enter_span(Class::Attribute)?;
+                out.string("#", Class::None, None)?;
+                return Ok(());
            }
            token::CloseDelim(token::Bracket) => {
-                if is_attribute {
-                    is_attribute = false;
-                    write!(out, "]</span>")?;
-                    continue
+                if self.in_attribute {
+                    self.in_attribute = false;
+                    out.string("]", Class::None, None)?;
+                    out.exit_span()?;
+                    return Ok(());
                } else {
-                    ""
+                    Class::None
                }
            }

            token::Literal(lit, _suf) => {
                match lit {
-                    // text literals
+                    // Text literals.
                    token::Byte(..) | token::Char(..) |
                        token::ByteStr(..) | token::ByteStrRaw(..) |
-                        token::Str_(..) | token::StrRaw(..) => "string",
+                        token::Str_(..) | token::StrRaw(..) => Class::String,

-                    // number literals
-                    token::Integer(..) | token::Float(..) => "number",
+                    // Number literals.
+                    token::Integer(..) | token::Float(..) => Class::Number,
                }
            }

-            // keywords are also included in the identifier set
+            // Keywords are also included in the identifier set.
            token::Ident(ident) => {
                match &*ident.name.as_str() {
-                    "ref" | "mut" => "kw-2",
+                    "ref" | "mut" => Class::RefKeyWord,

-                    "self" => "self",
-                    "false" | "true" => "boolval",
+                    "self" |"Self" => Class::Self_,
+                    "false" | "true" => Class::Bool,

-                    "Option" | "Result" => "prelude-ty",
-                    "Some" | "None" | "Ok" | "Err" => "prelude-val",
+                    "Option" | "Result" => Class::PreludeTy,
+                    "Some" | "None" | "Ok" | "Err" => Class::PreludeVal,

-                    _ if next.tok.is_any_keyword() => "kw",
+                    _ if tas.tok.is_any_keyword() => Class::KeyWord,
                    _ => {
-                        if is_macro_nonterminal {
-                            is_macro_nonterminal = false;
-                            "macro-nonterminal"
-                        } else if lexer.peek().tok == token::Not {
-                            is_macro = true;
-                            "macro"
+                        if self.in_macro_nonterminal {
+                            self.in_macro_nonterminal = false;
+                            Class::MacroNonTerminal
+                        } else if self.lexer.peek().tok == token::Not {
+                            self.in_macro = true;
+                            Class::Macro
                        } else {
-                            "ident"
+                            Class::Ident
                        }
                    }
                }
            }

-            // Special macro vars are like keywords
-            token::SpecialVarNt(_) => "kw-2",
+            // Special macro vars are like keywords.
+            token::SpecialVarNt(_) => Class::KeyWord,
+
+            token::Lifetime(..) => Class::Lifetime,

-            token::Lifetime(..) => "lifetime",
-            token::DocComment(..) => "doccomment",
            token::Underscore | token::Eof | token::Interpolated(..) |
-                token::MatchNt(..) | token::SubstNt(..) => "",
+            token::MatchNt(..) | token::SubstNt(..) | token::Tilde | token::At => Class::None,
        };

-        // as mentioned above, use the original source code instead of
-        // stringifying this token
-        let snip = sess.codemap().span_to_snippet(next.sp).unwrap();
-        if klass == "" {
-            write!(out, "{}", Escape(&snip))?;
-        } else {
-            write!(out, "<span class='{}'>{}</span>", klass, Escape(&snip))?;
-        }
+        // Anything that didn't return above is the simple case where the
+        // class just spans a single token, so we can use the `string` method.
+        out.string(Escape(&self.snip(tas.sp)), klass, Some(&tas))
    }

-    Ok(())
+    // Helper function to get a snippet from the codemap.
+    fn snip(&self, sp: Span) -> String {
+        self.codemap.span_to_snippet(sp).unwrap()
+    }
+}
+
+impl Class {
+    /// Returns the CSS class expected by rustdoc for each `Class` (`""` for `Class::None`).
+    pub fn rustdoc_class(self) -> &'static str {
+        match self {
+            Class::None => "",
+            Class::Comment => "comment",
+            Class::DocComment => "doccomment",
+            Class::Attribute => "attribute",
+            Class::KeyWord => "kw",
+            Class::RefKeyWord => "kw-2",
+            Class::Self_ => "self",
+            Class::Op => "op",
+            Class::Macro => "macro",
+            Class::MacroNonTerminal => "macro-nonterminal",
+            Class::String => "string",
+            Class::Number => "number",
+            Class::Bool => "boolval", // Keep the class name emitted before this refactor.
+            Class::Ident => "ident",
+            Class::Lifetime => "lifetime",
+            Class::PreludeTy => "prelude-ty",
+            Class::PreludeVal => "prelude-val",
+        }
+    }
 }

 fn write_header(class: Option<&str>,