From 1f95a91c24d114637131a635fe71dfd1d457fe30 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Fri, 21 Aug 2020 16:38:35 +0200 Subject: [PATCH 1/2] Remove dependency on StringReader from rustdoc highlighter rustc_lexer is the lossless lexer, which is a better fit for approximate syntax highlighting. As a side-effect, we can now syntax-highlight even broken code. --- src/librustdoc/html/highlight.rs | 622 ++++++++++------------- src/librustdoc/html/highlight/tests.rs | 24 +- src/test/rustdoc/bad-codeblock-syntax.rs | 12 +- 3 files changed, 268 insertions(+), 390 deletions(-) diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index d4302d0cb54..26557fc1cb7 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -7,18 +7,12 @@ use crate::html::escape::Escape; -use std::fmt::Display; -use std::io; -use std::io::prelude::*; +use std::fmt::{Display, Write}; +use std::iter::Peekable; -use rustc_ast::token::{self, Token}; -use rustc_data_structures::sync::Lrc; -use rustc_parse::lexer; -use rustc_session::parse::ParseSess; -use rustc_span::hygiene::SyntaxContext; -use rustc_span::source_map::SourceMap; -use rustc_span::symbol::{kw, sym}; -use rustc_span::{BytePos, FileName, SourceFile, Span}; +use rustc_lexer::{LiteralKind, TokenKind}; +use rustc_span::symbol::Ident; +use rustc_span::with_default_session_globals; /// Highlights `src`, returning the HTML output. pub fn render_with_highlighting( @@ -28,7 +22,7 @@ pub fn render_with_highlighting( tooltip: Option<(&str, &str)>, ) -> String { debug!("highlighting: ================\n{}\n==============", src); - let mut out = Vec::new(); + let mut out = String::with_capacity(src.len()); if let Some((tooltip, class)) = tooltip { write!( out, @@ -39,60 +33,30 @@ pub fn render_with_highlighting( .unwrap(); } - let sess = ParseSess::with_silent_emitter(); - let source_file = sess - .source_map() - .new_source_file(FileName::Custom(String::from("rustdoc-highlighting")), src); + write_header(&mut out, class); + write_code(&mut out, &src); + write_footer(&mut out, playground_button); - let classifier_source_file = Lrc::clone(&source_file); - let highlight_result = rustc_driver::catch_fatal_errors(|| { - let mut classifier = Classifier::new(&sess, classifier_source_file); - - let mut highlighted_source = vec![]; - if classifier.write_source(&mut highlighted_source).is_err() { - Err(()) - } else { - Ok(String::from_utf8_lossy(&highlighted_source).into_owned()) - } - }) - .unwrap_or(Err(())); - - match highlight_result { - Ok(highlighted_source) => { - write_header(class, &mut out).unwrap(); - write!(out, "{}", highlighted_source).unwrap(); - write_footer(&mut out, playground_button).unwrap(); - } - Err(()) => { - // Get the source back out of the source map to avoid a copy in the happy path. - let span = - Span::new(BytePos(0), BytePos(source_file.byte_length()), SyntaxContext::root()); - let src = sess - .source_map() - .span_to_snippet(span) - .expect("could not retrieve snippet from artificial source file"); - - // If errors are encountered while trying to highlight, just emit - // the unhighlighted source. - write!(out, "
{}
", Escape(&src)).unwrap(); - } - } - - String::from_utf8_lossy(&out[..]).into_owned() + out } -/// Processes a program (nested in the internal `lexer`), classifying strings of -/// text by highlighting category (`Class`). Calls out to a `Writer` to write -/// each span of text in sequence. -struct Classifier<'sess> { - lexer: lexer::StringReader<'sess>, - peek_token: Option, - source_map: &'sess SourceMap, +fn write_header(out: &mut String, class: Option<&str>) { + write!(out, "
\n", class.unwrap_or_default())
+        .unwrap()
+}
 
-    // State of the classifier.
-    in_attribute: bool,
-    in_macro: bool,
-    in_macro_nonterminal: bool,
+fn write_code(out: &mut String, src: &str) {
+    Classifier::new(src).highlight(&mut |highlight| {
+        match highlight {
+            Highlight::Token { text, class } => string(out, Escape(text), class),
+            Highlight::EnterSpan { class } => enter_span(out, class),
+            Highlight::ExitSpan => exit_span(out),
+        };
+    });
+}
+
+fn write_footer(out: &mut String, playground_button: Option<&str>) {
+    write!(out, "
{}
\n", playground_button.unwrap_or_default()).unwrap() } /// How a span of text is classified. Mostly corresponds to token kinds. @@ -119,306 +83,9 @@ enum Class { QuestionMark, } -/// Trait that controls writing the output of syntax highlighting. Users should -/// implement this trait to customize writing output. -/// -/// The classifier will call into the `Writer` implementation as it finds spans -/// of text to highlight. Exactly how that text should be highlighted is up to -/// the implementation. -trait Writer { - /// Called when we start processing a span of text that should be highlighted. - /// The `Class` argument specifies how it should be highlighted. - fn enter_span(&mut self, _: Class) -> io::Result<()>; - - /// Called at the end of a span of highlighted text. - fn exit_span(&mut self) -> io::Result<()>; - - /// Called for a span of text. If the text should be highlighted differently from the - /// surrounding text, then the `Class` argument will be a value other than `None`. - /// - /// The following sequences of callbacks are equivalent: - /// ```plain - /// enter_span(Foo), string("text", None), exit_span() - /// string("text", Foo) - /// ``` - /// The latter can be thought of as a shorthand for the former, which is - /// more flexible. - fn string(&mut self, text: T, klass: Class) -> io::Result<()>; -} - -// Implement `Writer` for anything that can be written to, this just implements -// the default rustdoc behaviour. -impl Writer for U { - fn string(&mut self, text: T, klass: Class) -> io::Result<()> { - match klass { - Class::None => write!(self, "{}", text), - klass => write!(self, "{}", klass.rustdoc_class(), text), - } - } - - fn enter_span(&mut self, klass: Class) -> io::Result<()> { - write!(self, "", klass.rustdoc_class()) - } - - fn exit_span(&mut self) -> io::Result<()> { - write!(self, "") - } -} - -#[derive(Debug)] -enum HighlightError { - LexError, - IoError(io::Error), -} - -impl From for HighlightError { - fn from(err: io::Error) -> Self { - HighlightError::IoError(err) - } -} - -impl<'sess> Classifier<'sess> { - fn new(sess: &ParseSess, source_file: Lrc) -> Classifier<'_> { - let lexer = lexer::StringReader::new(sess, source_file, None); - - Classifier { - lexer, - peek_token: None, - source_map: sess.source_map(), - in_attribute: false, - in_macro: false, - in_macro_nonterminal: false, - } - } - - /// Gets the next token out of the lexer. - fn try_next_token(&mut self) -> Result { - if let Some(token) = self.peek_token.take() { - return Ok(token); - } - let token = self.lexer.next_token(); - if let token::Unknown(..) = &token.kind { - return Err(HighlightError::LexError); - } - Ok(token) - } - - fn peek(&mut self) -> Result<&Token, HighlightError> { - if self.peek_token.is_none() { - let token = self.lexer.next_token(); - if let token::Unknown(..) = &token.kind { - return Err(HighlightError::LexError); - } - self.peek_token = Some(token); - } - Ok(self.peek_token.as_ref().unwrap()) - } - - /// Exhausts the `lexer` writing the output into `out`. - /// - /// The general structure for this method is to iterate over each token, - /// possibly giving it an HTML span with a class specifying what flavor of token - /// is used. All source code emission is done as slices from the source map, - /// not from the tokens themselves, in order to stay true to the original - /// source. 
- fn write_source(&mut self, out: &mut W) -> Result<(), HighlightError> { - loop { - let mut next = self.try_next_token()?; - if next == token::Eof { - break; - } - - // Glue any tokens that need to be glued. - if let Some(joint) = next.glue(self.peek()?) { - next = joint; - let _ = self.try_next_token()?; - } - - self.write_token(out, next)?; - } - - Ok(()) - } - - // Handles an individual token from the lexer. - fn write_token(&mut self, out: &mut W, token: Token) -> Result<(), HighlightError> { - let klass = match token.kind { - token::Shebang(s) => { - out.string(Escape(&s.as_str()), Class::None)?; - return Ok(()); - } - - token::Whitespace | token::Unknown(..) => Class::None, - token::Comment => Class::Comment, - token::DocComment(..) => Class::DocComment, - - // If this '&' or '*' token is followed by a non-whitespace token, assume that it's the - // reference or dereference operator or a reference or pointer type, instead of the - // bit-and or multiplication operator. - token::BinOp(token::And | token::Star) if self.peek()? != &token::Whitespace => { - Class::RefKeyWord - } - - // Consider this as part of a macro invocation if there was a - // leading identifier. - token::Not if self.in_macro => { - self.in_macro = false; - Class::Macro - } - - // Operators. - token::Eq - | token::Lt - | token::Le - | token::EqEq - | token::Ne - | token::Ge - | token::Gt - | token::AndAnd - | token::OrOr - | token::Not - | token::BinOp(..) - | token::RArrow - | token::BinOpEq(..) - | token::FatArrow => Class::Op, - - // Miscellaneous, no highlighting. - token::Dot - | token::DotDot - | token::DotDotDot - | token::DotDotEq - | token::Comma - | token::Semi - | token::Colon - | token::ModSep - | token::LArrow - | token::OpenDelim(_) - | token::CloseDelim(token::Brace | token::Paren | token::NoDelim) => Class::None, - - token::Question => Class::QuestionMark, - - token::Dollar => { - if self.peek()?.is_ident() { - self.in_macro_nonterminal = true; - Class::MacroNonTerminal - } else { - Class::None - } - } - - // This might be the start of an attribute. We're going to want to - // continue highlighting it as an attribute until the ending ']' is - // seen, so skip out early. Down below we terminate the attribute - // span when we see the ']'. - token::Pound => { - // We can't be sure that our # begins an attribute (it could - // just be appearing in a macro) until we read either `#![` or - // `#[` from the input stream. - // - // We don't want to start highlighting as an attribute until - // we're confident there is going to be a ] coming up, as - // otherwise # tokens in macros highlight the rest of the input - // as an attribute. - - // Case 1: #![inner_attribute] - if self.peek()? == &token::Not { - self.try_next_token()?; // NOTE: consumes `!` token! - if self.peek()? == &token::OpenDelim(token::Bracket) { - self.in_attribute = true; - out.enter_span(Class::Attribute)?; - } - out.string("#", Class::None)?; - out.string("!", Class::None)?; - return Ok(()); - } - - // Case 2: #[outer_attribute] - if self.peek()? == &token::OpenDelim(token::Bracket) { - self.in_attribute = true; - out.enter_span(Class::Attribute)?; - } - out.string("#", Class::None)?; - return Ok(()); - } - token::CloseDelim(token::Bracket) => { - if self.in_attribute { - self.in_attribute = false; - out.string("]", Class::None)?; - out.exit_span()?; - return Ok(()); - } else { - Class::None - } - } - - token::Literal(lit) => { - match lit.kind { - // Text literals. 
- token::Byte - | token::Char - | token::Err - | token::ByteStr - | token::ByteStrRaw(..) - | token::Str - | token::StrRaw(..) => Class::String, - - // Number literals. - token::Integer | token::Float => Class::Number, - - token::Bool => panic!("literal token contains `Lit::Bool`"), - } - } - - // Keywords are also included in the identifier set. - token::Ident(name, is_raw) => match name { - kw::Ref | kw::Mut if !is_raw => Class::RefKeyWord, - - kw::SelfLower | kw::SelfUpper => Class::Self_, - kw::False | kw::True if !is_raw => Class::Bool, - - sym::Option | sym::Result => Class::PreludeTy, - sym::Some | sym::None | sym::Ok | sym::Err => Class::PreludeVal, - - _ if token.is_reserved_ident() => Class::KeyWord, - - _ => { - if self.in_macro_nonterminal { - self.in_macro_nonterminal = false; - Class::MacroNonTerminal - } else if self.peek()? == &token::Not { - self.in_macro = true; - Class::Macro - } else { - Class::Ident - } - } - }, - - token::Lifetime(..) => Class::Lifetime, - - token::Eof - | token::Interpolated(..) - | token::Tilde - | token::At - | token::SingleQuote => Class::None, - }; - - // Anything that didn't return above is the simple case where we the - // class just spans a single token, so we can use the `string` method. - out.string(Escape(&self.snip(token.span)), klass)?; - - Ok(()) - } - - // Helper function to get a snippet from the source_map. - fn snip(&self, sp: Span) -> String { - self.source_map.span_to_snippet(sp).unwrap() - } -} - impl Class { /// Returns the css class expected by rustdoc for each `Class`. - fn rustdoc_class(self) -> &'static str { + fn as_html(self) -> &'static str { match self { Class::None => "", Class::Comment => "comment", @@ -442,12 +109,239 @@ impl Class { } } -fn write_header(class: Option<&str>, out: &mut dyn Write) -> io::Result<()> { - write!(out, "
\n", class.unwrap_or(""))
+enum Highlight<'a> {
+    Token { text: &'a str, class: Class },
+    EnterSpan { class: Class },
+    ExitSpan,
 }
 
-fn write_footer(out: &mut dyn Write, playground_button: Option<&str>) -> io::Result<()> {
-    write!(out, "
{}
\n", if let Some(button) = playground_button { button } else { "" }) +struct TokenIter<'a> { + src: &'a str, +} + +impl Iterator for TokenIter<'a> { + type Item = (TokenKind, &'a str); + fn next(&mut self) -> Option<(TokenKind, &'a str)> { + if self.src.is_empty() { + return None; + } + let token = rustc_lexer::first_token(self.src); + let (text, rest) = self.src.split_at(token.len); + self.src = rest; + Some((token.kind, text)) + } +} + +/// Processes program tokens, classifying strings of text by highlighting +/// category (`Class`). +struct Classifier<'a> { + tokens: Peekable>, + in_attribute: bool, + in_macro: bool, + in_macro_nonterminal: bool, +} + +impl<'a> Classifier<'a> { + fn new(src: &str) -> Classifier<'_> { + let tokens = TokenIter { src }.peekable(); + Classifier { tokens, in_attribute: false, in_macro: false, in_macro_nonterminal: false } + } + + /// Exhausts the `Classifier` writing the output into `sink`. + /// + /// The general structure for this method is to iterate over each token, + /// possibly giving it an HTML span with a class specifying what flavor of + /// token is used. + fn highlight(mut self, sink: &mut dyn FnMut(Highlight<'a>)) { + with_default_session_globals(|| { + while let Some((token, text)) = self.tokens.next() { + self.advance(token, text, sink); + } + }) + } + + /// Single step of highlighting. This will classify `token`, but maybe also + /// a couple of following ones as well. + fn advance(&mut self, token: TokenKind, text: &'a str, sink: &mut dyn FnMut(Highlight<'a>)) { + let lookahead = self.peek(); + let class = match token { + TokenKind::Whitespace => Class::None, + TokenKind::LineComment { doc_style } | TokenKind::BlockComment { doc_style, .. } => { + if doc_style.is_some() { + Class::DocComment + } else { + Class::Comment + } + } + // Consider this as part of a macro invocation if there was a + // leading identifier. + TokenKind::Bang if self.in_macro => { + self.in_macro = false; + Class::Macro + } + + // Assume that '&' or '*' is the reference or dereference operator + // or a reference or pointer type. Unless, of course, it looks like + // a logical and or a multiplication operator: `&&` or `* `. + TokenKind::Star => match lookahead { + Some(TokenKind::Whitespace) => Class::Op, + _ => Class::RefKeyWord, + }, + TokenKind::And => match lookahead { + Some(TokenKind::And) => { + let _and = self.tokens.next(); + sink(Highlight::Token { text: "&&", class: Class::Op }); + return; + } + Some(TokenKind::Eq) => { + let _eq = self.tokens.next(); + sink(Highlight::Token { text: "&=", class: Class::Op }); + return; + } + Some(TokenKind::Whitespace) => Class::Op, + _ => Class::RefKeyWord, + }, + + // Operators. + TokenKind::Minus + | TokenKind::Plus + | TokenKind::Or + | TokenKind::Slash + | TokenKind::Caret + | TokenKind::Percent + | TokenKind::Bang + | TokenKind::Eq + | TokenKind::Lt + | TokenKind::Gt => Class::Op, + + // Miscellaneous, no highlighting. + TokenKind::Dot + | TokenKind::Semi + | TokenKind::Comma + | TokenKind::OpenParen + | TokenKind::CloseParen + | TokenKind::OpenBrace + | TokenKind::CloseBrace + | TokenKind::OpenBracket + | TokenKind::At + | TokenKind::Tilde + | TokenKind::Colon + | TokenKind::Unknown => Class::None, + + TokenKind::Question => Class::QuestionMark, + + TokenKind::Dollar => match lookahead { + Some(TokenKind::Ident) => { + self.in_macro_nonterminal = true; + Class::MacroNonTerminal + } + _ => Class::None, + }, + + // This might be the start of an attribute. 
We're going to want to + // continue highlighting it as an attribute until the ending ']' is + // seen, so skip out early. Down below we terminate the attribute + // span when we see the ']'. + TokenKind::Pound => { + match lookahead { + // Case 1: #![inner_attribute] + Some(TokenKind::Bang) => { + let _not = self.tokens.next().unwrap(); + if let Some(TokenKind::OpenBracket) = self.peek() { + self.in_attribute = true; + sink(Highlight::EnterSpan { class: Class::Attribute }); + } + sink(Highlight::Token { text: "#", class: Class::None }); + sink(Highlight::Token { text: "!", class: Class::None }); + return; + } + // Case 2: #[outer_attribute] + Some(TokenKind::OpenBracket) => { + self.in_attribute = true; + sink(Highlight::EnterSpan { class: Class::Attribute }); + } + _ => (), + } + Class::None + } + TokenKind::CloseBracket => { + if self.in_attribute { + self.in_attribute = false; + sink(Highlight::Token { text: "]", class: Class::None }); + sink(Highlight::ExitSpan); + return; + } + Class::None + } + TokenKind::Literal { kind, .. } => match kind { + // Text literals. + LiteralKind::Byte { .. } + | LiteralKind::Char { .. } + | LiteralKind::Str { .. } + | LiteralKind::ByteStr { .. } + | LiteralKind::RawStr { .. } + | LiteralKind::RawByteStr { .. } => Class::String, + // Number literals. + LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number, + }, + TokenKind::Ident | TokenKind::RawIdent if lookahead == Some(TokenKind::Bang) => { + self.in_macro = true; + Class::Macro + } + TokenKind::Ident => match text { + "ref" | "mut" => Class::RefKeyWord, + "self" | "Self" => Class::Self_, + "false" | "true" => Class::Bool, + "Option" | "Result" => Class::PreludeTy, + "Some" | "None" | "Ok" | "Err" => Class::PreludeVal, + // Keywords are also included in the identifier set. + _ if Ident::from_str(text).is_reserved() => Class::KeyWord, + _ if self.in_macro_nonterminal => { + self.in_macro_nonterminal = false; + Class::MacroNonTerminal + } + _ => Class::Ident, + }, + TokenKind::RawIdent => Class::Ident, + TokenKind::Lifetime { .. } => Class::Lifetime, + }; + // Anything that didn't return above is the simple case where the + // class just spans a single token, so we can use the `string` method. + sink(Highlight::Token { text, class }); + } + + fn peek(&mut self) -> Option<TokenKind> { + self.tokens.peek().map(|(token_kind, _text)| *token_kind) + } +} + +/// Called when we start processing a span of text that should be highlighted. +/// The `Class` argument specifies how it should be highlighted. +fn enter_span(out: &mut String, klass: Class) { + write!(out, "<span class=\"{}\">", klass.as_html()).unwrap() +} + +/// Called at the end of a span of highlighted text. +fn exit_span(out: &mut String) { + write!(out, "</span>").unwrap() +} + +/// Called for a span of text. If the text should be highlighted differently +/// from the surrounding text, then the `Class` argument will be a value other +/// than `None`. +/// +/// The following sequences of callbacks are equivalent: +/// ```plain +/// enter_span(Foo), string("text", None), exit_span() +/// string("text", Foo) +/// ``` +/// The latter can be thought of as a shorthand for the former, which is more +/// flexible.
+fn string<T: Display>(out: &mut String, text: T, klass: Class) { + match klass { + Class::None => write!(out, "{}", text).unwrap(), + klass => write!(out, "<span class=\"{}\">{}</span>", klass.as_html(), text).unwrap(), + } } #[cfg(test)] diff --git a/src/librustdoc/html/highlight/tests.rs b/src/librustdoc/html/highlight/tests.rs index 21e476cbe06..756751e47e8 100644 --- a/src/librustdoc/html/highlight/tests.rs +++ b/src/librustdoc/html/highlight/tests.rs @@ -1,25 +1,9 @@ -use rustc_session::parse::ParseSess; -use rustc_span::edition::Edition; -use rustc_span::with_session_globals; -use rustc_span::FileName; - -use super::Classifier; +use super::write_code; fn highlight(src: &str) -> String { - let mut out = vec![]; - - with_session_globals(Edition::Edition2018, || { - let sess = ParseSess::with_silent_emitter(); - let source_file = sess.source_map().new_source_file( - FileName::Custom(String::from("rustdoc-highlighting")), - src.to_owned(), - ); - - let mut classifier = Classifier::new(&sess, source_file); - classifier.write_source(&mut out).unwrap(); - }); - - String::from_utf8(out).unwrap() + let mut out = String::new(); + write_code(&mut out, src); + out } #[test] diff --git a/src/test/rustdoc/bad-codeblock-syntax.rs b/src/test/rustdoc/bad-codeblock-syntax.rs index afef86ec9c7..920877028d0 100644 --- a/src/test/rustdoc/bad-codeblock-syntax.rs +++ b/src/test/rustdoc/bad-codeblock-syntax.rs @@ -1,33 +1,33 @@ // @has bad_codeblock_syntax/fn.foo.html -// @has - '//*[@class="docblock"]/pre/code' '\_' +// @has - '//*[@class="docblock"]' '\_' /// ``` /// \_ /// ``` pub fn foo() {} // @has bad_codeblock_syntax/fn.bar.html -// @has - '//*[@class="docblock"]/pre/code' '`baz::foobar`' +// @has - '//*[@class="docblock"]' '`baz::foobar`' /// ``` /// `baz::foobar` /// ``` pub fn bar() {} // @has bad_codeblock_syntax/fn.quux.html -// @has - '//*[@class="docblock"]/pre/code' '\_' +// @has - '//*[@class="docblock"]' '\_' /// ```rust /// \_ /// ``` pub fn quux() {} // @has bad_codeblock_syntax/fn.ok.html -// @has - '//*[@class="docblock"]/pre/code[@class="language-text"]' '\_' +// @has - '//*[@class="docblock"]' '\_' /// ```text /// \_ /// ``` pub fn ok() {} // @has bad_codeblock_syntax/fn.escape.html -// @has - '//*[@class="docblock"]/pre/code' '\_ ' +// @has - '//*[@class="docblock"]' '\_ ' /// ``` /// \_ /// @@ -35,7 +35,7 @@ pub fn ok() {} pub fn escape() {} // @has bad_codeblock_syntax/fn.unterminated.html -// @has - '//*[@class="docblock"]/pre/code' '"unterminated' +// @has - '//*[@class="docblock"]' '"unterminated' /// ``` /// "unterminated /// ``` From b4f4db946e71c2bf4f2b324cc002d92068e2cff2 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Thu, 27 Aug 2020 12:50:28 +0200 Subject: [PATCH 2/2] Add expect test for rustdoc html highlighting It's a unit test in the sense that it only checks syntax highlighting. However, the resulting HTML is written to disk and can be easily inspected in the browser.
To update the test, run with the `--bless` argument or set the `UPDATE_EXPECT=1` env var --- Cargo.lock | 1 + src/librustdoc/Cargo.toml | 3 + .../html/highlight/fixtures/sample.html | 27 +++++++ .../html/highlight/fixtures/sample.rs | 16 ++++ src/librustdoc/html/highlight/tests.rs | 81 +++++-------------- 5 files changed, 67 insertions(+), 61 deletions(-) create mode 100644 src/librustdoc/html/highlight/fixtures/sample.html create mode 100644 src/librustdoc/html/highlight/fixtures/sample.rs diff --git a/Cargo.lock b/Cargo.lock index 1bbae2cbd80..0ee4d41c647 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4122,6 +4122,7 @@ dependencies = [ name = "rustdoc" version = "0.0.0" dependencies = [ + "expect-test", "itertools 0.8.2", "minifier", "pulldown-cmark", diff --git a/src/librustdoc/Cargo.toml b/src/librustdoc/Cargo.toml index 1354ef5cbde..09afb3cae5b 100644 --- a/src/librustdoc/Cargo.toml +++ b/src/librustdoc/Cargo.toml @@ -17,3 +17,6 @@ serde_json = "1.0" smallvec = "1.0" tempfile = "3" itertools = "0.8" + +[dev-dependencies] +expect-test = "0.1" diff --git a/src/librustdoc/html/highlight/fixtures/sample.html b/src/librustdoc/html/highlight/fixtures/sample.html new file mode 100644 index 00000000000..d937246f466 --- /dev/null +++ b/src/librustdoc/html/highlight/fixtures/sample.html @@ -0,0 +1,27 @@ + + +
#![crate_type = "lib"]
+
+#[cfg(target_os = "linux")]
+fn main() {
+    let foo = true && false || true;
+    let _: *const () = 0;
+    let _ = &foo;
+    let _ = &&foo;
+    let _ = *foo;
+    mac!(foo, &mut bar);
+    assert!(self.length < N && index <= self.length);
+}
+
+macro_rules! bar {
+    ($foo:tt) => {};
+}
+
diff --git a/src/librustdoc/html/highlight/fixtures/sample.rs b/src/librustdoc/html/highlight/fixtures/sample.rs new file mode 100644 index 00000000000..956fdbe090b --- /dev/null +++ b/src/librustdoc/html/highlight/fixtures/sample.rs @@ -0,0 +1,16 @@ +#![crate_type = "lib"] + +#[cfg(target_os = "linux")] +fn main() { + let foo = true && false || true; + let _: *const () = 0; + let _ = &foo; + let _ = &&foo; + let _ = *foo; + mac!(foo, &mut bar); + assert!(self.length < N && index <= self.length); +} + +macro_rules! bar { + ($foo:tt) => {}; +} diff --git a/src/librustdoc/html/highlight/tests.rs b/src/librustdoc/html/highlight/tests.rs index 756751e47e8..398cd4f670e 100644 --- a/src/librustdoc/html/highlight/tests.rs +++ b/src/librustdoc/html/highlight/tests.rs @@ -1,66 +1,25 @@ use super::write_code; - -fn highlight(src: &str) -> String { - let mut out = String::new(); - write_code(&mut out, src); - out -} +use expect_test::expect_file; #[test] -fn function() { - assert_eq!( - highlight("fn main() {}"), - r#"fn main() {}"#, - ); +fn test_html_highlighting() { + let src = include_str!("fixtures/sample.rs"); + let html = { + let mut out = String::new(); + write_code(&mut out, src); + format!("{}
{}
\n", STYLE, out) + }; + expect_file!["src/librustdoc/html/highlight/fixtures/sample.html"].assert_eq(&html); } -#[test] -fn statement() { - assert_eq!( - highlight("let foo = true;"), - concat!( - r#"let foo "#, - r#"= true;"#, - ), - ); -} - -#[test] -fn inner_attr() { - assert_eq!( - highlight(r##"#![crate_type = "lib"]"##), - concat!( - r##"#![crate_type "##, - r##"= "lib"]"##, - ), - ); -} - -#[test] -fn outer_attr() { - assert_eq!( - highlight(r##"#[cfg(target_os = "linux")]"##), - concat!( - r##"#[cfg("##, - r##"target_os = "##, - r##""linux")]"##, - ), - ); -} - -#[test] -fn mac() { - assert_eq!( - highlight("mac!(foo bar)"), - concat!( - r#"mac!("#, - r#"foo bar)"#, - ), - ); -} - -// Regression test for #72684 -#[test] -fn andand() { - assert_eq!(highlight("&&"), r#"&&"#); -} +const STYLE: &str = r#" + +"#;