From 776590b14edbc3bc840d419035ca8deaadaeeda1 Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Thu, 8 Feb 2024 08:35:12 -0800
Subject: [PATCH] Switch linkchecker to use html5ever for html parsing.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing regex-based HTML parsing was just too primitive to
correctly handle HTML content. Some books have legitimate `href="…"`
text which should not be validated because it is part of the text, not
actual HTML.
---
 Cargo.lock                       |   1 +
 src/tools/linkchecker/Cargo.toml |   1 +
 src/tools/linkchecker/main.rs    | 447 +++++++++++++++++-------------
 3 files changed, 244 insertions(+), 205 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 29e99c55743..bb8e8860904 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2274,6 +2274,7 @@ dependencies = [
 name = "linkchecker"
 version = "0.1.0"
 dependencies = [
+ "html5ever",
 "once_cell",
 "regex",
 ]

diff --git a/src/tools/linkchecker/Cargo.toml b/src/tools/linkchecker/Cargo.toml
index 1d8f2f91882..318a69ab835 100644
--- a/src/tools/linkchecker/Cargo.toml
+++ b/src/tools/linkchecker/Cargo.toml
@@ -10,3 +10,4 @@ path = "main.rs"
 [dependencies]
 regex = "1"
 once_cell = "1"
+html5ever = "0.26.0"

diff --git a/src/tools/linkchecker/main.rs b/src/tools/linkchecker/main.rs
index 7f73cac63cb..f49c6e79f13 100644
--- a/src/tools/linkchecker/main.rs
+++ b/src/tools/linkchecker/main.rs
@@ -14,6 +14,12 @@
 //! A few exceptions are allowed as there's known bugs in rustdoc, but this
 //! should catch the majority of "broken link" cases.
 
+use html5ever::tendril::ByteTendril;
+use html5ever::tokenizer::{
+    BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
+};
+use once_cell::sync::Lazy;
+use regex::Regex;
 use std::cell::RefCell;
 use std::collections::{HashMap, HashSet};
 use std::env;
@@ -23,9 +29,6 @@ use std::path::{Component, Path, PathBuf};
 use std::rc::Rc;
 use std::time::Instant;
 
-use once_cell::sync::Lazy;
-use regex::Regex;
-
 // Add linkcheck exceptions here
 // If at all possible you should use intra-doc links to avoid linkcheck issues. These
 // are cases where that does not work
@@ -182,163 +185,10 @@ impl Checker {
             }
         };
 
-        // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
-        with_attrs_in_source(&source, " href", |url, i, base| {
-            // Ignore external URLs
-            if url.starts_with("http:")
-                || url.starts_with("https:")
-                || url.starts_with("javascript:")
-                || url.starts_with("ftp:")
-                || url.starts_with("irc:")
-                || url.starts_with("data:")
-                || url.starts_with("mailto:")
-            {
-                report.links_ignored_external += 1;
-                return;
-            }
-            report.links_checked += 1;
-            let (url, fragment) = match url.split_once('#') {
-                None => (url, None),
-                Some((url, fragment)) => (url, Some(fragment)),
-            };
-            // NB: the `splitn` always succeeds, even if the delimiter is not present.
-            let url = url.splitn(2, '?').next().unwrap();
-
-            // Once we've plucked out the URL, parse it using our base url and
-            // then try to extract a file path.
-            let mut path = file.to_path_buf();
-            if !base.is_empty() || !url.is_empty() {
-                path.pop();
-                for part in Path::new(base).join(url).components() {
-                    match part {
-                        Component::Prefix(_) | Component::RootDir => {
-                            // Avoid absolute paths as they make the docs not
-                            // relocatable by making assumptions on where the docs
-                            // are hosted relative to the site root.
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: absolute path - {}",
-                                pretty_path,
-                                i + 1,
-                                Path::new(base).join(url).display()
-                            );
-                            return;
-                        }
-                        Component::CurDir => {}
-                        Component::ParentDir => {
-                            path.pop();
-                        }
-                        Component::Normal(s) => {
-                            path.push(s);
-                        }
-                    }
-                }
-            }
-
-            let (target_pretty_path, target_entry) = self.load_file(&path, report);
-            let (target_source, target_ids) = match target_entry {
-                FileEntry::Missing => {
-                    if is_exception(file, &target_pretty_path) {
-                        report.links_ignored_exception += 1;
-                    } else {
-                        report.errors += 1;
-                        println!(
-                            "{}:{}: broken link - `{}`",
-                            pretty_path,
-                            i + 1,
-                            target_pretty_path
-                        );
-                    }
-                    return;
-                }
-                FileEntry::Dir => {
-                    // Links to directories show as directory listings when viewing
-                    // the docs offline so it's best to avoid them.
-                    report.errors += 1;
-                    println!(
-                        "{}:{}: directory link to `{}` \
-                         (directory links should use index.html instead)",
-                        pretty_path,
-                        i + 1,
-                        target_pretty_path
-                    );
-                    return;
-                }
-                FileEntry::OtherFile => return,
-                FileEntry::Redirect { target } => {
-                    let t = target.clone();
-                    let (target, redir_entry) = self.load_file(&t, report);
-                    match redir_entry {
-                        FileEntry::Missing => {
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: broken redirect from `{}` to `{}`",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target
-                            );
-                            return;
-                        }
-                        FileEntry::Redirect { target } => {
-                            // Redirect to a redirect, this link checker
-                            // currently doesn't support this, since it would
-                            // require cycle checking, etc.
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: redirect from `{}` to `{}` \
-                                 which is also a redirect (not supported)",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target.display()
-                            );
-                            return;
-                        }
-                        FileEntry::Dir => {
-                            report.errors += 1;
-                            println!(
-                                "{}:{}: redirect from `{}` to `{}` \
-                                 which is a directory \
-                                 (directory links should use index.html instead)",
-                                pretty_path,
-                                i + 1,
-                                target_pretty_path,
-                                target
-                            );
-                            return;
-                        }
-                        FileEntry::OtherFile => return,
-                        FileEntry::HtmlFile { source, ids } => (source, ids),
-                    }
-                }
-                FileEntry::HtmlFile { source, ids } => (source, ids),
-            };
-
-            // Alright, if we've found an HTML file for the target link. If
-            // this is a fragment link, also check that the `id` exists.
-            if let Some(ref fragment) = fragment {
-                // Fragments like `#1-6` are most likely line numbers to be
-                // interpreted by javascript, so we're ignoring these
-                if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
-                    return;
-                }
-
-                parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
-
-                if target_ids.borrow().contains(*fragment) {
-                    return;
-                }
-
-                if is_exception(file, &format!("#{}", fragment)) {
-                    report.links_ignored_exception += 1;
-                } else {
-                    report.errors += 1;
-                    print!("{}:{}: broken link fragment ", pretty_path, i + 1);
-                    println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
-                };
-            }
-        });
+        let (base, urls) = get_urls(&source);
+        for (i, url) in urls {
+            self.check_url(file, &pretty_path, report, &base, i, &url);
+        }
 
         self.check_intra_doc_links(file, &pretty_path, &source, report);
 
@@ -350,6 +200,159 @@ impl Checker {
         }
     }
 
+    fn check_url(
+        &mut self,
+        file: &Path,
+        pretty_path: &str,
+        report: &mut Report,
+        base: &Option<String>,
+        i: u64,
+        url: &str,
+    ) {
+        // Ignore external URLs
+        if url.starts_with("http:")
+            || url.starts_with("https:")
+            || url.starts_with("javascript:")
+            || url.starts_with("ftp:")
+            || url.starts_with("irc:")
+            || url.starts_with("data:")
+            || url.starts_with("mailto:")
+        {
+            report.links_ignored_external += 1;
+            return;
+        }
+        report.links_checked += 1;
+        let (url, fragment) = match url.split_once('#') {
+            None => (url, None),
+            Some((url, fragment)) => (url, Some(fragment)),
+        };
+        // NB: the `splitn` always succeeds, even if the delimiter is not present.
+        let url = url.splitn(2, '?').next().unwrap();
+
+        // Once we've plucked out the URL, parse it using our base url and
+        // then try to extract a file path.
+        let mut path = file.to_path_buf();
+        if base.is_some() || !url.is_empty() {
+            let base = base.as_deref().unwrap_or("");
+            path.pop();
+            for part in Path::new(base).join(url).components() {
+                match part {
+                    Component::Prefix(_) | Component::RootDir => {
+                        // Avoid absolute paths as they make the docs not
+                        // relocatable by making assumptions on where the docs
+                        // are hosted relative to the site root.
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: absolute path - {}",
+                            pretty_path,
+                            i,
+                            Path::new(base).join(url).display()
+                        );
+                        return;
+                    }
+                    Component::CurDir => {}
+                    Component::ParentDir => {
+                        path.pop();
+                    }
+                    Component::Normal(s) => {
+                        path.push(s);
+                    }
+                }
+            }
+        }
+
+        let (target_pretty_path, target_entry) = self.load_file(&path, report);
+        let (target_source, target_ids) = match target_entry {
+            FileEntry::Missing => {
+                if is_exception(file, &target_pretty_path) {
+                    report.links_ignored_exception += 1;
+                } else {
+                    report.errors += 1;
+                    println!("{}:{}: broken link - `{}`", pretty_path, i, target_pretty_path);
+                }
+                return;
+            }
+            FileEntry::Dir => {
+                // Links to directories show as directory listings when viewing
+                // the docs offline so it's best to avoid them.
+                report.errors += 1;
+                println!(
+                    "{}:{}: directory link to `{}` \
+                     (directory links should use index.html instead)",
+                    pretty_path, i, target_pretty_path
+                );
+                return;
+            }
+            FileEntry::OtherFile => return,
+            FileEntry::Redirect { target } => {
+                let t = target.clone();
+                let (target, redir_entry) = self.load_file(&t, report);
+                match redir_entry {
+                    FileEntry::Missing => {
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: broken redirect from `{}` to `{}`",
+                            pretty_path, i, target_pretty_path, target
+                        );
+                        return;
+                    }
+                    FileEntry::Redirect { target } => {
+                        // Redirect to a redirect, this link checker
+                        // currently doesn't support this, since it would
+                        // require cycle checking, etc.
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: redirect from `{}` to `{}` \
+                             which is also a redirect (not supported)",
+                            pretty_path,
+                            i,
+                            target_pretty_path,
+                            target.display()
+                        );
+                        return;
+                    }
+                    FileEntry::Dir => {
+                        report.errors += 1;
+                        println!(
+                            "{}:{}: redirect from `{}` to `{}` \
+                             which is a directory \
+                             (directory links should use index.html instead)",
+                            pretty_path, i, target_pretty_path, target
+                        );
+                        return;
+                    }
+                    FileEntry::OtherFile => return,
+                    FileEntry::HtmlFile { source, ids } => (source, ids),
+                }
+            }
+            FileEntry::HtmlFile { source, ids } => (source, ids),
+        };
+
+        // Alright, if we've found an HTML file for the target link. If
+        // this is a fragment link, also check that the `id` exists.
+        if let Some(ref fragment) = fragment {
+            // Fragments like `#1-6` are most likely line numbers to be
+            // interpreted by javascript, so we're ignoring these
+            if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
+                return;
+            }
+
+            parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report);
+
+            if target_ids.borrow().contains(*fragment) {
+                return;
+            }
+
+            if is_exception(file, &format!("#{}", fragment)) {
+                report.links_ignored_exception += 1;
+            } else {
+                report.errors += 1;
+                print!("{}:{}: broken link fragment ", pretty_path, i);
+                println!("`#{}` pointing to `{}`", fragment, target_pretty_path);
+            };
+        }
+    }
+
     fn check_intra_doc_links(
         &mut self,
         file: &Path,
@@ -496,59 +499,93 @@ fn maybe_redirect(source: &str) -> Option<String> {
     find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK))
 }
 
-fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(source: &str, attr: &str, mut f: F) {
-    let mut base = "";
-    for (i, mut line) in source.lines().enumerate() {
-        while let Some(j) = line.find(attr) {
-            let rest = &line[j + attr.len()..];
-            // The base tag should always be the first link in the document so
-            // we can get away with using one pass.
-            let is_base = line[..j].ends_with("<base");
-            line = rest;
-            let pos_equals = match rest.find('=') {
-                Some(i) => i,
-                None => continue,
-            };
-            if rest[..pos_equals].trim_start_matches(' ') != "" {
-                continue;
-            }
+fn parse_html<Sink: TokenSink>(source: &str, sink: Sink) -> Sink {
+    let tendril: ByteTendril = source.as_bytes().into();
+    let mut input = BufferQueue::new();
+    input.push_back(tendril.try_reinterpret().unwrap());
 
-            let rest = &rest[pos_equals + 1..];
+    let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
+    let _ = tok.feed(&mut input);
+    assert!(input.is_empty());
+    tok.end();
+    tok.sink
+}
 
-            let pos_quote = match rest.find(&['"', '\''][..]) {
-                Some(i) => i,
-                None => continue,
-            };
-            let quote_delim = rest.as_bytes()[pos_quote] as char;
+#[derive(Default)]
+struct AttrCollector {
+    attr_name: &'static [u8],
+    base: Option<String>,
+    found_attrs: Vec<(u64, String)>,
+    /// Tracks whether or not it is inside a <script> tag.
+    in_script: bool,
+}
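
Reviewer note: the tokenizer pattern introduced above can be exercised in
isolation. The following is a minimal standalone sketch against html5ever
0.26 (the version this patch pins); it is not part of the change, and
`LinkSink`, `main`, and the sample HTML are illustrative only. It collects
`href` attributes the same way `AttrCollector` does, and shows why a real
tokenizer fixes the motivating bug: `href="..."` appearing in character
data is never reported as an attribute.

use html5ever::tendril::ByteTendril;
use html5ever::tokenizer::{
    BufferQueue, TagKind, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};

/// Illustrative sink: records (line number, url) for every `href` attribute.
struct LinkSink {
    links: Vec<(u64, String)>,
}

impl TokenSink for LinkSink {
    type Handle = ();

    fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> {
        // Only start tags carry attributes; text that merely *looks* like
        // `href="..."` arrives as character tokens and is never seen here.
        if let TagToken(tag) = token {
            if tag.kind == TagKind::StartTag {
                for attr in &tag.attrs {
                    if &*attr.name.local == "href" {
                        self.links.push((line_number, attr.value.to_string()));
                    }
                }
            }
        }
        TokenSinkResult::Continue
    }
}

fn main() {
    let source = r#"<p>not a link: href="x"</p>
<a href="guide/index.html">guide</a>"#;

    // Same feeding sequence as parse_html() in the patch.
    let tendril: ByteTendril = source.as_bytes().into();
    let mut input = BufferQueue::new();
    input.push_back(tendril.try_reinterpret().unwrap());
    let mut tok = Tokenizer::new(LinkSink { links: Vec::new() }, TokenizerOpts::default());
    let _ = tok.feed(&mut input);
    tok.end();

    // Prints only "2: guide/index.html"; the href-like text on line 1 is
    // character data, not an attribute.
    for (line, url) in tok.sink.links {
        println!("{line}: {url}");
    }
}

Driving the tokenizer directly, rather than html5ever's full tree builder,
is sufficient here because the checker only needs attribute values and
their line numbers, not a DOM.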