| //! Script to check the validity of `href` links in our HTML documentation. |
| //! |
//! In the past we've been quite prone to writing broken links, as most of
//! them are added manually rather than automatically. As files move over
//! time or APIs change, old links become stale or broken. The purpose of this
| //! script is to check all relative links in our documentation to make sure they |
| //! actually point to a valid place. |
| //! |
//! Currently this uses HTML parsing to extract the `href` and `id`
//! attributes, combined with a regex search over the generated HTML to catch
//! intra-doc links that rustdoc failed to resolve.
| //! |
| //! These values are then translated to file URLs if possible and then the |
| //! destination is asserted to exist. |
| //! |
//! A few exceptions are allowed, as there are known bugs in rustdoc, but this
| //! should catch the majority of "broken link" cases. |
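//!
//! Invocation sketch: the documentation directory is the only argument `main`
//! reads, and it is resolved against the current working directory (the
//! `linkchecker` binary name below is assumed):
//!
//! ```text
//! $ linkchecker build/<triple>/doc
//! ```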
| |
| use std::cell::RefCell; |
| use std::collections::{HashMap, HashSet}; |
| use std::io::ErrorKind; |
| use std::path::{Component, Path, PathBuf}; |
| use std::rc::Rc; |
| use std::time::Instant; |
| use std::{env, fs}; |
| |
| use html5ever::tendril::ByteTendril; |
| use html5ever::tokenizer::{ |
| BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, |
| }; |
| |
// Add linkcheck exceptions here.
// If at all possible you should use intra-doc links to avoid linkcheck issues.
// These are cases where that does not work.
| // [(generated_documentation_page, &[broken_links])] |
| #[rustfmt::skip] |
| const LINKCHECK_EXCEPTIONS: &[(&str, &[&str])] = &[ |
| // These try to link to std::collections, but are defined in alloc |
| // https://github.com/rust-lang/rust/issues/74481 |
| ("std/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]), |
| ("std/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]), |
| ("alloc/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]), |
| ("alloc/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]), |
| |
| // These try to link to various things in std, but are defined in core. |
| // The docs in std::primitive use proper intra-doc links, so these seem fine to special-case. |
// Most of these are broken because liballoc uses lang item magic to define things on
| // primitives that aren't available in core. |
| ("alloc/slice/trait.Join.html", &["#method.join"]), |
| ("alloc/slice/trait.Concat.html", &["#method.concat"]), |
| ("alloc/slice/index.html", &["#method.concat", "#method.join"]), |
| ("alloc/vec/struct.Vec.html", &["#method.sort_by_key", "#method.sort_by_cached_key"]), |
| ("core/primitive.str.html", &["#method.to_ascii_uppercase", "#method.to_ascii_lowercase"]), |
| ("core/primitive.slice.html", &["#method.to_ascii_uppercase", "#method.to_ascii_lowercase", |
| "core/slice::sort_by_key", "core\\slice::sort_by_key", |
| "#method.sort_by_cached_key"]), |
| ]; |
| |
| #[rustfmt::skip] |
| const INTRA_DOC_LINK_EXCEPTIONS: &[(&str, &[&str])] = &[ |
| // This is being used in the sense of 'inclusive range', not a markdown link |
| ("core/ops/struct.RangeInclusive.html", &["begin</code>, <code>end"]), |
| ("std/ops/struct.RangeInclusive.html", &["begin</code>, <code>end"]), |
| ("core/range/legacy/struct.RangeInclusive.html", &["begin</code>, <code>end"]), |
| ("std/range/legacy/struct.RangeInclusive.html", &["begin</code>, <code>end"]), |
| ("core/slice/trait.SliceIndex.html", &["begin</code>, <code>end"]), |
| ("alloc/slice/trait.SliceIndex.html", &["begin</code>, <code>end"]), |
| ("std/slice/trait.SliceIndex.html", &["begin</code>, <code>end"]), |
| ("core/primitive.str.html", &["begin</code>, <code>end"]), |
| ("std/primitive.str.html", &["begin</code>, <code>end"]), |
];
| |
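/// Compiles the regex once and caches it in a `OnceLock`, so repeated calls
/// (e.g. the per-line scan in `check_intra_doc_links`) don't rebuild it.
/// `static_regex!(r"\d+")` evaluates to a `&'static Regex`.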
| macro_rules! static_regex { |
| ($re:literal) => {{ |
| static RE: ::std::sync::OnceLock<::regex::Regex> = ::std::sync::OnceLock::new(); |
| RE.get_or_init(|| ::regex::Regex::new($re).unwrap()) |
| }}; |
| } |
| |
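/// Unwraps a `Result`, panicking with the stringified expression and the
/// error on failure, e.g. `t!(fs::metadata(&path))`.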
| macro_rules! t { |
| ($e:expr) => { |
| match $e { |
| Ok(e) => e, |
| Err(e) => panic!("{} failed with {:?}", stringify!($e), e), |
| } |
| }; |
| } |
| |
| fn main() { |
| let docs = env::args_os().nth(1).expect("doc path should be first argument"); |
| let docs = env::current_dir().unwrap().join(docs); |
| let mut checker = Checker { root: docs.clone(), cache: HashMap::new() }; |
| let mut report = Report { |
| errors: 0, |
| start: Instant::now(), |
| html_files: 0, |
| html_redirects: 0, |
| links_checked: 0, |
| links_ignored_external: 0, |
| links_ignored_exception: 0, |
| intra_doc_exceptions: 0, |
| }; |
| checker.walk(&docs, &mut report); |
| report.report(); |
| if report.errors != 0 { |
| println!("found some broken links"); |
| std::process::exit(1); |
| } |
| } |
| |
| struct Checker { |
| root: PathBuf, |
| cache: Cache, |
| } |
| |
| struct Report { |
| errors: u32, |
| start: Instant, |
| html_files: u32, |
| html_redirects: u32, |
| links_checked: u32, |
| links_ignored_external: u32, |
| links_ignored_exception: u32, |
| intra_doc_exceptions: u32, |
| } |
| |
| /// A cache entry. |
| enum FileEntry { |
| /// An HTML file. |
| /// |
| /// This includes the contents of the HTML file, and an optional set of |
| /// HTML IDs. The IDs are used for checking fragments. They are computed |
| /// as-needed. The source is discarded (replaced with an empty string) |
| /// after the file has been checked, to conserve on memory. |
| HtmlFile { source: Rc<String>, ids: RefCell<HashSet<String>> }, |
| /// This file is an HTML redirect to the given local path. |
| Redirect { target: PathBuf }, |
| /// This is not an HTML file. |
| OtherFile, |
| /// This is a directory. |
| Dir, |
| /// The file doesn't exist. |
| Missing, |
| } |
| |
| /// A cache to speed up file access. |
| type Cache = HashMap<String, FileEntry>; |
| |
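/// Percent-encodes the handful of special characters that may appear in ids
/// and fragments, e.g. `small_url_encode("Box<T>")` returns `"Box%3CT%3E"`.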
| fn small_url_encode(s: &str) -> String { |
| s.replace('<', "%3C") |
| .replace('>', "%3E") |
| .replace(' ', "%20") |
| .replace('?', "%3F") |
| .replace('\'', "%27") |
| .replace('&', "%26") |
| .replace(',', "%2C") |
| .replace(':', "%3A") |
| .replace(';', "%3B") |
| .replace('[', "%5B") |
| .replace(']', "%5D") |
| .replace('\"', "%22") |
| } |
| |
| impl Checker { |
| /// Primary entry point for walking the filesystem to find HTML files to check. |
| fn walk(&mut self, dir: &Path, report: &mut Report) { |
| for entry in t!(dir.read_dir()).map(|e| t!(e)) { |
| let path = entry.path(); |
            // Note that `fs::metadata` follows symlinks.
| let metadata = t!(fs::metadata(&path)); |
| if metadata.is_dir() { |
| self.walk(&path, report); |
| } else { |
| self.check(&path, report); |
| } |
| } |
| } |
| |
| /// Checks a single file. |
| fn check(&mut self, file: &Path, report: &mut Report) { |
| let (pretty_path, entry) = self.load_file(file, report); |
| let source = match entry { |
| FileEntry::Missing => panic!("missing file {:?} while walking", file), |
| FileEntry::Dir => unreachable!("never with `check` path"), |
| FileEntry::OtherFile => return, |
| FileEntry::Redirect { .. } => return, |
| FileEntry::HtmlFile { source, ids } => { |
| parse_ids(&mut ids.borrow_mut(), &pretty_path, source, report); |
| source.clone() |
| } |
| }; |
| |
| let (base, urls) = get_urls(&source); |
| for (i, url) in urls { |
| self.check_url(file, &pretty_path, report, &base, i, &url); |
| } |
| |
| self.check_intra_doc_links(file, &pretty_path, &source, report); |
| |
        // We don't need the source anymore, so drop it to reduce memory usage.
| match self.cache.get_mut(&pretty_path).unwrap() { |
| FileEntry::HtmlFile { source, .. } => *source = Rc::new(String::new()), |
| _ => unreachable!("must be html file"), |
| } |
| } |
| |
| fn check_url( |
| &mut self, |
| file: &Path, |
| pretty_path: &str, |
| report: &mut Report, |
| base: &Option<String>, |
| i: u64, |
| url: &str, |
| ) { |
| // Ignore external URLs |
| if url.starts_with("http:") |
| || url.starts_with("https:") |
| || url.starts_with("javascript:") |
| || url.starts_with("ftp:") |
| || url.starts_with("irc:") |
| || url.starts_with("data:") |
| || url.starts_with("mailto:") |
| { |
| report.links_ignored_external += 1; |
| return; |
| } |
| report.links_checked += 1; |
| let (url, fragment) = match url.split_once('#') { |
| None => (url, None), |
| Some((url, fragment)) => (url, Some(fragment)), |
| }; |
| // NB: the `splitn` always succeeds, even if the delimiter is not present. |
| let url = url.splitn(2, '?').next().unwrap(); |
| |
| // Once we've plucked out the URL, parse it using our base url and |
| // then try to extract a file path. |
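        // For example, on the file `std/vec/struct.Vec.html`, the relative
        // URL `../boxed/struct.Box.html` resolves to
        // `std/boxed/struct.Box.html`.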
| let mut path = file.to_path_buf(); |
| if base.is_some() || !url.is_empty() { |
| let base = base.as_deref().unwrap_or(""); |
| path.pop(); |
| for part in Path::new(base).join(url).components() { |
| match part { |
| Component::Prefix(_) | Component::RootDir => { |
                        // Avoid absolute paths, as they make the docs
                        // non-relocatable by assuming where the docs are
                        // hosted relative to the site root.
| report.errors += 1; |
| println!( |
| "{}:{}: absolute path - {}", |
| pretty_path, |
| i, |
| Path::new(base).join(url).display() |
| ); |
| return; |
| } |
| Component::CurDir => {} |
| Component::ParentDir => { |
| path.pop(); |
| } |
| Component::Normal(s) => { |
| path.push(s); |
| } |
| } |
| } |
| } |
| |
| let (target_pretty_path, target_entry) = self.load_file(&path, report); |
| let (target_source, target_ids) = match target_entry { |
| FileEntry::Missing => { |
| if is_exception(file, &target_pretty_path) { |
| report.links_ignored_exception += 1; |
| } else { |
| report.errors += 1; |
| println!("{}:{}: broken link - `{}`", pretty_path, i, target_pretty_path); |
| } |
| return; |
| } |
| FileEntry::Dir => { |
                // Links to directories show up as directory listings when
                // viewing the docs offline, so it's best to avoid them.
| report.errors += 1; |
| println!( |
| "{}:{}: directory link to `{}` \ |
| (directory links should use index.html instead)", |
| pretty_path, i, target_pretty_path |
| ); |
| return; |
| } |
| FileEntry::OtherFile => return, |
| FileEntry::Redirect { target } => { |
| let t = target.clone(); |
| let (target, redir_entry) = self.load_file(&t, report); |
| match redir_entry { |
| FileEntry::Missing => { |
| report.errors += 1; |
| println!( |
| "{}:{}: broken redirect from `{}` to `{}`", |
| pretty_path, i, target_pretty_path, target |
| ); |
| return; |
| } |
| FileEntry::Redirect { target } => { |
                        // A redirect to a redirect; this link checker doesn't
                        // support that, since it would require cycle
                        // detection, etc.
| report.errors += 1; |
| println!( |
| "{}:{}: redirect from `{}` to `{}` \ |
| which is also a redirect (not supported)", |
| pretty_path, |
| i, |
| target_pretty_path, |
| target.display() |
| ); |
| return; |
| } |
| FileEntry::Dir => { |
| report.errors += 1; |
| println!( |
| "{}:{}: redirect from `{}` to `{}` \ |
| which is a directory \ |
| (directory links should use index.html instead)", |
| pretty_path, i, target_pretty_path, target |
| ); |
| return; |
| } |
| FileEntry::OtherFile => return, |
| FileEntry::HtmlFile { source, ids } => (source, ids), |
| } |
| } |
| FileEntry::HtmlFile { source, ids } => (source, ids), |
| }; |
| |
        // Alright, we've found an HTML file for the target link. If this is
        // a fragment link, also check that the `id` exists.
| if let Some(ref fragment) = fragment { |
            // Fragments like `#1-6` are most likely line numbers to be
            // interpreted by JavaScript, so we ignore them.
| if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) { |
| return; |
| } |
| |
| parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report); |
| |
| if target_ids.borrow().contains(*fragment) { |
| return; |
| } |
| |
| if is_exception(file, &format!("#{}", fragment)) { |
| report.links_ignored_exception += 1; |
| } else { |
| report.errors += 1; |
| print!("{}:{}: broken link fragment ", pretty_path, i); |
| println!("`#{}` pointing to `{}`", fragment, target_pretty_path); |
| }; |
| } |
| } |
| |
| fn check_intra_doc_links( |
| &mut self, |
| file: &Path, |
| pretty_path: &str, |
| source: &str, |
| report: &mut Report, |
| ) { |
| let relative = file.strip_prefix(&self.root).expect("should always be relative to root"); |
| // Don't check the reference. It has several legitimate things that |
| // look like [<code>…</code>]. The reference has its own broken link |
| // checker in its CI which handles this using pulldown_cmark. |
| // |
| // This checks both the end of the root (when checking just the |
| // reference directory) or the beginning (when checking all docs). |
| if self.root.ends_with("reference") || relative.starts_with("reference") { |
| return; |
| } |
| // Search for intra-doc links that rustdoc didn't warn about |
| // NOTE: only looks at one line at a time; in practice this should find most links |
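        // For example, the markdown link [`Vec::new`] survives as the literal
        // text `[<code>Vec::new</code>]` in the rendered HTML when rustdoc
        // failed to resolve it, which the regex below picks up.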
| for (i, line) in source.lines().enumerate() { |
| for broken_link in static_regex!(r#"\[<code>(.*)</code>\]"#).captures_iter(line) { |
| if is_intra_doc_exception(file, &broken_link[1]) { |
| report.intra_doc_exceptions += 1; |
| } else { |
| report.errors += 1; |
| print!("{}:{}: broken intra-doc link - ", pretty_path, i + 1); |
| println!("{}", &broken_link[0]); |
| } |
| } |
| } |
| } |
| |
| /// Load a file from disk, or from the cache if available. |
| fn load_file(&mut self, file: &Path, report: &mut Report) -> (String, &FileEntry) { |
| // https://docs.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499- |
| #[cfg(windows)] |
| const ERROR_INVALID_NAME: i32 = 123; |
| |
| let pretty_path = |
| file.strip_prefix(&self.root).unwrap_or(file).to_str().unwrap().to_string(); |
| |
| let entry = |
| self.cache.entry(pretty_path.clone()).or_insert_with(|| match fs::metadata(file) { |
| Ok(metadata) if metadata.is_dir() => FileEntry::Dir, |
| Ok(_) => { |
| if file.extension().and_then(|s| s.to_str()) != Some("html") { |
| FileEntry::OtherFile |
| } else { |
| report.html_files += 1; |
| load_html_file(file, report) |
| } |
| } |
| Err(e) if e.kind() == ErrorKind::NotFound => FileEntry::Missing, |
| Err(e) => { |
                    // On Windows, a broken intra-doc link that contains `::`
                    // fails with `ERROR_INVALID_NAME` rather than `NotFound`.
                    // Explicitly check for that so the broken link can still
                    // be allowed via `LINKCHECK_EXCEPTIONS`.
| #[cfg(windows)] |
| if e.raw_os_error() == Some(ERROR_INVALID_NAME) |
| && file.as_os_str().to_str().map_or(false, |s| s.contains("::")) |
| { |
| return FileEntry::Missing; |
| } |
| panic!("unexpected read error for {}: {}", file.display(), e); |
| } |
| }); |
| (pretty_path, entry) |
| } |
| } |
| |
| impl Report { |
| fn report(&self) { |
| println!("checked links in: {:.1}s", self.start.elapsed().as_secs_f64()); |
| println!("number of HTML files scanned: {}", self.html_files); |
| println!("number of HTML redirects found: {}", self.html_redirects); |
| println!("number of links checked: {}", self.links_checked); |
| println!("number of links ignored due to external: {}", self.links_ignored_external); |
| println!("number of links ignored due to exceptions: {}", self.links_ignored_exception); |
| println!("number of intra doc links ignored: {}", self.intra_doc_exceptions); |
| println!("errors found: {}", self.errors); |
| } |
| } |
| |
| fn load_html_file(file: &Path, report: &mut Report) -> FileEntry { |
| let source = match fs::read_to_string(file) { |
| Ok(s) => Rc::new(s), |
| Err(err) => { |
| // This usually should not fail since `metadata` was already |
| // called successfully on this file. |
| panic!("unexpected read error for {}: {}", file.display(), err); |
| } |
| }; |
| match maybe_redirect(&source) { |
| Some(target) => { |
| report.html_redirects += 1; |
| let target = file.parent().unwrap().join(target); |
| FileEntry::Redirect { target } |
| } |
| None => FileEntry::HtmlFile { source: source.clone(), ids: RefCell::new(HashSet::new()) }, |
| } |
| } |
| |
| fn is_intra_doc_exception(file: &Path, link: &str) -> bool { |
| if let Some(entry) = INTRA_DOC_LINK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) { |
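        // An empty exception list means every intra-doc link in this file is
        // ignored.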
| entry.1.is_empty() || entry.1.contains(&link) |
| } else { |
| false |
| } |
| } |
| |
| fn is_exception(file: &Path, link: &str) -> bool { |
| if let Some(entry) = LINKCHECK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) { |
| entry.1.contains(&link) |
| } else { |
| // FIXME(#63351): Concat trait in alloc/slice reexported in primitive page |
| // |
        // NOTE: This cannot be added to `LINKCHECK_EXCEPTIONS` because the
        // resolved path calculated in the `check` function is outside the
        // `build/<triple>/doc` dir, so `strip_prefix` just returns the old
        // absolute broken path.
| if file.ends_with("std/primitive.slice.html") && link.ends_with("primitive.slice.html") { |
| return true; |
| } |
| false |
| } |
| } |
| |
/// If the given HTML file's contents are an HTML redirect, returns the
/// destination path given in the redirect.
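///
/// For illustration (shape inferred from the patterns matched below; only the
/// quoted href is extracted), a rustdoc redirect page contains a line like:
///
/// ```text
/// <p>Redirecting to <a href="../../std/boxed/struct.Box.html">...</a></p>
/// ```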
| fn maybe_redirect(source: &str) -> Option<String> { |
| const REDIRECT_RUSTDOC: (usize, &str) = (7, "<p>Redirecting to <a href="); |
| const REDIRECT_MDBOOK: (usize, &str) = (8 - 7, "<p>Redirecting to... <a href="); |
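    // The line offsets above are relative positions within the shared `lines`
    // iterator: the second `find_redirect` call continues from wherever the
    // first one stopped, rather than from the top of the file.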
| |
| let mut lines = source.lines(); |
| |
| let mut find_redirect = |(line_rel, redirect_pattern): (usize, &str)| { |
| let redirect_line = lines.nth(line_rel)?; |
| |
| redirect_line.find(redirect_pattern).map(|i| { |
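            // `i + redirect_pattern.len() + 1` skips past the pattern and the
            // opening quote of the href value.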
| let rest = &redirect_line[(i + redirect_pattern.len() + 1)..]; |
| let pos_quote = rest.find('"').unwrap(); |
| rest[..pos_quote].to_owned() |
| }) |
| }; |
| |
| find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK)) |
| } |
| |
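/// Runs the html5ever tokenizer over `source`, handing every token to `sink`,
/// and returns the sink so callers can inspect what it collected.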
| fn parse_html<Sink: TokenSink>(source: &str, sink: Sink) -> Sink { |
| let tendril: ByteTendril = source.as_bytes().into(); |
| let mut input = BufferQueue::default(); |
| input.push_back(tendril.try_reinterpret().unwrap()); |
| |
| let mut tok = Tokenizer::new(sink, TokenizerOpts::default()); |
| let _ = tok.feed(&mut input); |
| assert!(input.is_empty()); |
| tok.end(); |
| tok.sink |
| } |
| |
| #[derive(Default)] |
| struct AttrCollector { |
| attr_name: &'static [u8], |
| base: Option<String>, |
| found_attrs: Vec<(u64, String)>, |
    /// Tracks whether the tokenizer is currently inside a `<script>` tag.
| /// |
| /// A lot of our sources have JSON script tags which have HTML embedded |
| /// within, but that cannot be parsed or processed correctly (since it is |
| /// JSON, not HTML). I think the sink is supposed to return |
| /// `TokenSinkResult::Script(…)` (and then maybe switch parser?), but I |
| /// don't fully understand the best way to use that, and this seems good |
| /// enough for now. |
| in_script: bool, |
| } |
| |
| impl TokenSink for AttrCollector { |
| type Handle = (); |
| |
| fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<()> { |
| match token { |
| TagToken(tag) => { |
| let tag_name = tag.name.as_bytes(); |
| if tag_name == b"base" { |
| if let Some(href) = |
| tag.attrs.iter().find(|attr| attr.name.local.as_bytes() == b"href") |
| { |
| self.base = Some(href.value.to_string()); |
| } |
| return TokenSinkResult::Continue; |
| } else if tag_name == b"script" { |
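                    // `TagToken` fires for both start and end tags, so this
                    // toggles on `<script>` and back on `</script>`.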
| self.in_script = !self.in_script; |
| } |
| if self.in_script { |
| return TokenSinkResult::Continue; |
| } |
| for attr in tag.attrs.iter() { |
| let name = attr.name.local.as_bytes(); |
| if name == self.attr_name { |
| let url = attr.value.to_string(); |
| self.found_attrs.push((line_number, url)); |
| } |
| } |
| } |
| // Note: ParseError is pretty noisy. It seems html5ever does not |
| // particularly like some kinds of HTML comments. |
| _ => {} |
| } |
| TokenSinkResult::Continue |
| } |
| } |
| |
| /// Retrieves href="..." attributes from HTML elements. |
| fn get_urls(source: &str) -> (Option<String>, Vec<(u64, String)>) { |
| let collector = AttrCollector { attr_name: b"href", ..AttrCollector::default() }; |
| let sink = parse_html(source, collector); |
| (sink.base, sink.found_attrs) |
| } |
| |
| /// Retrieves id="..." attributes from HTML elements. |
| fn parse_ids(ids: &mut HashSet<String>, file: &str, source: &str, report: &mut Report) { |
| if !ids.is_empty() { |
| // ids have already been parsed |
| return; |
| } |
| |
| let collector = AttrCollector { attr_name: b"id", ..AttrCollector::default() }; |
| let sink = parse_html(source, collector); |
| for (line_number, id) in sink.found_attrs { |
| let encoded = small_url_encode(&id); |
| if let Some(id) = ids.replace(id) { |
| report.errors += 1; |
| println!("{}:{}: id is not unique: `{}`", file, line_number, id); |
| } |
| // Just in case, we also add the encoded id. |
| ids.insert(encoded); |
| } |
| } |