heartwood — every commit a ring

Skip crawler-hostile hosts (LinkedIn) from broken-external-link check

3af991f6 by Isaac Bythewood · 1 day ago

Skip crawler-hostile hosts (LinkedIn) from broken-external-link check

LinkedIn returns 403/404 to anything that doesn't look like a browser, so its
links always show up as broken in the SEO report regardless of their real
state. Filter them out at link-collection time so they're never HEAD'd or
reported.
modified src/crawler/fetcher.rs
@@ -255,3 +255,18 @@ pub fn same_site(url: &str, host: &str) -> bool {
    }
    false
}

/// Hosts that aggressively block non-browser HTTP clients, returning 403/404
/// to anything that looks like a crawler regardless of the actual link state.
/// Treating their responses as broken-link signals produces only false
/// positives, so we skip them entirely from the external HEAD probe and the
/// broken-external-links check.
pub fn is_crawler_hostile(url: &str) -> bool {
    // Unparseable URLs and URLs without a host (e.g. mailto:) are handled
    // elsewhere; they are not "crawler-hostile", so answer false.
    let Ok(parsed) = Url::parse(url) else { return false };
    let Some(raw_host) = parsed.host_str() else { return false };
    let host = raw_host.to_lowercase();
    // Note: "www.linkedin.com" is already covered by the ".linkedin.com"
    // suffix test; both spellings are kept explicit for readability.
    host == "linkedin.com" || host == "www.linkedin.com" || host.ends_with(".linkedin.com")
}
modified src/crawler/mod.rs
@@ -15,7 +15,8 @@ use url::Url;pub use fetcher::PAGE_CAP;use fetcher::{    fetch, head_status, load_robots, load_sitemap, make_client, make_probe_client,    probe_compression, same_site, FetchResult, CRAWL_DEADLINE_SECS, CONCURRENCY,    is_crawler_hostile, probe_compression, same_site, FetchResult, CRAWL_DEADLINE_SECS,    CONCURRENCY,};use parser::parse_html;
@@ -218,9 +219,10 @@ where        }        if let Some(html) = &p.html {            for link in &html.links {                if !same_site(&link.url, &host) {                    external_links.insert(link.url.clone());                if same_site(&link.url, &host) || is_crawler_hostile(&link.url) {                    continue;                }                external_links.insert(link.url.clone());            }        }    }