modified
src/crawler/fetcher.rs
@@ -255,3 +255,18 @@ pub fn same_site(url: &str, host: &str) -> bool {
     }
     false
 }
+
+/// Hosts that aggressively block non-browser HTTP clients, returning 403/404
+/// to anything that looks like a crawler regardless of the actual link state.
+/// Treating their responses as broken-link signals produces only false
+/// positives, so we skip them entirely from the external HEAD probe and the
+/// broken-external-links check.
+pub fn is_crawler_hostile(url: &str) -> bool {
+    let Ok(u) = Url::parse(url) else { return false };
+    let Some(host) = u.host_str() else { return false };
+    let host = host.to_lowercase();
+    matches!(
+        host.as_str(),
+        "linkedin.com" | "www.linkedin.com"
+    ) || host.ends_with(".linkedin.com")
+}
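A minimal test sketch pinning down the matching rules above: the exact apex host, the `www` host, any `*.linkedin.com` subdomain, and the lookalike hosts that must not match. The test names and sample URLs are illustrative, not part of the diff:

```rust
#[cfg(test)]
mod tests {
    use super::is_crawler_hostile;

    #[test]
    fn flags_linkedin_hosts() {
        // Apex, www, and arbitrary subdomains are all treated as hostile.
        assert!(is_crawler_hostile("https://linkedin.com/in/someone"));
        assert!(is_crawler_hostile("https://www.linkedin.com/jobs"));
        assert!(is_crawler_hostile("https://de.linkedin.com/company/x"));
        // Host casing is normalized before matching.
        assert!(is_crawler_hostile("https://WWW.LINKEDIN.COM/feed"));
    }

    #[test]
    fn passes_other_hosts_and_bad_input() {
        // The suffix check requires the leading dot, so lookalike hosts
        // stay probeable.
        assert!(!is_crawler_hostile("https://notlinkedin.com/page"));
        // The hostile host must be the URL's host, not its path.
        assert!(!is_crawler_hostile("https://example.com/linkedin.com"));
        // Unparseable URLs fall through to `false`.
        assert!(!is_crawler_hostile("not a url"));
    }
}
```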
modified
src/crawler/mod.rs
@@ -15,7 +15,8 @@ use url::Url;
 pub use fetcher::PAGE_CAP;
 
 use fetcher::{
     fetch, head_status, load_robots, load_sitemap, make_client, make_probe_client,
-    probe_compression, same_site, FetchResult, CRAWL_DEADLINE_SECS, CONCURRENCY,
+    is_crawler_hostile, probe_compression, same_site, FetchResult, CRAWL_DEADLINE_SECS,
+    CONCURRENCY,
 };
 use parser::parse_html;
@@ -218,9 +219,10 @@ where
         }
         if let Some(html) = &p.html {
             for link in &html.links {
-                if !same_site(&link.url, &host) {
-                    external_links.insert(link.url.clone());
+                if same_site(&link.url, &host) || is_crawler_hostile(&link.url) {
+                    continue;
                 }
+                external_links.insert(link.url.clone());
             }
         }
     }
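For reviewers, the rewritten loop body is equivalent to this filter-chain form (illustrative only, assuming `external_links` is the existing `HashSet<String>` and `link.url` is a `String`):

```rust
// Keep only links that are off-site and not on a crawler-hostile host,
// then record them for the later HEAD probe.
external_links.extend(
    html.links
        .iter()
        .filter(|link| !same_site(&link.url, &host) && !is_crawler_hostile(&link.url))
        .map(|link| link.url.clone()),
);
```

Filtering at insertion time means hostile hosts never enter `external_links`, so neither the HEAD probe nor the broken-external-links check needs its own guard.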