modified
src/crawler/checks.rs
// NOTE(review): diff hunk (markers/newlines stripped in this paste). Adds a
// `compression: Option<&'a str>` field to checks::Ctx, documented as the
// lowercased `Content-Encoding` observed for `start_url` (`None` = response
// was uncompressed). Surrounding fields and the start of `redirect_codes()`
// are unchanged hunk context; the full struct is not visible here.
@@ -28,6 +28,9 @@ pub struct Ctx<'a> { pub external_link_status: &'a HashMap<String, u16>, pub sitemap_urls: &'a [String], pub robots: &'a RobotsCtx, /// Server's `Content-Encoding` for `start_url`, lowercased. `None` means /// the response was uncompressed. pub compression: Option<&'a str>,}fn redirect_codes() -> HashSet<u16> {
// NOTE(review): diff hunk with +/- markers stripped, so the REMOVED body and
// the ADDED body of `check_missing_compression` appear concatenated below.
// Removed: a loop over `ctx.html_pages` reading each page's
// `content-encoding` header — per the added comment this was effectively dead,
// since the main reqwest client's gzip/brotli auto-decompression strips
// `Content-Encoding` before the headers are recorded (so `enc` was presumably
// always empty — TODO confirm against fetcher's header capture).
// Added: a single check of `ctx.compression` (probed once at the start URL
// with a non-decompressing client), emitting at most one insight. The
// rationale comment ("compression is a server-wide config in practice") is an
// assumption worth confirming — CDNs can vary encoding per content type.
// The trailing `out}` belongs to the removed version's return, not the new one.
// Return type (`Vec<Value>` of insights) is unchanged, so callers are unaffected.
@@ -713,25 +716,22 @@ fn check_slow_pages(ctx: &Ctx) -> Vec<Value> {}fn check_missing_compression(ctx: &Ctx) -> Vec<Value> { let mut out = Vec::new(); for p in ctx.html_pages { let enc = p .headers .iter() .find(|(k, _)| k.eq_ignore_ascii_case("content-encoding")) .map(|(_, v)| v.to_lowercase()) .unwrap_or_default(); if enc.is_empty() { out.push(insight( &p.url, "Response not compressed (no Content-Encoding header)", TYPE_PERF, SEV_INFO, "", )); } // Probed at the start URL with a non-decompressing client (the main // crawler client has reqwest's gzip/brotli features on, which strip // `Content-Encoding` after auto-decompression — so we can't read it from // the per-page headers). Compression is a server-wide config in practice, // so one probe is enough. if ctx.compression.is_none() { vec![insight( ctx.start_url, "Response not compressed (no Content-Encoding header)", TYPE_PERF, SEV_INFO, "", )] } else { Vec::new() } out}fn check_oversized_pages(ctx: &Ctx) -> Vec<Value> {
modified
src/crawler/fetcher.rs
// NOTE(review): diff hunk adding two items to fetcher.rs (hunk context shows
// the tail of `make_client` and the head of `fetch`, both unchanged):
//  * `make_probe_client()` — same UA/timeout/redirect policy as the main
//    client but with `.gzip(false).brotli(false)`, so reqwest does not
//    auto-decompress and `Content-Encoding` survives in the response headers.
//    Review: only gzip and brotli are disabled; if the crate also enables
//    reqwest's `deflate`/`zstd` features, those decoders would still strip the
//    header — TODO confirm Cargo features, or add `.zstd(false).deflate(false)`.
//  * `probe_compression(client, url)` — GETs `url` with an explicit
//    `Accept-Encoding: gzip, br, zstd, deflate`, returns the lowercased,
//    trimmed `Content-Encoding`, mapping empty/`identity`/any failure to
//    `None` (errors are swallowed via `.ok()?`, so a network failure reads as
//    "uncompressed" — acceptable for an informational check, but worth noting).
//    Review: a GET downloads the full body just to read one header; a HEAD
//    request would be cheaper, but servers don't reliably set
//    `Content-Encoding` on HEAD responses — presumably why GET was chosen;
//    TODO confirm and consider dropping the connection after headers.
// The `// Companion client...` comment lines lost their newlines in this
// paste; in the real file they are separate `//` lines above the function.
@@ -32,6 +32,45 @@ pub fn make_client() -> Client { .expect("client builds")}// Companion client used only to probe the server's `Content-Encoding` header.// The main client has reqwest's `gzip` and `brotli` features on, which// auto-decompress responses *and* strip `Content-Encoding` from the headers,// so we can't tell from a normal fetch whether the server compressed the// response. This client disables auto-decompression so the header survives.pub fn make_probe_client() -> Client { Client::builder() .user_agent(USER_AGENT) .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS)) .redirect(reqwest::redirect::Policy::limited(10)) .gzip(false) .brotli(false) .build() .expect("probe client builds")}/// Probe `url` with a non-decompressing client and return the server's/// `Content-Encoding` (lowercased). Returns `None` if the server didn't/// compress, the encoding was `identity`, or the request failed.pub async fn probe_compression(client: &Client, url: &str) -> Option<String> { let resp = client .get(url) .header("Accept-Encoding", "gzip, br, zstd, deflate") .send() .await .ok()?; let enc = resp .headers() .get(reqwest::header::CONTENT_ENCODING) .and_then(|v| v.to_str().ok())? .trim() .to_lowercase(); if enc.is_empty() || enc == "identity" { None } else { Some(enc) }}pub async fn fetch(client: &Client, url: &str) -> FetchResult { let started = Instant::now(); match client.get(url).send().await {
modified
src/crawler/mod.rs
// NOTE(review): diff hunk on mod.rs's `use fetcher::{...}` list. Because the
// +/- markers were stripped, the OLD import line and the NEW import line both
// appear below, back to back; the second copy is the replacement, adding
// `make_probe_client` and `probe_compression`. In the applied file only the
// second list exists — there is no duplicate import.
@@ -14,8 +14,8 @@ use url::Url;pub use fetcher::PAGE_CAP;use fetcher::{ fetch, head_status, load_robots, load_sitemap, make_client, same_site, FetchResult, CRAWL_DEADLINE_SECS, CONCURRENCY, fetch, head_status, load_robots, load_sitemap, make_client, make_probe_client, probe_compression, same_site, FetchResult, CRAWL_DEADLINE_SECS, CONCURRENCY,};use parser::parse_html;
// NOTE(review): diff hunk adding an owned `compression: Option<String>` field
// to `CrawlResult` (the checks::Ctx counterpart on the checks.rs side borrows
// it as `Option<&str>`). Doc comment explains `None` = uncompressed and why a
// separate non-decompressing client is needed. Struct is serde-serialized
// (see the following `#[derive(Debug, serde::Serialize)]` context), so this
// field presumably appears in the crawl's JSON output — confirm downstream
// consumers tolerate the new key.
@@ -43,6 +43,11 @@ pub struct CrawlResult { pub external_link_status: HashMap<String, u16>, pub sitemap_urls: Vec<String>, pub robots: RobotsCtx, /// Server's `Content-Encoding` for the start URL (e.g. "gzip", "br", /// "zstd"). `None` means the server returned the response uncompressed. /// Probed with a separate non-decompressing client because reqwest's auto- /// decompression strips the header. pub compression: Option<String>,}#[derive(Debug, serde::Serialize)]
// NOTE(review): diff hunk in the crawl setup path: builds the probe client and
// awaits `probe_compression(&probe_client, start_url)` BEFORE `load_robots` /
// `load_sitemap`. Review: this is a full extra GET of the start URL, run
// sequentially ahead of everything else, so it adds one round-trip (plus body
// download) of latency to every crawl — could be joined concurrently with the
// robots/sitemap loads (e.g. `tokio::join!`) if the runtime permits; TODO
// confirm this module is tokio-based before suggesting concretely.
@@ -91,6 +96,8 @@ where let base_origin = format!("{}://{}", parsed.scheme(), parsed.host_str().unwrap_or("")); let client = make_client(); let probe_client = make_probe_client(); let compression = probe_compression(&probe_client, start_url).await; let (robots, robots_url, robots_text) = load_robots(&client, &base_origin).await; let robots = Arc::new(robots); let sitemap_urls = load_sitemap(&client, &base_origin, robots_text.as_deref()).await;
// NOTE(review): diff hunk at the end of crawl(): the probed `compression`
// value is moved into the returned `CrawlResult` (field-init shorthand).
// Straightforward wiring; nothing else in the hunk changed.
@@ -247,6 +254,7 @@ where raw: robots_text, references_sitemap, }, compression, })}
// NOTE(review): diff hunk in `run_checks`: passes the crawl result's
// compression into checks::Ctx as `result.compression.as_deref()`, converting
// the owned `Option<String>` to the borrowed `Option<&str>` the Ctx field
// expects. Completes the wiring used by `check_missing_compression`.
@@ -265,6 +273,7 @@ fn run_checks(result: &CrawlResult) -> Vec<Value> { external_link_status: &result.external_link_status, sitemap_urls: &result.sitemap_urls, robots: &result.robots, compression: result.compression.as_deref(), }; checks::run_all(&ctx)}