
Probe Content-Encoding with a non-decompressing client

86e21172 by Isaac Bythewood · 2 days ago

Reqwest's gzip/brotli features auto-decompress responses and strip the
Content-Encoding header, so the per-page compression check always saw
an empty header and falsely flagged every HTML page on every site.
Probe the start URL once with a separate non-decompressing client and
emit one site-level insight only when the server truly skipped
compression.
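For context, reqwest documents this behavior: when a `gzip`- or `brotli`-enabled client auto-decompresses a response, it also removes the `Content-Encoding` (and `Content-Length`) headers. A minimal repro sketch, assuming a build with reqwest's `gzip` feature plus tokio, and using example.com as a stand-in URL:

use reqwest::header::CONTENT_ENCODING;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Built with the `gzip` feature: the body is auto-decompressed and
    // `Content-Encoding` is stripped, so this prints `None` even when the
    // server compressed the response.
    let auto = reqwest::Client::new();
    let resp = auto.get("https://example.com/").send().await?;
    println!("{:?}", resp.headers().get(CONTENT_ENCODING));
    Ok(())
}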
modified src/crawler/checks.rs
@@ -28,6 +28,9 @@ pub struct Ctx<'a> {
     pub external_link_status: &'a HashMap<String, u16>,
     pub sitemap_urls: &'a [String],
     pub robots: &'a RobotsCtx,
+    /// Server's `Content-Encoding` for `start_url`, lowercased. `None` means
+    /// the response was uncompressed.
+    pub compression: Option<&'a str>,
 }
 
 fn redirect_codes() -> HashSet<u16> {
@@ -713,25 +716,22 @@ fn check_slow_pages(ctx: &Ctx) -> Vec<Value> {
 }
 
 fn check_missing_compression(ctx: &Ctx) -> Vec<Value> {
-    let mut out = Vec::new();
-    for p in ctx.html_pages {
-        let enc = p
-            .headers
-            .iter()
-            .find(|(k, _)| k.eq_ignore_ascii_case("content-encoding"))
-            .map(|(_, v)| v.to_lowercase())
-            .unwrap_or_default();
-        if enc.is_empty() {
-            out.push(insight(
-                &p.url,
-                "Response not compressed (no Content-Encoding header)",
-                TYPE_PERF,
-                SEV_INFO,
-                "",
-            ));
-        }
+    // Probed at the start URL with a non-decompressing client (the main
+    // crawler client has reqwest's gzip/brotli features on, which strip
+    // `Content-Encoding` after auto-decompression — so we can't read it from
+    // the per-page headers). Compression is a server-wide config in practice,
+    // so one probe is enough.
+    if ctx.compression.is_none() {
+        vec![insight(
+            ctx.start_url,
+            "Response not compressed (no Content-Encoding header)",
+            TYPE_PERF,
+            SEV_INFO,
+            "",
+        )]
+    } else {
+        Vec::new()
     }
-    out
 }
 
 fn check_oversized_pages(ctx: &Ctx) -> Vec<Value> {
modified src/crawler/fetcher.rs
@@ -32,6 +32,45 @@ pub fn make_client() -> Client {
         .expect("client builds")
 }
 
+// Companion client used only to probe the server's `Content-Encoding` header.
+// The main client has reqwest's `gzip` and `brotli` features on, which
+// auto-decompress responses *and* strip `Content-Encoding` from the headers,
+// so we can't tell from a normal fetch whether the server compressed the
+// response. This client disables auto-decompression so the header survives.
+pub fn make_probe_client() -> Client {
+    Client::builder()
+        .user_agent(USER_AGENT)
+        .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
+        .redirect(reqwest::redirect::Policy::limited(10))
+        .gzip(false)
+        .brotli(false)
+        .build()
+        .expect("probe client builds")
+}
+
+/// Probe `url` with a non-decompressing client and return the server's
+/// `Content-Encoding` (lowercased). Returns `None` if the server didn't
+/// compress, the encoding was `identity`, or the request failed.
+pub async fn probe_compression(client: &Client, url: &str) -> Option<String> {
+    let resp = client
+        .get(url)
+        .header("Accept-Encoding", "gzip, br, zstd, deflate")
+        .send()
+        .await
+        .ok()?;
+    let enc = resp
+        .headers()
+        .get(reqwest::header::CONTENT_ENCODING)
+        .and_then(|v| v.to_str().ok())?
+        .trim()
+        .to_lowercase();
+    if enc.is_empty() || enc == "identity" {
+        None
+    } else {
+        Some(enc)
+    }
+}
+
 pub async fn fetch(client: &Client, url: &str) -> FetchResult {
     let started = Instant::now();
     match client.get(url).send().await {
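As a usage sketch (not part of the commit), the two new fetcher helpers compose into a one-off probe; this assumes an async context, and `https://example.com/` stands in for a real start URL:

// Sketch only: exercises make_probe_client/probe_compression from above.
let probe = make_probe_client();
match probe_compression(&probe, "https://example.com/").await {
    Some(enc) => println!("start URL served with Content-Encoding: {enc}"),
    None => println!("start URL served uncompressed (or the probe failed)"),
}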
modified src/crawler/mod.rs
@@ -14,8 +14,8 @@ use url::Url;
 
 pub use fetcher::PAGE_CAP;
 use fetcher::{
-    fetch, head_status, load_robots, load_sitemap, make_client, same_site, FetchResult,
-    CRAWL_DEADLINE_SECS, CONCURRENCY,
+    fetch, head_status, load_robots, load_sitemap, make_client, make_probe_client,
+    probe_compression, same_site, FetchResult, CRAWL_DEADLINE_SECS, CONCURRENCY,
 };
 use parser::parse_html;
@@ -43,6 +43,11 @@ pub struct CrawlResult {
     pub external_link_status: HashMap<String, u16>,
     pub sitemap_urls: Vec<String>,
     pub robots: RobotsCtx,
+    /// Server's `Content-Encoding` for the start URL (e.g. "gzip", "br",
+    /// "zstd"). `None` means the server returned the response uncompressed.
+    /// Probed with a separate non-decompressing client because reqwest's auto-
+    /// decompression strips the header.
+    pub compression: Option<String>,
 }
 
 #[derive(Debug, serde::Serialize)]
@@ -91,6 +96,8 @@ where
     let base_origin = format!("{}://{}", parsed.scheme(), parsed.host_str().unwrap_or(""));
     let client = make_client();
+    let probe_client = make_probe_client();
+    let compression = probe_compression(&probe_client, start_url).await;
 
     let (robots, robots_url, robots_text) = load_robots(&client, &base_origin).await;
     let robots = Arc::new(robots);
     let sitemap_urls = load_sitemap(&client, &base_origin, robots_text.as_deref()).await;
@@ -247,6 +254,7 @@ where
             raw: robots_text,
             references_sitemap,
         },
+        compression,
     })
 }
@@ -265,6 +273,7 @@ fn run_checks(result: &CrawlResult) -> Vec<Value> {
         external_link_status: &result.external_link_status,
         sitemap_urls: &result.sitemap_urls,
         robots: &result.robots,
+        compression: result.compression.as_deref(),
     };
     checks::run_all(&ctx)
 }