use reqwest::Client;
use std::collections::HashMap;
use std::time::{Duration, Instant};
use url::Url;
pub const PAGE_CAP: usize = 500;
pub const CONCURRENCY: usize = 4;
const REQUEST_TIMEOUT_SECS: u64 = 15;
const EXTERNAL_LINK_TIMEOUT_SECS: u64 = 8;
pub const CRAWL_DEADLINE_SECS: u64 = 540;
const USER_AGENT: &str = "status (+https://status.bythewood.me)";
#[derive(Debug, Clone)]
pub struct FetchResult {
pub url: String,
pub requested_url: String,
pub status: u16,
pub headers: HashMap<String, String>,
pub body: Vec<u8>,
pub content_type: String,
pub elapsed_ms: i64,
pub redirect_chain: Vec<(u16, String)>,
pub error: String,
}
pub fn make_client() -> Client {
Client::builder()
.user_agent(USER_AGENT)
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
.redirect(reqwest::redirect::Policy::limited(10))
.build()
.expect("client builds")
}
// Companion client used only to probe the server's `Content-Encoding` header.
// The main client has reqwest's `gzip` and `brotli` features on, which
// auto-decompress responses *and* strip `Content-Encoding` from the headers,
// so we can't tell from a normal fetch whether the server compressed the
// response. This client disables auto-decompression so the header survives.
pub fn make_probe_client() -> Client {
Client::builder()
.user_agent(USER_AGENT)
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
.redirect(reqwest::redirect::Policy::limited(10))
.gzip(false)
.brotli(false)
.build()
.expect("probe client builds")
}
/// Probe `url` with a non-decompressing client and return the server's
/// `Content-Encoding` (lowercased). Returns `None` if the server didn't
/// compress, the encoding was `identity`, or the request failed.
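///
/// Illustrative usage (not compiled here; assumes a Tokio runtime and that
/// these items are in scope; the URL is a placeholder):
///
/// ```ignore
/// let probe = make_probe_client();
/// if let Some(enc) = probe_compression(&probe, "https://example.com/").await {
///     // e.g. "gzip", "br", or "zstd"
///     println!("server compressed the response with {enc}");
/// }
/// ```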
pub async fn probe_compression(client: &Client, url: &str) -> Option<String> {
let resp = client
.get(url)
.header("Accept-Encoding", "gzip, br, zstd, deflate")
.send()
.await
.ok()?;
let enc = resp
.headers()
.get(reqwest::header::CONTENT_ENCODING)
.and_then(|v| v.to_str().ok())?
.trim()
.to_lowercase();
if enc.is_empty() || enc == "identity" {
None
} else {
Some(enc)
}
}
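/// Fetch `url` with the main client (redirects followed automatically) and
/// normalize the outcome into a `FetchResult`. Only `text/html` bodies are
/// buffered; other content types get an empty body. Transport errors yield
/// `status == 0` with the message in `error`.
///
/// Illustrative usage (not compiled here; assumes a Tokio runtime; the URL is
/// a placeholder):
///
/// ```ignore
/// let client = make_client();
/// let page = fetch(&client, "https://example.com/").await;
/// if page.error.is_empty() && page.status == 200 {
///     // `page.body` holds the HTML bytes when the response was text/html.
/// }
/// ```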
pub async fn fetch(client: &Client, url: &str) -> FetchResult {
let started = Instant::now();
match client.get(url).send().await {
Ok(resp) => {
let final_url = resp.url().to_string();
let status = resp.status().as_u16();
let mut headers = HashMap::new();
for (k, v) in resp.headers().iter() {
if let Ok(s) = v.to_str() {
headers.insert(k.as_str().to_string(), s.to_string());
}
}
let content_type = headers
.iter()
.find(|(k, _)| k.eq_ignore_ascii_case("content-type"))
.map(|(_, v)| v.to_lowercase())
.unwrap_or_default();
let body = if content_type.contains("text/html") {
resp.bytes().await.map(|b| b.to_vec()).unwrap_or_default()
} else {
Vec::new()
};
let elapsed_ms = started.elapsed().as_millis() as i64;
// Reqwest doesn't expose the redirect chain, so we approximate
// with [final_status, final_url]. Mid-chain hops are lost.
let redirect_chain = vec![(status, final_url.clone())];
FetchResult {
url: final_url,
requested_url: url.to_string(),
status,
headers,
body,
content_type,
elapsed_ms,
redirect_chain,
error: String::new(),
}
}
Err(e) => FetchResult {
url: url.to_string(),
requested_url: url.to_string(),
status: 0,
headers: HashMap::new(),
body: Vec::new(),
content_type: String::new(),
elapsed_ms: started.elapsed().as_millis() as i64,
redirect_chain: Vec::new(),
error: e.to_string(),
},
}
}
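/// Check an external link with a cheap `HEAD` request and return the HTTP
/// status. Some servers reject `HEAD` (403/405/501) while serving `GET`
/// normally, so those statuses trigger a `GET` retry. Transport failures
/// return 0.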
pub async fn head_status(client: &Client, url: &str) -> u16 {
let timeout = Duration::from_secs(EXTERNAL_LINK_TIMEOUT_SECS);
match client.head(url).timeout(timeout).send().await {
Ok(r) => {
let s = r.status().as_u16();
if matches!(s, 403 | 405 | 501) {
client
.get(url)
.timeout(timeout)
.send()
.await
.map(|r| r.status().as_u16())
.unwrap_or(0)
} else {
s
}
}
Err(_) => 0,
}
}
/// Robots.txt evaluator. Uses the `robotstxt` crate. Treats parse errors
/// or missing files as "allow everything" so a broken robots.txt doesn't
/// tank the crawl.
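///
/// Illustrative usage (not compiled here; assumes a Tokio runtime and that
/// these items are in scope; the origin is a placeholder):
///
/// ```ignore
/// let client = make_client();
/// let (robots, _robots_url, _raw) = load_robots(&client, "https://example.com").await;
/// if robots.allowed("https://example.com/private/") {
///     // safe to crawl this URL under the `*` user agent rules
/// }
/// ```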
pub struct Robots {
text: Option<String>,
}
impl Robots {
pub fn allowed(&self, url: &str) -> bool {
let Some(text) = &self.text else { return true };
let mut matcher = robotstxt::DefaultMatcher::default();
matcher.one_agent_allowed_by_robots(text, "*", url)
}
fn empty() -> Self {
Self { text: None }
}
}
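/// Fetch `{base_origin}/robots.txt` and return the evaluator, the robots.txt
/// URL that was requested, and the raw body (only when the fetch returned 200).
/// Any failure falls back to an allow-everything `Robots`.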
pub async fn load_robots(client: &Client, base_origin: &str) -> (Robots, String, Option<String>) {
let robots_url = format!("{base_origin}/robots.txt");
let mut robots = Robots::empty();
let mut raw: Option<String> = None;
if let Ok(r) = client.get(&robots_url).send().await {
if r.status().as_u16() == 200 {
if let Ok(text) = r.text().await {
robots.text = Some(text.clone());
raw = Some(text);
}
}
}
(robots, robots_url, raw)
}
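/// Collect page URLs from the site's sitemap(s). Candidate sitemap URLs come
/// from `Sitemap:` lines in the supplied robots.txt text, falling back to
/// `{base_origin}/sitemap.xml`. Sitemap indexes are followed (a `<loc>` ending
/// in `.xml` or containing "sitemap" is treated as another sitemap), with at
/// most 20 sitemap fetches per crawl.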
pub async fn load_sitemap(
client: &Client,
base_origin: &str,
robots_text: Option<&str>,
) -> Vec<String> {
let mut candidates: Vec<String> = Vec::new();
if let Some(text) = robots_text {
for line in text.lines() {
let line = line.trim();
            // `to_ascii_lowercase` keeps the byte length identical, so the slice
            // below lines up with the original line and preserves the URL's casing
            // (plain `to_lowercase` can change the length for some non-ASCII input).
            if let Some(rest) = line.to_ascii_lowercase().strip_prefix("sitemap:") {
                let original_after = &line[line.len() - rest.len()..];
                candidates.push(original_after.trim().to_string());
            }
}
}
if candidates.is_empty() {
candidates.push(format!("{base_origin}/sitemap.xml"));
}
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut urls: Vec<String> = Vec::new();
let mut to_fetch: Vec<String> = candidates;
while let Some(smurl) = to_fetch.pop() {
if seen.len() >= 20 {
break;
}
if seen.contains(&smurl) {
continue;
}
seen.insert(smurl.clone());
let r = match client.get(&smurl).send().await {
Ok(r) => r,
Err(_) => continue,
};
if r.status().as_u16() != 200 {
continue;
}
let body = match r.bytes().await {
Ok(b) => b,
Err(_) => continue,
};
for loc in parse_sitemap_xml(&body) {
let lower = loc.to_lowercase();
if lower.ends_with(".xml") || lower.contains("sitemap") {
to_fetch.push(loc);
} else {
urls.push(loc);
}
}
}
urls
}
/// Pull `<loc>` text from a sitemap XML body. Lightweight regex-based
/// extraction; sitemaps are well-formed enough that we don't need a full
/// XML parser, and adding one just for this would be overkill.
fn parse_sitemap_xml(body: &[u8]) -> Vec<String> {
let s = String::from_utf8_lossy(body);
let re = regex::Regex::new(r"(?is)<loc>\s*([^<]+?)\s*</loc>").expect("regex");
re.captures_iter(&s)
.map(|c| c[1].trim().to_string())
.collect()
}
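/// Return true when `url`'s host matches `host`, treating `example.com` and
/// `www.example.com` as the same site in either direction. Unparseable URLs
/// and URLs without a host return false.
///
/// Illustrative usage (not compiled here; hosts are placeholders):
///
/// ```ignore
/// assert!(same_site("https://www.example.com/about", "example.com"));
/// assert!(!same_site("https://blog.example.com/", "example.com"));
/// ```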
pub fn same_site(url: &str, host: &str) -> bool {
let Ok(u) = Url::parse(url) else { return false };
let h_lower = host.to_lowercase();
let Some(u_host) = u.host_str() else { return false };
let u_lower = u_host.to_lowercase();
if u_lower == h_lower {
return true;
}
if u_lower == format!("www.{h_lower}") || h_lower == format!("www.{u_lower}") {
return true;
}
false
}
/// Hosts that aggressively block non-browser HTTP clients, returning 403/404
/// to anything that looks like a crawler regardless of the actual link state.
/// Treating their responses as broken-link signals produces only false
/// positives, so we skip them entirely from the external HEAD probe and the
/// broken-external-links check.
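///
/// Illustrative usage (not compiled here; assumes this item is in scope):
///
/// ```ignore
/// assert!(is_crawler_hostile("https://www.linkedin.com/in/someone"));
/// assert!(!is_crawler_hostile("https://example.com/"));
/// ```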
pub fn is_crawler_hostile(url: &str) -> bool {
let Ok(u) = Url::parse(url) else { return false };
let Some(host) = u.host_str() else { return false };
let host = host.to_lowercase();
matches!(
host.as_str(),
"linkedin.com" | "www.linkedin.com"
) || host.ends_with(".linkedin.com")
}