use reqwest::Client;
use std::collections::HashMap;
use std::time::{Duration, Instant};
use url::Url;
pub const PAGE_CAP: usize = 500;
pub const CONCURRENCY: usize = 4;
const REQUEST_TIMEOUT_SECS: u64 = 15;
const EXTERNAL_LINK_TIMEOUT_SECS: u64 = 8;
pub const CRAWL_DEADLINE_SECS: u64 = 540;
const USER_AGENT: &str = "status (+https://status.bythewood.me)";
#[derive(Debug, Clone)]
pub struct FetchResult {
pub url: String,
pub requested_url: String,
pub status: u16,
pub headers: HashMap<String, String>,
pub body: Vec<u8>,
pub content_type: String,
pub elapsed_ms: i64,
pub redirect_chain: Vec<(u16, String)>,
pub error: String,
}
pub fn make_client() -> Client {
Client::builder()
.user_agent(USER_AGENT)
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
.redirect(reqwest::redirect::Policy::limited(10))
.build()
.expect("client builds")
}
// Companion client used only to probe the server's `Content-Encoding` header.
// The main client has reqwest's `gzip` and `brotli` features on, which
// auto-decompress responses *and* strip `Content-Encoding` from the headers,
// so we can't tell from a normal fetch whether the server compressed the
// response. This client disables auto-decompression so the header survives.
pub fn make_probe_client() -> Client {
Client::builder()
.user_agent(USER_AGENT)
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
.redirect(reqwest::redirect::Policy::limited(10))
.gzip(false)
.brotli(false)
.build()
.expect("probe client builds")
}
/// Probe `url` with a non-decompressing client and return the server's
/// `Content-Encoding` (lowercased). Returns `None` if the server didn't
/// compress, the encoding was `identity`, or the request failed.
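///
/// Illustrative usage (not compiled here; assumes a Tokio runtime and that
/// these items are in scope; the URL is a placeholder):
///
/// ```ignore
/// let probe = make_probe_client();
/// if let Some(enc) = probe_compression(&probe, "https://example.com/").await {
///     // e.g. "gzip", "br", or "zstd"
///     println!("server compressed the response with {enc}");
/// }
/// ```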
pub async fn probe_compression(client: &Client, url: &str) -> Option<String> {
let resp = client
.get(url)
.header("Accept-Encoding", "gzip, br, zstd, deflate")
.send()
.await
.ok()?;
let enc = resp
.headers()
.get(reqwest::header::CONTENT_ENCODING)
.and_then(|v| v.to_str().ok())?
.trim()
.to_lowercase();
if enc.is_empty() || enc == "identity" {
None
} else {
Some(enc)
}
}
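/// Fetch `url` with the main client (redirects followed automatically) and
/// normalize the outcome into a `FetchResult`. Only `text/html` bodies are
/// buffered; other content types get an empty body. Transport errors yield
/// `status == 0` with the message in `error`.
///
/// Illustrative usage (not compiled here; assumes a Tokio runtime; the URL is
/// a placeholder):
///
/// ```ignore
/// let client = make_client();
/// let page = fetch(&client, "https://example.com/").await;
/// if page.error.is_empty() && page.status == 200 {
///     // `page.body` holds the HTML bytes when the response was text/html.
/// }
/// ```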
pub async fn fetch(client: &Client, url: &str) -> FetchResult {
let started = Instant::now();
match client.get(url).send().await {
Ok(resp) => {
let final_url = resp.url().to_string();
let status = resp.status().as_u16();
let mut headers = HashMap::new();
for (k, v) in resp.headers().iter() {
if let Ok(s) = v.to_str() {
headers.insert(k.as_str().to_string(), s.to_string());
}
}
let content_type = headers
.iter()
.find(|(k, _)| k.eq_ignore_ascii_case("content-type"))
.map(|(_, v)| v.to_lowercase())
.unwrap_or_default();
let body = if content_type.contains("text/html") {
resp.bytes().await.map(|b| b.to_vec()).unwrap_or_default()
} else {
Vec::new()
};
let elapsed_ms = started.elapsed().as_millis() as i64;
// Reqwest doesn't expose the redirect chain, so we approximate
// with [final_status, final_url]. Mid-chain hops are lost.
let redirect_chain = vec![(status, final_url.clone())];
FetchResult {
url: final_url,
requested_url: url.to_string(),
status,
headers,
body,
content_type,
elapsed_ms,
redirect_chain,
error: String::new(),
}
}
Err(e) => FetchResult {
url: url.to_string(),
requested_url: url.to_string(),
status: 0,
headers: HashMap::new(),
body: Vec::new(),
content_type: String::new(),
elapsed_ms: started.elapsed().as_millis() as i64,
redirect_chain: Vec::new(),
error: e.to_string(),
},
}
}
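/// Check an external link with a cheap `HEAD` request and return the HTTP
/// status. Some servers reject `HEAD` (403/405/501) while serving `GET`
/// normally, so those statuses trigger a `GET` retry. Transport failures
/// return 0.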
pub async fn head_status(client: &Client, url: &str) -> u16 {
let timeout = Duration::from_secs(EXTERNAL_LINK_TIMEOUT_SECS);
match client.head(url).timeout(timeout).send().await {
Ok(r) => {
let s = r.status().as_u16();
if matches!(s, 403 | 405 | 501) {
client
.get(url)
.timeout(timeout)
.send()
.await
.map(|r| r.status().as_u16())
.unwrap_or(0)
} else {
s
}
}
Err(_) => 0,
}
}
/// Robots.txt evaluator. Uses the `robotstxt` crate. Treats parse errors
/// or missing files as "allow everything" so a broken robots.txt doesn't
/// tank the crawl.
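///
/// Illustrative usage (not compiled here; assumes a Tokio runtime and that
/// these items are in scope; the origin is a placeholder):
///
/// ```ignore
/// let client = make_client();
/// let (robots, _robots_url, _raw) = load_robots(&client, "https://example.com").await;
/// if robots.allowed("https://example.com/private/") {
///     // safe to crawl this URL under the `*` user agent rules
/// }
/// ```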
pub struct Robots {
text: Option<String>,
}
impl Robots {
pub fn allowed(&self, url: &str) -> bool {
let Some(text) = &self.text else { return true };
let mut matcher = robotstxt::DefaultMatcher::default();
matcher.one_agent_allowed_by_robots(text, "*", url)
}
fn empty() -> Self {
Self { text: None }
}
}
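/// Fetch `{base_origin}/robots.txt` and return the evaluator, the robots.txt
/// URL that was requested, and the raw body (only when the fetch returned 200).
/// Any failure falls back to an allow-everything `Robots`.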
pub async fn load_robots(client: &Client, base_origin: &str) -> (Robots, String, Option<String>) {
let robots_url = format!("{base_origin}/robots.txt");
let mut robots = Robots::empty();
let mut raw: Option<String> = None;
if let Ok(r) = client.get(&robots_url).send().await {
if r.status().as_u16() == 200 {
if let Ok(text) = r.text().await {
robots.text = Some(text.clone());
raw = Some(text);
}
}
}
(robots, robots_url, raw)
}
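/// Collect page URLs from the site's sitemap(s). Candidate sitemap URLs come
/// from `Sitemap:` lines in the supplied robots.txt text, falling back to
/// `{base_origin}/sitemap.xml`. Sitemap indexes are followed (a `<loc>` ending
/// in `.xml` or containing "sitemap" is treated as another sitemap), with at
/// most 20 sitemap fetches per crawl.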
pub async fn load_sitemap(
client: &Client,
base_origin: &str,
robots_text: Option<&str>,
) -> Vec<String> {
let mut candidates: Vec<String> = Vec::new();
if let Some(text) = robots_text {
for line in text.lines() {
let line = line.trim();
            // `to_ascii_lowercase` keeps the byte length identical, so the slice
            // below lines up with the original line and preserves the URL's casing
            // (plain `to_lowercase` can change the length for some non-ASCII input).
            if let Some(rest) = line.to_ascii_lowercase().strip_prefix("sitemap:") {
                let original_after = &line[line.len() - rest.len()..];
                candidates.push(original_after.trim().to_string());
            }
}
}
if candidates.is_empty() {
candidates.push(format!("{base_origin}/sitemap.xml"));
}
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut urls: Vec<String> = Vec::new();
let mut to_fetch: Vec<String> = candidates;
while let Some(smurl) = to_fetch.pop() {
if seen.len() >= 20 {
break;
}
if seen.contains(&smurl) {
continue;
}
seen.insert(smurl.clone());
let r = match client.get(&smurl).send().await {
Ok(r) => r,
Err(_) => continue,
};
if r.status().as_u16() != 200 {
continue;
}
let body = match r.bytes().await {
Ok(b) => b,
Err(_) => continue,
};
for loc in parse_sitemap_xml(&body) {
let lower = loc.to_lowercase();
if lower.ends_with(".xml") || lower.contains("sitemap") {
to_fetch.push(loc);
} else {
urls.push(loc);
}
}
}
urls
}
/// Pull `<loc>` text from a sitemap XML body. Lightweight regex-based
/// extraction; sitemaps are well-formed enough that we don't need a full
/// XML parser, and adding one just for this would be overkill.
fn parse_sitemap_xml(body: &[u8]) -> Vec<String> {
let s = String::from_utf8_lossy(body);
let re = regex::Regex::new(r"(?is)<loc>\s*([^<]+?)\s*</loc>").expect("regex");
re.captures_iter(&s)
.map(|c| c[1].trim().to_string())
.collect()
}
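/// Return true when `url`'s host matches `host`, treating `example.com` and
/// `www.example.com` as the same site in either direction. Unparseable URLs
/// and URLs without a host return false.
///
/// Illustrative usage (not compiled here; hosts are placeholders):
///
/// ```ignore
/// assert!(same_site("https://www.example.com/about", "example.com"));
/// assert!(!same_site("https://blog.example.com/", "example.com"));
/// ```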
pub fn same_site(url: &str, host: &str) -> bool {
let Ok(u) = Url::parse(url) else { return false };
let h_lower = host.to_lowercase();
let Some(u_host) = u.host_str() else { return false };
let u_lower = u_host.to_lowercase();
if u_lower == h_lower {
return true;
}
if u_lower == format!("www.{h_lower}") || h_lower == format!("www.{u_lower}") {
return true;
}
false
}
/// Hosts that aggressively block non-browser HTTP clients, returning 403/404
/// to anything that looks like a crawler regardless of the actual link state.
/// Treating their responses as broken-link signals produces only false
/// positives, so we skip them entirely from the external HEAD probe and the
/// broken-external-links check.
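///
/// Illustrative usage (not compiled here; assumes this item is in scope):
///
/// ```ignore
/// assert!(is_crawler_hostile("https://www.linkedin.com/in/someone"));
/// assert!(!is_crawler_hostile("https://example.com/"));
/// ```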
pub fn is_crawler_hostile(url: &str) -> bool {
let Ok(u) = Url::parse(url) else { return false };
let Some(host) = u.host_str() else { return false };
let host = host.to_lowercase();
matches!(
host.as_str(),
"linkedin.com" | "www.linkedin.com"
) || host.ends_with(".linkedin.com")
}