//! SEO / accessibility / performance / content / security checks.
//!
//! Each check takes a `Ctx` and returns a list of insight values
//! (`{url, issue, item, type, severity}` JSON objects). Direct port of
//! `crawler/checks.py`.

use super::{fetcher::same_site, Page, RobotsCtx};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};

// Values for the `type` field of an insight record (see `insight()`).
const TYPE_SEO: &str = "seo";
const TYPE_LINKS: &str = "links";
const TYPE_A11Y: &str = "accessibility";
const TYPE_CONTENT: &str = "content";
const TYPE_PERF: &str = "performance";
const TYPE_SEC: &str = "security";

// Values for the `severity` field of an insight record, most to least severe.
const SEV_ERROR: &str = "error";
const SEV_WARN: &str = "warning";
const SEV_INFO: &str = "info";

/// Everything the checks need about a finished crawl. Borrowed, read-only.
pub struct Ctx<'a> {
    /// URL the crawl started from; used as the `url` of site-wide insights
    /// (robots/sitemap/compression checks).
    pub start_url: &'a str,
    /// Host used with `same_site` to classify links as internal vs external.
    pub host: &'a str,
    /// Every fetched page, HTML or not (e.g. `check_oversized_pages` scans all).
    pub pages: &'a [Page],
    /// Subset of `pages` that parsed as HTML; each entry must have `Page::html`
    /// populated — `html()` panics otherwise.
    pub html_pages: &'a [&'a Page],
    /// Final HTTP status per crawled URL; 0 is used for "unreachable"
    /// (see `check_broken_internal_links`).
    pub status_map: &'a HashMap<String, u16>,
    /// Status per probed external link URL; 0 means unreachable.
    pub external_link_status: &'a HashMap<String, u16>,
    /// URLs listed in the site's sitemap(s); empty when none was found.
    pub sitemap_urls: &'a [String],
    /// robots.txt probe result (existence, URL, sitemap reference).
    pub robots: &'a RobotsCtx,
    /// Server's `Content-Encoding` for `start_url`, lowercased. `None` means
    /// the response was uncompressed.
    pub compression: Option<&'a str>,
}

/// HTTP status codes treated as redirects (not "broken") by link checks.
fn redirect_codes() -> HashSet<u16> {
    let mut codes = HashSet::new();
    codes.extend([301u16, 302, 303, 307, 308]);
    codes
}

/// Build one insight record in the shape the reporting layer consumes:
/// `{url, issue, item, type, severity}`. Key order matches the original
/// literal in case serde_json's `preserve_order` feature is enabled.
fn insight(url: &str, issue: &str, type_: &str, severity: &str, item: &str) -> Value {
    json!({ "url": url, "issue": issue, "item": item, "type": type_, "severity": severity })
}

/// Lowercase `s` and collapse every whitespace run to a single space
/// (trimming both ends) so titles/descriptions/h1s compare loosely.
fn normalize(s: &str) -> String {
    let lowered = s.to_lowercase();
    let mut out = String::with_capacity(lowered.len());
    for (i, word) in lowered.split_whitespace().enumerate() {
        if i > 0 {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}

/// Bucket `pages` by the normalized form of the non-empty string `f` extracts
/// from each page; pages yielding an empty string are skipped entirely.
fn group_by<'a, F: Fn(&Page) -> String>(
    pages: &[&'a Page],
    f: F,
) -> HashMap<String, Vec<&'a Page>> {
    let mut buckets: HashMap<String, Vec<&'a Page>> = HashMap::new();
    for page in pages {
        let key = f(page);
        if key.is_empty() {
            continue;
        }
        buckets.entry(normalize(&key)).or_default().push(page);
    }
    buckets
}

/// Parsed HTML of `p`. Panics if `p.html` is `None`; callers only invoke this
/// on members of `Ctx::html_pages`, which are guaranteed to be parsed HTML.
fn html<'a>(p: &'a Page) -> &'a super::parser::ParsedHtml {
    p.html.as_ref().expect("called on html page")
}

// ---------- core metadata ----------

/// Error for every HTML page whose `<title>` is empty.
fn check_title_missing(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).title.is_empty() {
            out.push(insight(&p.url, "Page has no title", TYPE_SEO, SEV_ERROR, ""));
        }
    }
    out
}

/// Warn when a non-empty title falls outside 30-60 characters.
fn check_title_length(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter_map(|p| {
            let t = &html(p).title;
            let n = t.chars().count();
            if t.is_empty() || (30..=60).contains(&n) {
                return None;
            }
            Some(insight(
                &p.url,
                &format!("Title length is {n} chars (recommended 30-60)"),
                TYPE_SEO,
                SEV_WARN,
                t,
            ))
        })
        .collect()
}

/// Warn on every page whose normalized title is shared with another page.
fn check_duplicate_titles(ctx: &Ctx) -> Vec<Value> {
    group_by(ctx.html_pages, |p| html(p).title.clone())
        .into_values()
        .filter(|group| group.len() > 1)
        .flatten()
        .map(|p| insight(&p.url, "Duplicate title", TYPE_SEO, SEV_WARN, &html(p).title))
        .collect()
}

/// Error for every HTML page with no meta description.
fn check_description_missing(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).description.is_empty() {
            out.push(insight(&p.url, "Page has no meta description", TYPE_SEO, SEV_ERROR, ""));
        }
    }
    out
}

/// Warn when a non-empty meta description falls outside 70-160 characters.
fn check_description_length(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter_map(|p| {
            let d = &html(p).description;
            let n = d.chars().count();
            if d.is_empty() || (70..=160).contains(&n) {
                return None;
            }
            Some(insight(
                &p.url,
                &format!("Description length is {n} chars (recommended 70-160)"),
                TYPE_SEO,
                SEV_WARN,
                d,
            ))
        })
        .collect()
}

/// Warn on every page whose normalized meta description matches another page's.
fn check_duplicate_descriptions(ctx: &Ctx) -> Vec<Value> {
    group_by(ctx.html_pages, |p| html(p).description.clone())
        .into_values()
        .filter(|group| group.len() > 1)
        .flatten()
        .map(|p| {
            insight(
                &p.url,
                "Duplicate meta description",
                TYPE_SEO,
                SEV_WARN,
                &html(p).description,
            )
        })
        .collect()
}

/// All `<h1>` texts recorded for `p`, or an empty slice when there are none.
fn h1s<'a>(p: &'a Page) -> &'a [String] {
    match html(p).headings.get("h1") {
        Some(v) => v.as_slice(),
        None => &[],
    }
}

/// Error for every HTML page without any `<h1>`.
fn check_h1_missing(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if h1s(p).is_empty() {
            out.push(insight(&p.url, "Page has no h1", TYPE_SEO, SEV_ERROR, ""));
        }
    }
    out
}

/// Warn when a page has more than one `<h1>`; item shows up to 3 samples.
fn check_h1_multiple(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter_map(|p| {
            let headings = h1s(p);
            if headings.len() <= 1 {
                return None;
            }
            let sample = headings[..headings.len().min(3)].join(" | ");
            Some(insight(
                &p.url,
                &format!("Page has {} h1 tags (expected 1)", headings.len()),
                TYPE_SEO,
                SEV_WARN,
                &sample,
            ))
        })
        .collect()
}

/// Warn when the first `<h1>` falls outside 20-70 characters
/// (only the first h1 is measured).
fn check_h1_length(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        let Some(first) = h1s(p).first() else { continue };
        let n = first.chars().count();
        if (20..=70).contains(&n) {
            continue;
        }
        out.push(insight(
            &p.url,
            &format!("H1 length is {n} chars (recommended 20-70)"),
            TYPE_SEO,
            SEV_WARN,
            first,
        ));
    }
    out
}

/// Warn on pages whose (normalized) first `<h1>` matches another page's.
fn check_duplicate_h1s(ctx: &Ctx) -> Vec<Value> {
    let mut by_text: HashMap<String, Vec<(String, String)>> = HashMap::new();
    for p in ctx.html_pages {
        if let Some(first) = h1s(p).first() {
            by_text
                .entry(normalize(first))
                .or_default()
                .push((p.url.clone(), first.clone()));
        }
    }
    by_text
        .into_values()
        .filter(|group| group.len() > 1)
        .flatten()
        .map(|(url, item)| insight(&url, "Duplicate h1", TYPE_SEO, SEV_WARN, &item))
        .collect()
}

/// Info when a page's heading levels skip (e.g. h2 -> h4).
/// Only the first skip per page is reported.
fn check_heading_hierarchy(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        let headings = &html(p).headings;
        // Levels (1..=6) that actually have at least one heading, in order.
        let mut present: Vec<u8> = Vec::new();
        for lvl in 1..=6u8 {
            let has_any = headings
                .get(&format!("h{lvl}"))
                .map(|v| !v.is_empty())
                .unwrap_or(false);
            if has_any {
                present.push(lvl);
            }
        }
        if let Some(pair) = present.windows(2).find(|pair| pair[1] - pair[0] > 1) {
            out.push(insight(
                &p.url,
                &format!("Heading hierarchy skips from h{} to h{}", pair[0], pair[1]),
                TYPE_SEO,
                SEV_INFO,
                "",
            ));
        }
    }
    out
}

/// Warn for every HTML page without a canonical URL.
fn check_canonical_missing(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).canonical.is_empty() {
            out.push(insight(&p.url, "Page has no canonical URL", TYPE_SEO, SEV_WARN, ""));
        }
    }
    out
}

/// Warn when a page's canonical URL is not on the crawled host.
fn check_canonical_offdomain(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter_map(|p| {
            let c = &html(p).canonical;
            let off_domain = !c.is_empty() && !same_site(c, ctx.host);
            off_domain.then(|| {
                insight(&p.url, "Canonical URL points off-domain", TYPE_SEO, SEV_WARN, c)
            })
        })
        .collect()
}

/// Error when a page's canonical URL was crawled and returned anything other
/// than 200 (redirect statuses included). Canonicals we never fetched are skipped.
fn check_canonical_broken(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        let c = &html(p).canonical;
        if c.is_empty() {
            continue;
        }
        let Some(s) = ctx.status_map.get(c) else { continue };
        if *s == 200 {
            continue;
        }
        out.push(insight(
            &p.url,
            &format!("Canonical URL returns {s}"),
            TYPE_SEO,
            SEV_ERROR,
            c,
        ));
    }
    out
}

/// Warn when the meta robots tag contains "noindex" (case-insensitive).
fn check_robots_meta_noindex(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter(|p| html(p).robots_meta.to_lowercase().contains("noindex"))
        .map(|p| {
            insight(
                &p.url,
                "Page has noindex in meta robots tag",
                TYPE_SEO,
                SEV_WARN,
                &html(p).robots_meta,
            )
        })
        .collect()
}

/// Warn when the `<html>` element has no lang attribute.
fn check_lang_missing(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).lang.is_empty() {
            out.push(insight(&p.url, "HTML lang attribute missing", TYPE_SEO, SEV_WARN, ""));
        }
    }
    out
}

/// Warn when the viewport meta tag is absent (mobile rendering).
fn check_viewport_missing(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).viewport.is_empty() {
            out.push(insight(&p.url, "Viewport meta tag missing (mobile)", TYPE_SEO, SEV_WARN, ""));
        }
    }
    out
}

/// Info listing which of the four core Open Graph tags a page lacks.
fn check_og_incomplete(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        let og = &html(p).og;
        // Check order fixed so the message lists tags in a stable order.
        let tags = [
            ("og:title", og.title.is_empty()),
            ("og:description", og.description.is_empty()),
            ("og:image", og.image.is_empty()),
            ("og:url", og.url.is_empty()),
        ];
        let missing: Vec<&str> = tags
            .iter()
            .filter(|(_, absent)| *absent)
            .map(|(tag, _)| *tag)
            .collect();
        if !missing.is_empty() {
            out.push(insight(
                &p.url,
                &format!("Open Graph tags missing: {}", missing.join(", ")),
                TYPE_SEO,
                SEV_INFO,
                "",
            ));
        }
    }
    out
}

/// Info when the Twitter card meta tag is absent.
fn check_twitter_card(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).twitter.card.is_empty() {
            out.push(insight(&p.url, "Twitter card meta tag missing", TYPE_SEO, SEV_INFO, ""));
        }
    }
    out
}

/// Info when no favicon link was found.
fn check_favicon(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if html(p).favicon.is_empty() {
            out.push(insight(&p.url, "Favicon link missing", TYPE_SEO, SEV_INFO, ""));
        }
    }
    out
}

/// Warn once per page when any JSON-LD block failed to parse
/// (the parser records failures as `Value::Null`).
fn check_json_ld_parse_error(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter(|p| html(p).json_ld.iter().any(|v| v.is_null()))
        .map(|p| {
            insight(
                &p.url,
                "JSON-LD structured data failed to parse",
                TYPE_SEO,
                SEV_WARN,
                "",
            )
        })
        .collect()
}

// ---------- links ----------

/// Error for internal links whose target returned neither 200 nor a redirect.
/// Status 0 is reported as "unreachable". Each (page, target) pair is
/// reported at most once.
fn check_broken_internal_links(ctx: &Ctx) -> Vec<Value> {
    let redirects = redirect_codes();
    let mut seen: HashSet<(String, String)> = HashSet::new();
    let mut out = Vec::new();
    for p in ctx.html_pages {
        for link in &html(p).links {
            if !same_site(&link.url, ctx.host) {
                continue;
            }
            let Some(status) = ctx.status_map.get(&link.url) else { continue };
            if *status == 200 || redirects.contains(status) {
                continue;
            }
            if !seen.insert((p.url.clone(), link.url.clone())) {
                continue;
            }
            let label = match *status {
                0 => "unreachable".to_string(),
                s => format!("status {s}"),
            };
            out.push(insight(
                &p.url,
                &format!("Broken internal link ({label})"),
                TYPE_LINKS,
                SEV_ERROR,
                &link.url,
            ));
        }
    }
    out
}

/// Warn for external links that were unreachable (status 0) or returned >= 400.
/// Each (page, target) pair is reported at most once.
fn check_broken_external_links(ctx: &Ctx) -> Vec<Value> {
    let mut seen: HashSet<(String, String)> = HashSet::new();
    let mut out = Vec::new();
    for p in ctx.html_pages {
        for link in &html(p).links {
            if same_site(&link.url, ctx.host) {
                continue;
            }
            let Some(status) = ctx.external_link_status.get(&link.url) else { continue };
            let broken = *status == 0 || *status >= 400;
            // `seen.insert` is only reached (and the key only recorded) for
            // broken links, matching the dedup behavior of the original.
            if !broken || !seen.insert((p.url.clone(), link.url.clone())) {
                continue;
            }
            let label = match *status {
                0 => "unreachable".to_string(),
                s => format!("status {s}"),
            };
            out.push(insight(
                &p.url,
                &format!("Broken external link ({label})"),
                TYPE_LINKS,
                SEV_WARN,
                &link.url,
            ));
        }
    }
    out
}

/// Info when a page's redirect chain has more than two entries; the item
/// shows the status codes of every hop joined with " -> ".
fn check_redirect_chains(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.pages {
        let chain = &p.redirect_chain;
        if chain.len() <= 2 {
            continue;
        }
        let hops = chain.len() - 1;
        let mut codes = String::new();
        for (i, (code, _)) in chain.iter().enumerate() {
            if i > 0 {
                codes.push_str(" -> ");
            }
            codes.push_str(&code.to_string());
        }
        out.push(insight(
            &p.url,
            &format!("Redirect chain has {hops} hops"),
            TYPE_LINKS,
            SEV_INFO,
            &codes,
        ));
    }
    out
}

/// Info for internal links carrying rel=nofollow; one report per
/// (page, target) pair.
fn check_nofollow_internal_links(ctx: &Ctx) -> Vec<Value> {
    let mut seen: HashSet<(String, String)> = HashSet::new();
    let mut out = Vec::new();
    for p in ctx.html_pages {
        for link in &html(p).links {
            if !same_site(&link.url, ctx.host) {
                continue;
            }
            if !link.rel.iter().any(|r| r == "nofollow") {
                continue;
            }
            if !seen.insert((p.url.clone(), link.url.clone())) {
                continue;
            }
            out.push(insight(
                &p.url,
                "Internal link has rel=nofollow",
                TYPE_LINKS,
                SEV_INFO,
                &link.url,
            ));
        }
    }
    out
}

// ---------- robots / sitemap ----------

/// Site-wide warning when no robots.txt was found.
fn check_robots_missing(ctx: &Ctx) -> Vec<Value> {
    if ctx.robots.exists {
        return Vec::new();
    }
    vec![insight(
        ctx.start_url,
        "robots.txt missing",
        TYPE_SEO,
        SEV_WARN,
        &ctx.robots.url,
    )]
}

/// Site-wide warning when no sitemap URLs were discovered.
fn check_sitemap_missing(ctx: &Ctx) -> Vec<Value> {
    if !ctx.sitemap_urls.is_empty() {
        return Vec::new();
    }
    vec![insight(
        ctx.start_url,
        "sitemap.xml missing or empty",
        TYPE_SEO,
        SEV_WARN,
        "",
    )]
}

/// Site-wide info when robots.txt exists and a sitemap exists, but robots.txt
/// never references it.
fn check_sitemap_not_in_robots(ctx: &Ctx) -> Vec<Value> {
    let applies =
        ctx.robots.exists && !ctx.sitemap_urls.is_empty() && !ctx.robots.references_sitemap;
    if !applies {
        return Vec::new();
    }
    vec![insight(
        ctx.start_url,
        "robots.txt does not reference a sitemap",
        TYPE_SEO,
        SEV_INFO,
        "",
    )]
}

/// Error for sitemap URLs that were crawled and returned neither 200 nor a
/// redirect; URLs we never fetched are skipped.
fn check_sitemap_broken_urls(ctx: &Ctx) -> Vec<Value> {
    let redirects = redirect_codes();
    ctx.sitemap_urls
        .iter()
        .filter_map(|url| {
            let s = ctx.status_map.get(url)?;
            if *s == 200 || redirects.contains(s) {
                return None;
            }
            Some(insight(
                url,
                &format!("URL listed in sitemap returns {s}"),
                TYPE_SEO,
                SEV_ERROR,
                "",
            ))
        })
        .collect()
}

/// Info for indexable crawled pages absent from the sitemap. No-op when no
/// sitemap exists; noindex pages are deliberately excluded.
fn check_pages_missing_from_sitemap(ctx: &Ctx) -> Vec<Value> {
    if ctx.sitemap_urls.is_empty() {
        return Vec::new();
    }
    let listed: HashSet<&str> = ctx.sitemap_urls.iter().map(|s| s.as_str()).collect();
    ctx.html_pages
        .iter()
        .filter(|p| !listed.contains(p.url.as_str()))
        .filter(|p| !html(p).robots_meta.to_lowercase().contains("noindex"))
        .map(|p| insight(&p.url, "Page not listed in sitemap", TYPE_SEO, SEV_INFO, ""))
        .collect()
}

// ---------- accessibility ----------

/// Warn once per page for images with no alt attribute at all (an empty alt
/// still counts as present); item shows the first offending src, truncated.
fn check_images_missing_alt(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        let missing: Vec<_> = html(p).images.iter().filter(|i| i.alt.is_none()).collect();
        let Some(first) = missing.first() else { continue };
        let sample: String = first.src.chars().take(160).collect();
        out.push(insight(
            &p.url,
            &format!("{} image(s) missing alt attribute", missing.len()),
            TYPE_A11Y,
            SEV_WARN,
            &sample,
        ));
    }
    out
}

/// Info once per page for links without visible text; item shows the first
/// offending URL, truncated to 160 chars.
fn check_empty_anchor_text(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        let blank: Vec<_> = html(p).links.iter().filter(|l| l.text.is_empty()).collect();
        let Some(first) = blank.first() else { continue };
        let sample: String = first.url.chars().take(160).collect();
        out.push(insight(
            &p.url,
            &format!("{} link(s) have no visible text", blank.len()),
            TYPE_A11Y,
            SEV_INFO,
            &sample,
        ));
    }
    out
}

/// Warn once per page for the FIRST form containing inputs that have no label:
/// an input counts as unlabeled when its type is visible (not hidden/submit/
/// button/reset/image), it has no aria-label, and no `<label for=…>` matches
/// its id.
fn check_form_inputs_unlabeled(ctx: &Ctx) -> Vec<Value> {
    let skipped_types: HashSet<&str> =
        ["hidden", "submit", "button", "reset", "image"].into_iter().collect();
    let mut out = Vec::new();
    for p in ctx.html_pages {
        // find_map stops at the first offending form, matching the original's
        // `continue 'outer` (at most one insight per page).
        let offending = html(p).forms.iter().find_map(|form| {
            let labeled_ids: HashSet<&str> =
                form.label_fors.iter().map(|s| s.as_str()).collect();
            let unlabeled = form
                .inputs
                .iter()
                .filter(|i| !skipped_types.contains(i.r#type.to_lowercase().as_str()))
                .filter(|i| i.aria_label.is_none())
                .filter(|i| match &i.id {
                    Some(id) => !labeled_ids.contains(id.as_str()),
                    None => true,
                })
                .count();
            if unlabeled > 0 {
                Some((unlabeled, &form.action))
            } else {
                None
            }
        });
        if let Some((unlabeled, action)) = offending {
            out.push(insight(
                &p.url,
                &format!("{unlabeled} form input(s) without associated label"),
                TYPE_A11Y,
                SEV_WARN,
                action,
            ));
        }
    }
    out
}

// ---------- content ----------

/// Warn for pages with fewer than 300 visible words.
fn check_thin_content(ctx: &Ctx) -> Vec<Value> {
    ctx.html_pages
        .iter()
        .filter(|p| html(p).word_count < 300)
        .map(|p| {
            let wc = html(p).word_count;
            insight(
                &p.url,
                &format!("Thin content ({wc} words)"),
                TYPE_CONTENT,
                SEV_WARN,
                "",
            )
        })
        .collect()
}

/// Warn for pages whose visible-text hash matches another page's; the item
/// names one of the other pages in the duplicate group.
fn check_duplicate_content(ctx: &Ctx) -> Vec<Value> {
    let mut by_hash: HashMap<String, Vec<String>> = HashMap::new();
    for p in ctx.html_pages {
        let hash = &html(p).text_hash;
        if hash.is_empty() {
            continue;
        }
        by_hash.entry(hash.clone()).or_default().push(p.url.clone());
    }
    let mut out = Vec::new();
    for urls in by_hash.values().filter(|group| group.len() > 1) {
        for u in urls {
            // Pick any other URL in the group as the "duplicate of" item.
            let other = urls.iter().find(|x| *x != u).unwrap_or(&urls[0]);
            out.push(insight(
                u,
                "Page has duplicate visible content with another page",
                TYPE_CONTENT,
                SEV_WARN,
                other,
            ));
        }
    }
    out
}

// ---------- performance ----------

/// Warn for HTML pages that took longer than 1000 ms to respond.
fn check_slow_pages(ctx: &Ctx) -> Vec<Value> {
    ctx.pages
        .iter()
        .filter(|p| p.is_html && p.elapsed_ms > 1000)
        .map(|p| {
            insight(
                &p.url,
                &format!("Slow response ({} ms)", p.elapsed_ms),
                TYPE_PERF,
                SEV_WARN,
                "",
            )
        })
        .collect()
}

/// Site-wide info when the start URL's response carried no `Content-Encoding`.
fn check_missing_compression(ctx: &Ctx) -> Vec<Value> {
    // Probed at the start URL with a non-decompressing client (the main
    // crawler client has reqwest's gzip/brotli features on, which strip
    // `Content-Encoding` after auto-decompression — so we can't read it from
    // the per-page headers). Compression is a server-wide config in practice,
    // so one probe is enough.
    match ctx.compression {
        Some(_) => Vec::new(),
        None => vec![insight(
            ctx.start_url,
            "Response not compressed (no Content-Encoding header)",
            TYPE_PERF,
            SEV_INFO,
            "",
        )],
    }
}

/// Warn for any fetched resource over 500 KB (all pages, not just HTML).
fn check_oversized_pages(ctx: &Ctx) -> Vec<Value> {
    ctx.pages
        .iter()
        .filter(|p| p.bytes > 500_000)
        .map(|p| {
            insight(
                &p.url,
                &format!("Oversized page ({} KB)", p.bytes / 1024),
                TYPE_PERF,
                SEV_WARN,
                "",
            )
        })
        .collect()
}

// ---------- security ----------

/// Warn once per https:// page that loads any http:// resource; the item
/// shows the first insecure resource URL.
fn check_mixed_content(ctx: &Ctx) -> Vec<Value> {
    let mut out = Vec::new();
    for p in ctx.html_pages {
        if !p.url.starts_with("https://") {
            continue;
        }
        let insecure: Vec<&String> = html(p)
            .resources
            .iter()
            .filter(|r| r.starts_with("http://"))
            .collect();
        let Some(first) = insecure.first() else { continue };
        out.push(insight(
            &p.url,
            &format!(
                "Mixed content: {} http:// resource(s) on https:// page",
                insecure.len()
            ),
            TYPE_SEC,
            SEV_WARN,
            first.as_str(),
        ));
    }
    out
}

/// Run every check in a fixed order and concatenate their insights.
///
/// The registry order below is load-bearing: it determines the order of the
/// returned insights, so keep it stable. Each check runs under
/// `catch_unwind` so one panicking check cannot take down the whole report —
/// a panicked check contributes nothing and is logged by its registry name.
pub fn run_all(ctx: &Ctx) -> Vec<Value> {
    // (name, function) pairs; the name is only used in the panic log message.
    let checks: &[(&'static str, fn(&Ctx) -> Vec<Value>)] = &[
        ("title_missing", check_title_missing),
        ("title_length", check_title_length),
        ("duplicate_titles", check_duplicate_titles),
        ("description_missing", check_description_missing),
        ("description_length", check_description_length),
        ("duplicate_descriptions", check_duplicate_descriptions),
        ("h1_missing", check_h1_missing),
        ("h1_multiple", check_h1_multiple),
        ("h1_length", check_h1_length),
        ("duplicate_h1s", check_duplicate_h1s),
        ("heading_hierarchy", check_heading_hierarchy),
        ("canonical_missing", check_canonical_missing),
        ("canonical_offdomain", check_canonical_offdomain),
        ("canonical_broken", check_canonical_broken),
        ("robots_meta_noindex", check_robots_meta_noindex),
        ("lang_missing", check_lang_missing),
        ("viewport_missing", check_viewport_missing),
        ("og_incomplete", check_og_incomplete),
        ("twitter_card", check_twitter_card),
        ("favicon", check_favicon),
        ("json_ld_parse_error", check_json_ld_parse_error),
        ("broken_internal_links", check_broken_internal_links),
        ("broken_external_links", check_broken_external_links),
        ("redirect_chains", check_redirect_chains),
        ("nofollow_internal_links", check_nofollow_internal_links),
        ("robots_missing", check_robots_missing),
        ("sitemap_missing", check_sitemap_missing),
        ("sitemap_not_in_robots", check_sitemap_not_in_robots),
        ("sitemap_broken_urls", check_sitemap_broken_urls),
        ("pages_missing_from_sitemap", check_pages_missing_from_sitemap),
        ("images_missing_alt", check_images_missing_alt),
        ("empty_anchor_text", check_empty_anchor_text),
        ("form_inputs_unlabeled", check_form_inputs_unlabeled),
        ("thin_content", check_thin_content),
        ("duplicate_content", check_duplicate_content),
        ("slow_pages", check_slow_pages),
        ("missing_compression", check_missing_compression),
        ("oversized_pages", check_oversized_pages),
        ("mixed_content", check_mixed_content),
    ];
    let mut out = Vec::new();
    for (name, fn_) in checks {
        // AssertUnwindSafe: `ctx` is only read, so a panic mid-check cannot
        // leave it in a broken state observable by later checks.
        let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| fn_(ctx)));
        match r {
            Ok(v) => out.extend(v),
            Err(_) => tracing::warn!("[crawler] check {name} panicked"),
        }
    }
    out
}