//! SEO / accessibility / performance / content / security checks.
//!
//! Each check takes a `Ctx` and returns a list of insight values
//! (`{url, issue, item, type, severity}` JSON objects). Direct port of
//! `crawler/checks.py`.
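//!
//! For reference, an emitted insight has this shape (values here are
//! illustrative):
//!
//! ```json
//! {
//!   "url": "https://example.com/",
//!   "issue": "Page has no title",
//!   "item": "",
//!   "type": "seo",
//!   "severity": "error"
//! }
//! ```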
use super::{fetcher::same_site, Page, RobotsCtx};
use serde_json::{json, Value};
use std::collections::{HashMap, HashSet};
const TYPE_SEO: &str = "seo";
const TYPE_LINKS: &str = "links";
const TYPE_A11Y: &str = "accessibility";
const TYPE_CONTENT: &str = "content";
const TYPE_PERF: &str = "performance";
const TYPE_SEC: &str = "security";
const SEV_ERROR: &str = "error";
const SEV_WARN: &str = "warning";
const SEV_INFO: &str = "info";
pub struct Ctx<'a> {
pub start_url: &'a str,
pub host: &'a str,
pub pages: &'a [Page],
pub html_pages: &'a [&'a Page],
pub status_map: &'a HashMap<String, u16>,
pub external_link_status: &'a HashMap<String, u16>,
pub sitemap_urls: &'a [String],
pub robots: &'a RobotsCtx,
/// Server's `Content-Encoding` for `start_url`, lowercased. `None` means
/// the response was uncompressed.
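/// Typical compressed values are `"gzip"` and `"br"`.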
pub compression: Option<&'a str>,
}
fn redirect_codes() -> HashSet<u16> {
[301, 302, 303, 307, 308].into_iter().collect()
}
fn insight(url: &str, issue: &str, type_: &str, severity: &str, item: &str) -> Value {
json!({
"url": url,
"issue": issue,
"item": item,
"type": type_,
"severity": severity,
})
}
fn normalize(s: &str) -> String {
s.to_lowercase().split_whitespace().collect::<Vec<_>>().join(" ")
}
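/// Buckets pages by a normalized, non-empty key; used by the duplicate
/// title/description checks below.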
fn group_by<'a, F: Fn(&Page) -> String>(
pages: &[&'a Page],
f: F,
) -> HashMap<String, Vec<&'a Page>> {
let mut out: HashMap<String, Vec<&'a Page>> = HashMap::new();
for &p in pages {
let v = f(p);
if !v.is_empty() {
out.entry(normalize(&v)).or_default().push(p);
}
}
out
}
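/// Panics unless `p` came via `Ctx::html_pages`, which holds only pages
/// with parsed HTML.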
fn html<'a>(p: &'a Page) -> &'a super::parser::ParsedHtml {
p.html.as_ref().expect("called on html page")
}
// ---------- core metadata ----------
fn check_title_missing(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).title.is_empty())
.map(|p| insight(&p.url, "Page has no title", TYPE_SEO, SEV_ERROR, ""))
.collect()
}
fn check_title_length(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let t = &html(p).title;
let n = t.chars().count();
if !t.is_empty() && !(30..=60).contains(&n) {
out.push(insight(
&p.url,
&format!("Title length is {n} chars (recommended 30-60)"),
TYPE_SEO,
SEV_WARN,
t,
));
}
}
out
}
fn check_duplicate_titles(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for (_, group) in group_by(ctx.html_pages, |p| html(p).title.clone()) {
if group.len() > 1 {
for p in group {
out.push(insight(&p.url, "Duplicate title", TYPE_SEO, SEV_WARN, &html(p).title));
}
}
}
out
}
fn check_description_missing(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).description.is_empty())
.map(|p| insight(&p.url, "Page has no meta description", TYPE_SEO, SEV_ERROR, ""))
.collect()
}
fn check_description_length(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let d = &html(p).description;
let n = d.chars().count();
if !d.is_empty() && !(70..=160).contains(&n) {
out.push(insight(
&p.url,
&format!("Description length is {n} chars (recommended 70-160)"),
TYPE_SEO,
SEV_WARN,
d,
));
}
}
out
}
fn check_duplicate_descriptions(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for (_, group) in group_by(ctx.html_pages, |p| html(p).description.clone()) {
if group.len() > 1 {
for p in group {
out.push(insight(
&p.url,
"Duplicate meta description",
TYPE_SEO,
SEV_WARN,
&html(p).description,
));
}
}
}
out
}
fn h1s<'a>(p: &'a Page) -> &'a [String] {
html(p)
.headings
.get("h1")
.map(|v| v.as_slice())
.unwrap_or(&[])
}
fn check_h1_missing(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| h1s(p).is_empty())
.map(|p| insight(&p.url, "Page has no h1", TYPE_SEO, SEV_ERROR, ""))
.collect()
}
fn check_h1_multiple(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let v = h1s(p);
if v.len() > 1 {
let item = v.iter().take(3).cloned().collect::<Vec<_>>().join(" | ");
out.push(insight(
&p.url,
&format!("Page has {} h1 tags (expected 1)", v.len()),
TYPE_SEO,
SEV_WARN,
&item,
));
}
}
out
}
fn check_h1_length(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let v = h1s(p);
if let Some(first) = v.first() {
let n = first.chars().count();
if !(20..=70).contains(&n) {
out.push(insight(
&p.url,
&format!("H1 length is {n} chars (recommended 20-70)"),
TYPE_SEO,
SEV_WARN,
first,
));
}
}
}
out
}
fn check_duplicate_h1s(ctx: &Ctx) -> Vec<Value> {
let mut buckets: HashMap<String, Vec<(String, String)>> = HashMap::new();
for p in ctx.html_pages {
if let Some(first) = h1s(p).first() {
buckets.entry(normalize(first)).or_default().push((p.url.clone(), first.clone()));
}
}
let mut out = Vec::new();
for (_, group) in buckets {
if group.len() > 1 {
for (url, item) in group {
out.push(insight(&url, "Duplicate h1", TYPE_SEO, SEV_WARN, &item));
}
}
}
out
}
fn check_heading_hierarchy(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let h = &html(p).headings;
let levels: Vec<u8> = (1..=6u8)
.filter(|lvl| {
h.get(&format!("h{lvl}"))
.map(|v| !v.is_empty())
.unwrap_or(false)
})
.collect();
for i in 1..levels.len() {
if levels[i] - levels[i - 1] > 1 {
out.push(insight(
&p.url,
&format!("Heading hierarchy skips from h{} to h{}", levels[i - 1], levels[i]),
TYPE_SEO,
SEV_INFO,
"",
));
break;
}
}
}
out
}
fn check_canonical_missing(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).canonical.is_empty())
.map(|p| insight(&p.url, "Page has no canonical URL", TYPE_SEO, SEV_WARN, ""))
.collect()
}
fn check_canonical_offdomain(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let c = &html(p).canonical;
if !c.is_empty() && !same_site(c, ctx.host) {
out.push(insight(&p.url, "Canonical URL points off-domain", TYPE_SEO, SEV_WARN, c));
}
}
out
}
fn check_canonical_broken(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let c = &html(p).canonical;
if !c.is_empty() {
if let Some(s) = ctx.status_map.get(c) {
if *s != 200 {
out.push(insight(
&p.url,
&format!("Canonical URL returns {s}"),
TYPE_SEO,
SEV_ERROR,
c,
));
}
}
}
}
out
}
fn check_robots_meta_noindex(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let rm = &html(p).robots_meta;
if rm.to_lowercase().contains("noindex") {
out.push(insight(
&p.url,
"Page has noindex in meta robots tag",
TYPE_SEO,
SEV_WARN,
rm,
));
}
}
out
}
fn check_lang_missing(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).lang.is_empty())
.map(|p| insight(&p.url, "HTML lang attribute missing", TYPE_SEO, SEV_WARN, ""))
.collect()
}
fn check_viewport_missing(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).viewport.is_empty())
.map(|p| insight(&p.url, "Viewport meta tag missing (mobile)", TYPE_SEO, SEV_WARN, ""))
.collect()
}
fn check_og_incomplete(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let og = &html(p).og;
let mut missing: Vec<&str> = Vec::new();
if og.title.is_empty() { missing.push("og:title"); }
if og.description.is_empty() { missing.push("og:description"); }
if og.image.is_empty() { missing.push("og:image"); }
if og.url.is_empty() { missing.push("og:url"); }
if !missing.is_empty() {
out.push(insight(
&p.url,
&format!("Open Graph tags missing: {}", missing.join(", ")),
TYPE_SEO,
SEV_INFO,
"",
));
}
}
out
}
fn check_twitter_card(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).twitter.card.is_empty())
.map(|p| insight(&p.url, "Twitter card meta tag missing", TYPE_SEO, SEV_INFO, ""))
.collect()
}
fn check_favicon(ctx: &Ctx) -> Vec<Value> {
ctx.html_pages
.iter()
.filter(|p| html(p).favicon.is_empty())
.map(|p| insight(&p.url, "Favicon link missing", TYPE_SEO, SEV_INFO, ""))
.collect()
}
fn check_json_ld_parse_error(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
for v in &html(p).json_ld {
if v.is_null() {
out.push(insight(
&p.url,
"JSON-LD structured data failed to parse",
TYPE_SEO,
SEV_WARN,
"",
));
break;
}
}
}
out
}
// ---------- links ----------
fn check_broken_internal_links(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
let mut reported: HashSet<(String, String)> = HashSet::new();
let redirects = redirect_codes();
for p in ctx.html_pages {
for link in &html(p).links {
if !same_site(&link.url, ctx.host) {
continue;
}
let Some(s) = ctx.status_map.get(&link.url) else { continue };
if *s != 200 && !redirects.contains(s) {
let key = (p.url.clone(), link.url.clone());
if !reported.insert(key) {
continue;
}
let label = if *s == 0 { "unreachable".to_string() } else { format!("status {s}") };
out.push(insight(
&p.url,
&format!("Broken internal link ({label})"),
TYPE_LINKS,
SEV_ERROR,
&link.url,
));
}
}
}
out
}
fn check_broken_external_links(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
let mut reported: HashSet<(String, String)> = HashSet::new();
for p in ctx.html_pages {
for link in &html(p).links {
if same_site(&link.url, ctx.host) {
continue;
}
let Some(s) = ctx.external_link_status.get(&link.url) else { continue };
if *s == 0 || *s >= 400 {
let key = (p.url.clone(), link.url.clone());
if !reported.insert(key) {
continue;
}
let label = if *s == 0 { "unreachable".to_string() } else { format!("status {s}") };
out.push(insight(
&p.url,
&format!("Broken external link ({label})"),
TYPE_LINKS,
SEV_WARN,
&link.url,
));
}
}
}
out
}
fn check_redirect_chains(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.pages {
if p.redirect_chain.len() > 2 {
let hops = p.redirect_chain.len() - 1;
let codes = p
.redirect_chain
.iter()
.map(|(c, _)| c.to_string())
.collect::<Vec<_>>()
.join(" -> ");
out.push(insight(
&p.url,
&format!("Redirect chain has {hops} hops"),
TYPE_LINKS,
SEV_INFO,
&codes,
));
}
}
out
}
fn check_nofollow_internal_links(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
let mut reported: HashSet<(String, String)> = HashSet::new();
for p in ctx.html_pages {
for link in &html(p).links {
if !same_site(&link.url, ctx.host) {
continue;
}
if link.rel.iter().any(|r| r == "nofollow") {
let key = (p.url.clone(), link.url.clone());
if !reported.insert(key) {
continue;
}
out.push(insight(
&p.url,
"Internal link has rel=nofollow",
TYPE_LINKS,
SEV_INFO,
&link.url,
));
}
}
}
out
}
// ---------- robots / sitemap ----------
fn check_robots_missing(ctx: &Ctx) -> Vec<Value> {
if !ctx.robots.exists {
vec![insight(
ctx.start_url,
"robots.txt missing",
TYPE_SEO,
SEV_WARN,
&ctx.robots.url,
)]
} else {
Vec::new()
}
}
fn check_sitemap_missing(ctx: &Ctx) -> Vec<Value> {
if ctx.sitemap_urls.is_empty() {
vec![insight(
ctx.start_url,
"sitemap.xml missing or empty",
TYPE_SEO,
SEV_WARN,
"",
)]
} else {
Vec::new()
}
}
fn check_sitemap_not_in_robots(ctx: &Ctx) -> Vec<Value> {
if ctx.robots.exists && !ctx.sitemap_urls.is_empty() && !ctx.robots.references_sitemap {
vec![insight(
ctx.start_url,
"robots.txt does not reference a sitemap",
TYPE_SEO,
SEV_INFO,
"",
)]
} else {
Vec::new()
}
}
fn check_sitemap_broken_urls(ctx: &Ctx) -> Vec<Value> {
let redirects = redirect_codes();
let mut out = Vec::new();
for url in ctx.sitemap_urls {
if let Some(s) = ctx.status_map.get(url) {
if *s != 200 && !redirects.contains(s) {
out.push(insight(
url,
&format!("URL listed in sitemap returns {s}"),
TYPE_SEO,
SEV_ERROR,
"",
));
}
}
}
out
}
fn check_pages_missing_from_sitemap(ctx: &Ctx) -> Vec<Value> {
if ctx.sitemap_urls.is_empty() {
return Vec::new();
}
let set: HashSet<&str> = ctx.sitemap_urls.iter().map(|s| s.as_str()).collect();
let mut out = Vec::new();
for p in ctx.html_pages {
if set.contains(p.url.as_str()) {
continue;
}
if html(p).robots_meta.to_lowercase().contains("noindex") {
continue;
}
out.push(insight(&p.url, "Page not listed in sitemap", TYPE_SEO, SEV_INFO, ""));
}
out
}
// ---------- accessibility ----------
fn check_images_missing_alt(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let missing: Vec<_> = html(p).images.iter().filter(|i| i.alt.is_none()).collect();
if !missing.is_empty() {
let item: String = missing[0]
.src
.chars()
.take(160)
.collect();
out.push(insight(
&p.url,
&format!("{} image(s) missing alt attribute", missing.len()),
TYPE_A11Y,
SEV_WARN,
&item,
));
}
}
out
}
fn check_empty_anchor_text(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let empty: Vec<_> = html(p).links.iter().filter(|l| l.text.is_empty()).collect();
if !empty.is_empty() {
let item: String = empty[0].url.chars().take(160).collect();
out.push(insight(
&p.url,
&format!("{} link(s) have no visible text", empty.len()),
TYPE_A11Y,
SEV_INFO,
&item,
));
}
}
out
}
fn check_form_inputs_unlabeled(ctx: &Ctx) -> Vec<Value> {
let ignore: HashSet<&str> = ["hidden", "submit", "button", "reset", "image"].into_iter().collect();
let mut out = Vec::new();
'outer: for p in ctx.html_pages {
for form in &html(p).forms {
let label_set: HashSet<&str> = form.label_fors.iter().map(|s| s.as_str()).collect();
let mut unlabeled = 0;
for i in &form.inputs {
if ignore.contains(i.r#type.to_lowercase().as_str()) {
continue;
}
if i.aria_label.is_some() {
continue;
}
if let Some(id) = &i.id {
if label_set.contains(id.as_str()) {
continue;
}
}
unlabeled += 1;
}
if unlabeled > 0 {
out.push(insight(
&p.url,
&format!("{unlabeled} form input(s) without associated label"),
TYPE_A11Y,
SEV_WARN,
&form.action,
));
continue 'outer;
}
}
}
out
}
// ---------- content ----------
fn check_thin_content(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
let wc = html(p).word_count;
if wc < 300 {
out.push(insight(
&p.url,
&format!("Thin content ({wc} words)"),
TYPE_CONTENT,
SEV_WARN,
"",
));
}
}
out
}
fn check_duplicate_content(ctx: &Ctx) -> Vec<Value> {
let mut buckets: HashMap<String, Vec<String>> = HashMap::new();
for p in ctx.html_pages {
let th = &html(p).text_hash;
if !th.is_empty() {
buckets.entry(th.clone()).or_default().push(p.url.clone());
}
}
let mut out = Vec::new();
for urls in buckets.values() {
if urls.len() > 1 {
for u in urls {
let other = urls.iter().find(|x| *x != u).unwrap_or(&urls[0]);
out.push(insight(
u,
"Page has duplicate visible content with another page",
TYPE_CONTENT,
SEV_WARN,
other,
));
}
}
}
out
}
// ---------- performance ----------
fn check_slow_pages(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.pages {
if !p.is_html {
continue;
}
if p.elapsed_ms > 1000 {
out.push(insight(
&p.url,
&format!("Slow response ({} ms)", p.elapsed_ms),
TYPE_PERF,
SEV_WARN,
"",
));
}
}
out
}
fn check_missing_compression(ctx: &Ctx) -> Vec<Value> {
// Probed at the start URL with a non-decompressing client (the main
// crawler client has reqwest's gzip/brotli features on, which strip
// `Content-Encoding` after auto-decompression — so we can't read it from
// the per-page headers). Compression is a server-wide config in practice,
// so one probe is enough.
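// A probe along these lines would do (hypothetical sketch; the real
// client setup lives in the fetcher module):
//
//     let probe = reqwest::Client::builder().no_gzip().no_brotli().build()?;
//     let resp = probe.get(start_url).send().await?;
//     let encoding = resp.headers().get("content-encoding");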
if ctx.compression.is_none() {
vec![insight(
ctx.start_url,
"Response not compressed (no Content-Encoding header)",
TYPE_PERF,
SEV_INFO,
"",
)]
} else {
Vec::new()
}
}
fn check_oversized_pages(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.pages {
if p.bytes > 500_000 {
out.push(insight(
&p.url,
&format!("Oversized page ({} KB)", p.bytes / 1024),
TYPE_PERF,
SEV_WARN,
"",
));
}
}
out
}
// ---------- security ----------
fn check_mixed_content(ctx: &Ctx) -> Vec<Value> {
let mut out = Vec::new();
for p in ctx.html_pages {
if !p.url.starts_with("https://") {
continue;
}
let http_resources: Vec<&String> = html(p)
.resources
.iter()
.filter(|r| r.starts_with("http://"))
.collect();
if !http_resources.is_empty() {
out.push(insight(
&p.url,
&format!(
"Mixed content: {} http:// resource(s) on https:// page",
http_resources.len()
),
TYPE_SEC,
SEV_WARN,
http_resources[0],
));
}
}
out
}
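/// Runs every registered check, isolating panics so one buggy check only
/// logs a warning instead of taking down the whole report.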
pub fn run_all(ctx: &Ctx) -> Vec<Value> {
let checks: &[(&'static str, fn(&Ctx) -> Vec<Value>)] = &[
("title_missing", check_title_missing),
("title_length", check_title_length),
("duplicate_titles", check_duplicate_titles),
("description_missing", check_description_missing),
("description_length", check_description_length),
("duplicate_descriptions", check_duplicate_descriptions),
("h1_missing", check_h1_missing),
("h1_multiple", check_h1_multiple),
("h1_length", check_h1_length),
("duplicate_h1s", check_duplicate_h1s),
("heading_hierarchy", check_heading_hierarchy),
("canonical_missing", check_canonical_missing),
("canonical_offdomain", check_canonical_offdomain),
("canonical_broken", check_canonical_broken),
("robots_meta_noindex", check_robots_meta_noindex),
("lang_missing", check_lang_missing),
("viewport_missing", check_viewport_missing),
("og_incomplete", check_og_incomplete),
("twitter_card", check_twitter_card),
("favicon", check_favicon),
("json_ld_parse_error", check_json_ld_parse_error),
("broken_internal_links", check_broken_internal_links),
("broken_external_links", check_broken_external_links),
("redirect_chains", check_redirect_chains),
("nofollow_internal_links", check_nofollow_internal_links),
("robots_missing", check_robots_missing),
("sitemap_missing", check_sitemap_missing),
("sitemap_not_in_robots", check_sitemap_not_in_robots),
("sitemap_broken_urls", check_sitemap_broken_urls),
("pages_missing_from_sitemap", check_pages_missing_from_sitemap),
("images_missing_alt", check_images_missing_alt),
("empty_anchor_text", check_empty_anchor_text),
("form_inputs_unlabeled", check_form_inputs_unlabeled),
("thin_content", check_thin_content),
("duplicate_content", check_duplicate_content),
("slow_pages", check_slow_pages),
("missing_compression", check_missing_compression),
("oversized_pages", check_oversized_pages),
("mixed_content", check_mixed_content),
];
let mut out = Vec::new();
for (name, fn_) in checks {
let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| fn_(ctx)));
match r {
Ok(v) => out.extend(v),
Err(_) => tracing::warn!("[crawler] check {name} panicked"),
}
}
out
}
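#[cfg(test)]
mod tests {
    // Minimal sketches covering the pure helpers above; they deliberately
    // avoid constructing `Page`/`Ctx`, whose fields live in parent modules.
    use super::*;

    #[test]
    fn normalize_lowercases_and_collapses_whitespace() {
        assert_eq!(normalize("  Duplicate\t TITLE \n"), "duplicate title");
    }

    #[test]
    fn redirect_codes_cover_the_usual_3xx_statuses() {
        let codes = redirect_codes();
        for c in [301, 302, 303, 307, 308] {
            assert!(codes.contains(&c));
        }
        assert!(!codes.contains(&200) && !codes.contains(&404));
    }

    #[test]
    fn insight_carries_all_five_fields() {
        let v = insight("https://example.com/", "Page has no title", TYPE_SEO, SEV_ERROR, "");
        assert_eq!(v["url"], "https://example.com/");
        assert_eq!(v["type"], "seo");
        assert_eq!(v["severity"], "error");
    }
}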