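//! HTML page parsing: extracts metadata, headings, links, images, forms, and a
//! text fingerprint from a fetched document.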
use anyhow::Result;
use scraper::{Html, Selector};
use serde::Serialize;
use sha2::{Digest, Sha256};
use std::collections::{BTreeMap, HashSet};
use url::Url;
/// Everything extracted from a single HTML document.
///
/// Field descriptions reflect how `parse_html` populates them; string fields
/// are empty when the corresponding element or attribute is absent.
#[derive(Debug, Clone, Serialize)]
pub struct ParsedHtml {
/// Trimmed text of the first `<title>` element.
pub title: String,
/// Trimmed `content` of `<meta name="description">`.
pub description: String,
/// `href` of the first `<link rel="canonical">`, resolved to an absolute URL when possible.
pub canonical: String,
/// Trimmed `content` of `<meta name="robots">`.
pub robots_meta: String,
/// Trimmed `content` of `<meta name="viewport">`.
pub viewport: String,
/// Trimmed `lang` attribute of the `<html>` element.
pub lang: String,
/// Open Graph metadata.
pub og: Og,
/// Twitter card metadata.
pub twitter: Twitter,
/// Heading texts keyed by tag name (`"h1"` through `"h6"`); every key is
/// present, with whitespace-collapsed text for each heading at that level.
pub headings: BTreeMap<String, Vec<String>>,
/// Anchors with an `href`, excluding empty, fragment-only, `javascript:`,
/// `mailto:` and `tel:` targets.
pub links: Vec<Link>,
/// One entry per `<img>` element.
pub images: Vec<Image>,
/// URLs taken from the `src` (or, failing that, `href`) attribute of
/// `script`, `link`, `img`, `iframe` and `source` elements.
pub resources: Vec<String>,
/// Parsed contents of `<script type="application/ld+json">` blocks; a
/// non-empty block that fails to parse is recorded as JSON `null`.
pub json_ld: Vec<serde_json::Value>,
/// `href` of the first `<link>` whose `rel` has a token containing "icon"
/// and whose `href` is non-empty.
pub favicon: String,
/// One entry per `<form>` element.
pub forms: Vec<Form>,
/// Number of whitespace-separated tokens in the extracted page text.
pub word_count: usize,
/// Lowercase hex SHA-256 digest of the extracted page text.
pub text_hash: String,
}
/// Open Graph values read from `<meta property="og:…">` tags; each field is
/// the trimmed `content` attribute, or empty when the tag is missing.
#[derive(Debug, Clone, Default, Serialize)]
pub struct Og {
/// Value of `og:title`.
pub title: String,
/// Value of `og:description`.
pub description: String,
/// Value of `og:image`, stored as written (not resolved against the page URL).
pub image: String,
/// Value of `og:url`, stored as written (not resolved against the page URL).
pub url: String,
}
/// Twitter card values read from `<meta name="twitter:…">` tags; each field
/// is the trimmed `content` attribute, or empty when the tag is missing.
#[derive(Debug, Clone, Default, Serialize)]
pub struct Twitter {
/// Value of `twitter:card`.
pub card: String,
/// Value of `twitter:title`.
pub title: String,
/// Value of `twitter:description`.
pub description: String,
}
/// A hyperlink taken from an `<a href>` element.
#[derive(Debug, Clone, Serialize)]
pub struct Link {
/// Absolute URL when the `href` could be resolved; otherwise the raw `href`.
pub url: String,
/// Anchor text with runs of whitespace collapsed to single spaces.
pub text: String,
/// Whitespace-separated tokens of the `rel` attribute; empty when absent.
pub rel: Vec<String>,
}
/// An `<img>` element found in the document.
#[derive(Debug, Clone, Serialize)]
pub struct Image {
/// Resolved `src` URL; empty when the attribute is missing or blank.
pub src: String,
/// `None` = attribute absent (a11y violation); `Some("")` = explicitly empty (decorative).
pub alt: Option<String>,
}
/// A `<form>` element together with its controls and label targets.
#[derive(Debug, Clone, Serialize)]
pub struct Form {
/// Resolved `action` URL; the page URL when `action` is missing or empty.
pub action: String,
/// The form's descendant `input`, `textarea` and `select` elements.
pub inputs: Vec<FormInput>,
/// Distinct `for` attribute values of `<label>` elements inside the form.
pub label_fors: Vec<String>,
}
/// Attributes of a single form control (`input`, `textarea` or `select`).
#[derive(Debug, Clone, Serialize)]
pub struct FormInput {
/// The `type` attribute; `"text"` when the attribute is absent (this
/// default is applied to `textarea` and `select` as well).
pub r#type: String,
/// The `name` attribute, if present.
pub name: Option<String>,
/// The `id` attribute, if present.
pub id: Option<String>,
/// The `aria-label` attribute, if present.
pub aria_label: Option<String>,
}
/// Returns the trimmed `content` of the first `<meta>` element whose `attr`
/// attribute equals `val`.
///
/// Yields an empty string when no such element exists, when it carries no
/// `content` attribute, or when the generated selector fails to parse.
fn meta_by(doc: &Html, attr: &str, val: &str) -> String {
    // The attribute name and value vary per call, so the selector text is
    // assembled at runtime and handed to scraper for parsing.
    let css = format!(r#"meta[{attr}="{val}"]"#);
    match Selector::parse(&css) {
        Ok(selector) => {
            let first_match = doc.select(&selector).next();
            first_match
                .map(|el| el.value().attr("content").unwrap_or("").trim().to_string())
                .unwrap_or_default()
        }
        Err(_) => String::new(),
    }
}
/// Resolves `rel` against `base` and returns the resulting URL as a string.
///
/// When resolution fails, `rel` is returned unchanged so the caller still
/// records what the document contained.
fn join(base: &Url, rel: &str) -> String {
    match base.join(rel) {
        Ok(resolved) => resolved.to_string(),
        Err(_) => rel.to_string(),
    }
}
pub fn parse_html(body: &[u8], page_url: &str) -> Result<ParsedHtml> {
let body_str = String::from_utf8_lossy(body);
let doc = Html::parse_document(&body_str);
let base = Url::parse(page_url)?;
let title = {
let s = Selector::parse("title").unwrap();
doc.select(&s)
.next()
.map(|t| t.text().collect::<String>().trim().to_string())
.unwrap_or_default()
};
let description = meta_by(&doc, "name", "description");
let robots_meta = meta_by(&doc, "name", "robots");
let viewport = meta_by(&doc, "name", "viewport");
let canonical = {
let s = Selector::parse(r#"link[rel="canonical"]"#).unwrap();
doc.select(&s)
.next()
.and_then(|el| el.value().attr("href"))
.map(|h| join(&base, h.trim()))
.unwrap_or_default()
};
let og = Og {
title: meta_by(&doc, "property", "og:title"),
description: meta_by(&doc, "property", "og:description"),
image: meta_by(&doc, "property", "og:image"),
url: meta_by(&doc, "property", "og:url"),
};
let twitter = Twitter {
card: meta_by(&doc, "name", "twitter:card"),
title: meta_by(&doc, "name", "twitter:title"),
description: meta_by(&doc, "name", "twitter:description"),
};
let lang = {
let s = Selector::parse("html").unwrap();
doc.select(&s)
.next()
.and_then(|el| el.value().attr("lang"))
.unwrap_or("")
.trim()
.to_string()
};
let mut headings: BTreeMap<String, Vec<String>> = BTreeMap::new();
for level in 1..=6u8 {
let key = format!("h{level}");
let s = Selector::parse(&key).unwrap();
let v: Vec<String> = doc
.select(&s)
.map(|el| el.text().collect::<Vec<_>>().join(" ").split_whitespace().collect::<Vec<_>>().join(" "))
.collect();
headings.insert(key, v);
}
let mut links: Vec<Link> = Vec::new();
let s = Selector::parse("a[href]").unwrap();
for a in doc.select(&s) {
let href = a.value().attr("href").unwrap_or("").trim();
if href.is_empty()
|| href.starts_with("javascript:")
|| href.starts_with("mailto:")
|| href.starts_with("tel:")
|| href.starts_with('#')
{
continue;
}
let text = a
.text()
.collect::<Vec<_>>()
.join(" ")
.split_whitespace()
.collect::<Vec<_>>()
.join(" ");
let rel: Vec<String> = a
.value()
.attr("rel")
.map(|s| s.split_whitespace().map(|t| t.to_string()).collect())
.unwrap_or_default();
links.push(Link {
url: join(&base, href),
text,
rel,
});
}
let mut images: Vec<Image> = Vec::new();
let s = Selector::parse("img").unwrap();
for img in doc.select(&s) {
let src = img.value().attr("src").unwrap_or("").trim();
let alt = img.value().attr("alt").map(|s| s.to_string());
images.push(Image {
src: if src.is_empty() { String::new() } else { join(&base, src) },
alt,
});
}
let mut resources: Vec<String> = Vec::new();
let s = Selector::parse("script, link, img, iframe, source").unwrap();
for el in doc.select(&s) {
let src = el
.value()
.attr("src")
.or_else(|| el.value().attr("href"))
.unwrap_or("")
.trim();
if !src.is_empty() {
resources.push(join(&base, src));
}
}
let mut json_ld: Vec<serde_json::Value> = Vec::new();
let s = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
for sc in doc.select(&s) {
let raw = sc.text().collect::<String>();
if raw.trim().is_empty() {
continue;
}
match serde_json::from_str::<serde_json::Value>(&raw) {
Ok(v) => json_ld.push(v),
Err(_) => json_ld.push(serde_json::Value::Null),
}
}
let mut favicon = String::new();
let s = Selector::parse("link[rel]").unwrap();
for el in doc.select(&s) {
let rels = el.value().attr("rel").unwrap_or("");
if rels.split_whitespace().any(|r| r.to_lowercase().contains("icon")) {
let href = el.value().attr("href").unwrap_or("").trim();
if !href.is_empty() {
favicon = join(&base, href);
break;
}
}
}
let mut forms: Vec<Form> = Vec::new();
let s_form = Selector::parse("form").unwrap();
let s_input = Selector::parse("input, textarea, select").unwrap();
let s_label = Selector::parse("label[for]").unwrap();
for form in doc.select(&s_form) {
let mut inputs: Vec<FormInput> = Vec::new();
for i in form.select(&s_input) {
let v = i.value();
inputs.push(FormInput {
r#type: v.attr("type").unwrap_or("text").to_string(),
name: v.attr("name").map(String::from),
id: v.attr("id").map(String::from),
aria_label: v.attr("aria-label").map(String::from),
});
}
let mut label_fors: HashSet<String> = HashSet::new();
for lb in form.select(&s_label) {
if let Some(f) = lb.value().attr("for") {
label_fors.insert(f.to_string());
}
}
let action = form
.value()
.attr("action")
.map(|a| {
if a.is_empty() {
page_url.to_string()
} else {
join(&base, a)
}
})
.unwrap_or_else(|| page_url.to_string());
forms.push(Form {
action,
inputs,
label_fors: label_fors.into_iter().collect(),
});
}
// Visible text: drop script/style/noscript before extracting.
let strip_re = regex::Regex::new(
r"(?is)<(script|style|noscript)[^>]*>.*?</(script|style|noscript)>",
)
.unwrap();
let stripped = strip_re.replace_all(&body_str, "");
let tag_re = regex::Regex::new(r"(?is)<[^>]+>").unwrap();
let text_only = tag_re.replace_all(&stripped, " ");
let text = text_only.split_whitespace().collect::<Vec<_>>().join(" ");
let word_count = text.split_whitespace().count();
let mut h = Sha256::new();
h.update(text.as_bytes());
let text_hash = format!("{:x}", h.finalize());
Ok(ParsedHtml {
title,
description,
canonical,
robots_meta,
viewport,
lang,
og,
twitter,
headings,
links,
images,
resources,
json_ld,
favicon,
forms,
word_count,
text_hash,
})
}