use anyhow::Result;
use scraper::{Html, Selector};
use serde::Serialize;
use sha2::{Digest, Sha256};
use std::collections::{BTreeMap, HashSet};
use url::Url;

/// Everything `parse_html` extracts from a single HTML document.
///
/// String fields are empty (never absent) when the corresponding element or
/// attribute is missing from the page.
#[derive(Debug, Clone, Serialize)]
pub struct ParsedHtml {
    /// Trimmed text content of the first `<title>` element.
    pub title: String,
    /// Trimmed `content` of the first `meta[name="description"]`.
    pub description: String,
    /// `href` of the first `link[rel="canonical"]`, resolved against the page URL.
    pub canonical: String,
    /// Trimmed `content` of the first `meta[name="robots"]`.
    pub robots_meta: String,
    /// Trimmed `content` of the first `meta[name="viewport"]`.
    pub viewport: String,
    /// Trimmed `lang` attribute of the first `<html>` element.
    pub lang: String,
    /// Open Graph `<meta property="og:*">` values.
    pub og: Og,
    /// Twitter card `<meta name="twitter:*">` values.
    pub twitter: Twitter,
    /// Heading texts keyed by `"h1"` through `"h6"`; every key is present, even
    /// when its list is empty. Each text has its whitespace collapsed.
    pub headings: BTreeMap<String, Vec<String>>,
    /// `<a href>` links, excluding empty, fragment-only, `javascript:`,
    /// `mailto:` and `tel:` targets.
    pub links: Vec<Link>,
    /// One entry per `<img>` element.
    pub images: Vec<Image>,
    /// Resolved `src` (or `href` when `src` is absent) of every `script`,
    /// `link`, `img`, `iframe` and `source` element that has a non-blank value.
    pub resources: Vec<String>,
    /// One value per non-blank `script[type="application/ld+json"]`;
    /// `Value::Null` stands in for a script whose content is not valid JSON.
    pub json_ld: Vec<serde_json::Value>,
    /// Resolved `href` of the first `<link>` with a non-blank `href` whose `rel`
    /// has a token containing "icon" (compared case-insensitively).
    pub favicon: String,
    /// One entry per `<form>` element.
    pub forms: Vec<Form>,
    /// Number of whitespace-separated tokens left in the raw markup after
    /// `script`/`style`/`noscript` blocks and all tags are removed.
    pub word_count: usize,
    /// Lowercase hex SHA-256 of that same whitespace-normalized text.
    pub text_hash: String,
}

/// Open Graph metadata, read from the trimmed `content` of the first matching
/// `<meta property="og:...">` element. Each field is empty when no such
/// element exists.
#[derive(Debug, Clone, Default, Serialize)]
pub struct Og {
    /// `og:title`.
    pub title: String,
    /// `og:description`.
    pub description: String,
    /// `og:image`, stored as written in the page (not resolved against the page URL).
    pub image: String,
    /// `og:url`, stored as written in the page (not resolved against the page URL).
    pub url: String,
}

/// Twitter card metadata, read from the trimmed `content` of the first matching
/// `<meta name="twitter:...">` element. Each field is empty when no such
/// element exists.
#[derive(Debug, Clone, Default, Serialize)]
pub struct Twitter {
    /// `twitter:card`.
    pub card: String,
    /// `twitter:title`.
    pub title: String,
    /// `twitter:description`.
    pub description: String,
}

/// A hyperlink taken from an `<a href>` element.
#[derive(Debug, Clone, Serialize)]
pub struct Link {
    /// The `href` resolved against the page URL; the trimmed raw `href` if it
    /// cannot be resolved.
    pub url: String,
    /// Anchor text with runs of whitespace collapsed to single spaces.
    pub text: String,
    /// Whitespace-separated tokens of the `rel` attribute; empty when absent.
    pub rel: Vec<String>,
}

/// An `<img>` element found in the document.
#[derive(Debug, Clone, Serialize)]
pub struct Image {
    /// The `src` attribute resolved against the page URL; empty when the
    /// attribute is missing or blank.
    pub src: String,
    /// `None` = attribute absent (a11y violation); `Some("")` = explicitly empty (decorative).
    /// The value is stored as written, without trimming.
    pub alt: Option<String>,
}

/// A `<form>` element together with the controls and labels nested inside it.
#[derive(Debug, Clone, Serialize)]
pub struct Form {
    /// The `action` attribute resolved against the page URL; the page URL
    /// itself when the attribute is absent or empty.
    pub action: String,
    /// Every `input`, `textarea` and `select` descendant of the form.
    pub inputs: Vec<FormInput>,
    /// Distinct `for` values of the `<label for>` descendants of the form.
    pub label_fors: Vec<String>,
}

/// A form control (`input`, `textarea` or `select`) found inside a `<form>`.
#[derive(Debug, Clone, Serialize)]
pub struct FormInput {
    /// The `type` attribute; `"text"` when the attribute is absent (this
    /// default is applied to `textarea` and `select` elements as well).
    pub r#type: String,
    /// The `name` attribute, if present.
    pub name: Option<String>,
    /// The `id` attribute, if present.
    pub id: Option<String>,
    /// The `aria-label` attribute, if present.
    pub aria_label: Option<String>,
}

/// Returns the trimmed `content` attribute of the first `<meta>` element whose
/// `attr` attribute equals `val`.
///
/// Yields an empty string when the generated selector cannot be parsed, when
/// no element matches, or when the first match has no `content` attribute.
fn meta_by(doc: &Html, attr: &str, val: &str) -> String {
    // `attr` and `val` are spliced into the selector verbatim; a value that
    // breaks the selector syntax simply results in the empty-string fallback.
    let query = format!(r#"meta[{attr}="{val}"]"#);
    let selector = match Selector::parse(&query) {
        Ok(parsed) => parsed,
        Err(_) => return String::new(),
    };

    let first_match = doc.select(&selector).next();
    match first_match {
        Some(meta) => meta.value().attr("content").unwrap_or("").trim().to_string(),
        None => String::new(),
    }
}

/// Resolves `rel` against `base` and returns the resulting URL as a string.
///
/// When `rel` cannot be resolved it is returned unchanged, so callers always
/// get a value back.
fn join(base: &Url, rel: &str) -> String {
    match base.join(rel) {
        Ok(resolved) => resolved.to_string(),
        Err(_) => rel.to_string(),
    }
}

pub fn parse_html(body: &[u8], page_url: &str) -> Result<ParsedHtml> {
    let body_str = String::from_utf8_lossy(body);
    let doc = Html::parse_document(&body_str);
    let base = Url::parse(page_url)?;

    let title = {
        let s = Selector::parse("title").unwrap();
        doc.select(&s)
            .next()
            .map(|t| t.text().collect::<String>().trim().to_string())
            .unwrap_or_default()
    };

    let description = meta_by(&doc, "name", "description");
    let robots_meta = meta_by(&doc, "name", "robots");
    let viewport = meta_by(&doc, "name", "viewport");

    let canonical = {
        let s = Selector::parse(r#"link[rel="canonical"]"#).unwrap();
        doc.select(&s)
            .next()
            .and_then(|el| el.value().attr("href"))
            .map(|h| join(&base, h.trim()))
            .unwrap_or_default()
    };

    let og = Og {
        title: meta_by(&doc, "property", "og:title"),
        description: meta_by(&doc, "property", "og:description"),
        image: meta_by(&doc, "property", "og:image"),
        url: meta_by(&doc, "property", "og:url"),
    };

    let twitter = Twitter {
        card: meta_by(&doc, "name", "twitter:card"),
        title: meta_by(&doc, "name", "twitter:title"),
        description: meta_by(&doc, "name", "twitter:description"),
    };

    let lang = {
        let s = Selector::parse("html").unwrap();
        doc.select(&s)
            .next()
            .and_then(|el| el.value().attr("lang"))
            .unwrap_or("")
            .trim()
            .to_string()
    };

    let mut headings: BTreeMap<String, Vec<String>> = BTreeMap::new();
    for level in 1..=6u8 {
        let key = format!("h{level}");
        let s = Selector::parse(&key).unwrap();
        let v: Vec<String> = doc
            .select(&s)
            .map(|el| el.text().collect::<Vec<_>>().join(" ").split_whitespace().collect::<Vec<_>>().join(" "))
            .collect();
        headings.insert(key, v);
    }

    let mut links: Vec<Link> = Vec::new();
    let s = Selector::parse("a[href]").unwrap();
    for a in doc.select(&s) {
        let href = a.value().attr("href").unwrap_or("").trim();
        if href.is_empty()
            || href.starts_with("javascript:")
            || href.starts_with("mailto:")
            || href.starts_with("tel:")
            || href.starts_with('#')
        {
            continue;
        }
        let text = a
            .text()
            .collect::<Vec<_>>()
            .join(" ")
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
        let rel: Vec<String> = a
            .value()
            .attr("rel")
            .map(|s| s.split_whitespace().map(|t| t.to_string()).collect())
            .unwrap_or_default();
        links.push(Link {
            url: join(&base, href),
            text,
            rel,
        });
    }

    let mut images: Vec<Image> = Vec::new();
    let s = Selector::parse("img").unwrap();
    for img in doc.select(&s) {
        let src = img.value().attr("src").unwrap_or("").trim();
        let alt = img.value().attr("alt").map(|s| s.to_string());
        images.push(Image {
            src: if src.is_empty() { String::new() } else { join(&base, src) },
            alt,
        });
    }

    let mut resources: Vec<String> = Vec::new();
    let s = Selector::parse("script, link, img, iframe, source").unwrap();
    for el in doc.select(&s) {
        let src = el
            .value()
            .attr("src")
            .or_else(|| el.value().attr("href"))
            .unwrap_or("")
            .trim();
        if !src.is_empty() {
            resources.push(join(&base, src));
        }
    }

    let mut json_ld: Vec<serde_json::Value> = Vec::new();
    let s = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();
    for sc in doc.select(&s) {
        let raw = sc.text().collect::<String>();
        if raw.trim().is_empty() {
            continue;
        }
        match serde_json::from_str::<serde_json::Value>(&raw) {
            Ok(v) => json_ld.push(v),
            Err(_) => json_ld.push(serde_json::Value::Null),
        }
    }

    let mut favicon = String::new();
    let s = Selector::parse("link[rel]").unwrap();
    for el in doc.select(&s) {
        let rels = el.value().attr("rel").unwrap_or("");
        if rels.split_whitespace().any(|r| r.to_lowercase().contains("icon")) {
            let href = el.value().attr("href").unwrap_or("").trim();
            if !href.is_empty() {
                favicon = join(&base, href);
                break;
            }
        }
    }

    let mut forms: Vec<Form> = Vec::new();
    let s_form = Selector::parse("form").unwrap();
    let s_input = Selector::parse("input, textarea, select").unwrap();
    let s_label = Selector::parse("label[for]").unwrap();
    for form in doc.select(&s_form) {
        let mut inputs: Vec<FormInput> = Vec::new();
        for i in form.select(&s_input) {
            let v = i.value();
            inputs.push(FormInput {
                r#type: v.attr("type").unwrap_or("text").to_string(),
                name: v.attr("name").map(String::from),
                id: v.attr("id").map(String::from),
                aria_label: v.attr("aria-label").map(String::from),
            });
        }
        let mut label_fors: HashSet<String> = HashSet::new();
        for lb in form.select(&s_label) {
            if let Some(f) = lb.value().attr("for") {
                label_fors.insert(f.to_string());
            }
        }
        let action = form
            .value()
            .attr("action")
            .map(|a| {
                if a.is_empty() {
                    page_url.to_string()
                } else {
                    join(&base, a)
                }
            })
            .unwrap_or_else(|| page_url.to_string());
        forms.push(Form {
            action,
            inputs,
            label_fors: label_fors.into_iter().collect(),
        });
    }

    // Visible text: drop script/style/noscript before extracting.
    let strip_re = regex::Regex::new(
        r"(?is)<(script|style|noscript)[^>]*>.*?</(script|style|noscript)>",
    )
    .unwrap();
    let stripped = strip_re.replace_all(&body_str, "");
    let tag_re = regex::Regex::new(r"(?is)<[^>]+>").unwrap();
    let text_only = tag_re.replace_all(&stripped, " ");
    let text = text_only.split_whitespace().collect::<Vec<_>>().join(" ");
    let word_count = text.split_whitespace().count();
    let mut h = Sha256::new();
    h.update(text.as_bytes());
    let text_hash = format!("{:x}", h.finalize());

    Ok(ParsedHtml {
        title,
        description,
        canonical,
        robots_meta,
        viewport,
        lang,
        og,
        twitter,
        headings,
        links,
        images,
        resources,
        json_ld,
        favicon,
        forms,
        word_count,
        text_hash,
    })
}
