16.7 KB
raw
//! Thin wrappers over `gix` that pre-shape data into the structs templates
//! want. Every function here is synchronous and blocking; routes call them
//! inside `tokio::task::spawn_blocking` if they're touching anything heavier
//! than HEAD metadata.
use anyhow::{anyhow, Context, Result};
use serde::Serialize;
use std::path::{Path, PathBuf};
/// Suffix used by the bare repos we expose. `/srv/git/foo.git/` is the
/// canonical layout; anything without `.git` is ignored on discovery so a
/// stray directory doesn't surface as a repo.
///
/// `discover` filters directory names on this suffix and `short_name`
/// strips it to obtain the display name.
pub const BARE_SUFFIX: &str = ".git";
/// Maximum blob size, in bytes (25 MiB), that we'll load into memory for
/// /blob and /raw views.
///
/// Anything larger is almost certainly a binary asset best fetched via
/// `git clone`; serving it inline would risk OOMing the container under
/// concurrent requests. `read_blob` compares the object header's size
/// against this cap before loading the blob.
pub const MAX_BLOB_SIZE: u64 = 25 * 1024 * 1024;
/// One row of the repository listing, as produced by `repo_summary`.
#[derive(Debug, Clone, Serialize)]
pub struct RepoSummary {
/// Directory name with the `.git` suffix stripped.
pub name: String,
/// Trimmed contents of the repo's `description` file; empty when the file
/// is missing or still holds git's "Unnamed repository" boilerplate.
pub description: String,
/// Branch HEAD points at, without the `refs/heads/` prefix; `"master"`
/// when HEAD cannot be read or has no referent.
pub default_branch: String,
/// Summary line of the HEAD commit message; `None` when there is no
/// resolvable HEAD commit.
pub head_summary: Option<String>,
/// Time of the HEAD commit in seconds (presumably since the Unix epoch,
/// as reported by gix); `None` when unavailable.
pub head_time: Option<i64>,
/// Id of the HEAD commit rendered as a string; `None` when there is no
/// resolvable HEAD commit.
pub head_id: Option<String>,
/// `<clone_base>/<name>.git`, with trailing slashes trimmed from the base.
pub clone_url: String,
}
/// Template-friendly view of a single commit, as produced by `commit_info`.
#[derive(Debug, Clone, Serialize)]
pub struct CommitInfo {
/// Commit id rendered as a string.
pub id: String,
/// First 8 characters of `id`.
pub short_id: String,
/// Summary (subject) of the commit message.
pub summary: String,
/// Complete raw commit message, converted to UTF-8 lossily.
pub message: String,
/// Name from the author signature.
pub author: String,
/// Email from the author signature.
pub author_email: String,
/// Seconds value of the author signature's timestamp.
pub time: i64,
/// Ids of the parent commits, rendered as strings.
pub parents: Vec<String>,
}
/// One entry of a directory listing, as produced by `list_tree`.
#[derive(Debug, Clone, Serialize)]
pub struct TreeEntry {
/// File name of the entry within its parent tree.
pub name: String,
/// Entry category derived from the tree-entry mode.
pub kind: &'static str, // "tree" | "blob" | "commit" (submodule) | "link"
/// Entry mode formatted as an octal string.
pub mode: String,
/// Size in bytes for `"blob"` entries; `None` for every other kind or
/// when the object could not be looked up.
pub size: Option<u64>,
/// Object id of the entry rendered as a string.
pub id: String,
}
/// Contents of a single file at a given revision, as produced by `read_blob`.
#[derive(Debug, Clone, Serialize)]
pub struct BlobInfo {
/// Raw bytes of the blob.
pub data: Vec<u8>,
/// Length of `data` in bytes.
pub size: u64,
/// `true` when a NUL byte occurs within the first 8192 bytes of `data`.
pub is_binary: bool,
}
/// Changes to one file within a commit, as parsed from `git show --patch`
/// output by `parse_unified_diff`.
#[derive(Debug, Clone, Serialize)]
pub struct FileDiff {
/// Path on the new (`b/`) side of the `diff --git` header.
pub path: String,
/// Previous path, taken from a `rename from` line; `None` otherwise.
pub old_path: Option<String>,
/// Kind of change; starts as `"modified"` and is overridden by the
/// `new file mode` / `deleted file mode` / `rename from` header lines.
pub status: &'static str, // "added" | "deleted" | "modified" | "renamed"
/// Hunks in the order they appear in the patch; empty for binary files
/// and pure renames.
pub hunks: Vec<DiffHunk>,
/// Set when the patch contains a `Binary files ...` line for this file.
pub is_binary: bool,
}
/// One `@@ ... @@` section of a file's diff.
#[derive(Debug, Clone, Serialize)]
pub struct DiffHunk {
/// The complete hunk header line, starting with `@@`.
pub header: String,
/// Body lines of the hunk in patch order.
pub lines: Vec<DiffLine>,
}
/// A single line inside a diff hunk.
#[derive(Debug, Clone, Serialize)]
pub struct DiffLine {
/// Line category. `parse_unified_diff` in this file emits only `"add"`,
/// `"del"` and `"ctx"`.
pub kind: &'static str, // "add" | "del" | "ctx" | "hdr"
/// Line content with the leading `+` / `-` / space marker removed.
pub text: String,
}
/// Scan the immediate children of `root` and build a [`RepoSummary`] for
/// every directory whose name ends in [`BARE_SUFFIX`] and that
/// `repo_summary` can open.
///
/// The result is ordered by HEAD commit time, newest first (repos without a
/// HEAD time sort last), with ties broken by name, so the landing page needs
/// no further sorting. `clone_base` is forwarded to `repo_summary` to build
/// each clone URL.
///
/// This never returns `Err`: an unreadable `root` is logged and yields an
/// empty list, unreadable directory entries are skipped silently, and repos
/// that fail to open are logged and skipped.
pub fn discover(root: &Path, clone_base: &str) -> Result<Vec<RepoSummary>> {
    let dir = match std::fs::read_dir(root) {
        Ok(dir) => dir,
        Err(err) => {
            tracing::warn!("repo root {}: {}", root.display(), err);
            return Ok(Vec::new());
        }
    };

    let mut repos: Vec<RepoSummary> = Vec::new();
    for candidate in dir.flatten().map(|entry| entry.path()) {
        // Name check first (cheap), then the filesystem stat.
        let looks_like_bare_repo = candidate
            .file_name()
            .and_then(|n| n.to_str())
            .map_or(false, |n| n.ends_with(BARE_SUFFIX))
            && candidate.is_dir();
        if !looks_like_bare_repo {
            continue;
        }
        match repo_summary(&candidate, clone_base) {
            Ok(summary) => repos.push(summary),
            Err(err) => tracing::warn!("skipping {}: {:#}", candidate.display(), err),
        }
    }

    // Descending by time (`None` < `Some`, so timeless repos end up last),
    // then ascending by name.
    repos.sort_by(|lhs, rhs| {
        rhs.head_time
            .cmp(&lhs.head_time)
            .then_with(|| lhs.name.cmp(&rhs.name))
    });
    Ok(repos)
}
/// Display name for the repo at `path`: the final path component with a
/// trailing [`BARE_SUFFIX`] removed. Yields an empty string when the path has
/// no final component or it is not valid UTF-8.
fn short_name(path: &Path) -> String {
    let Some(file_name) = path.file_name().and_then(|os| os.to_str()) else {
        return String::new();
    };
    match file_name.strip_suffix(BARE_SUFFIX) {
        Some(stem) => stem.to_owned(),
        None => file_name.to_owned(),
    }
}
/// Open the bare repository called `name` under `repo_root`.
///
/// `name` is validated and mapped to a directory by `resolve_path`; any
/// failure from `gix::open` is annotated with the path that was attempted.
pub fn open(repo_root: &Path, name: &str) -> Result<gix::Repository> {
    let repo_dir = resolve_path(repo_root, name)?;
    gix::open(&repo_dir).with_context(|| format!("open {}", repo_dir.display()))
}
/// Map the URL slug `name` (e.g. `analytics` or `blog.bythewood.me`) to the
/// bare repo directory `<repo_root>/<name>.git`.
///
/// # Errors
///
/// * `invalid repo name` when `name` is empty, contains `/` or `\`, starts
///   with a dot, or contains `..` — so a request such as `../etc/passwd`
///   cannot escape the repo root.
/// * `repo not found` when the resulting path does not exist.
pub fn resolve_path(repo_root: &Path, name: &str) -> Result<PathBuf> {
    let has_separator = name.contains(|c: char| c == '/' || c == '\\');
    let is_unsafe =
        name.is_empty() || has_separator || name.starts_with('.') || name.contains("..");
    if is_unsafe {
        return Err(anyhow!("invalid repo name: {name}"));
    }

    // The suffix is appended to the whole slug on purpose:
    // `Path::set_extension` would replace the last dotted segment and turn
    // `blog.bythewood.me` into the non-existent `blog.bythewood.git`.
    let candidate = repo_root.join(format!("{name}.git"));
    if candidate.exists() {
        Ok(candidate)
    } else {
        Err(anyhow!("repo not found: {name}"))
    }
}
/// Build the landing-page summary for the bare repo at `path`.
///
/// Fails only when `gix::open` cannot open the repository. A repo whose HEAD
/// commit cannot be resolved still produces a summary, with `head_summary`,
/// `head_time` and `head_id` all left as `None`. `clone_base` has trailing
/// slashes trimmed before `/<name>.git` is appended.
pub fn repo_summary(path: &Path, clone_base: &str) -> Result<RepoSummary> {
    let repo = gix::open(path).with_context(|| format!("open {}", path.display()))?;
    let name = short_name(path);
    let clone_url = format!("{}/{}.git", clone_base.trim_end_matches('/'), name);

    let mut head_summary = None;
    let mut head_time = None;
    let mut head_id = None;
    if let Ok(commit) = repo.head_commit() {
        head_id = Some(commit.id().to_string());
        // An undecodable message degrades to an empty summary rather than
        // hiding the commit.
        head_summary = Some(match commit.message() {
            Ok(msg) => msg.summary().to_string(),
            Err(_) => String::new(),
        });
        head_time = commit.time().ok().map(|t| t.seconds);
    }

    Ok(RepoSummary {
        name,
        description: read_description(path),
        default_branch: read_default_branch(&repo),
        head_summary,
        head_time,
        head_id,
        clone_url,
    })
}
/// Return the trimmed contents of the `description` file inside the repo
/// directory `path`.
///
/// A missing or unreadable file yields an empty string, and so does the
/// placeholder git writes by default (anything starting with
/// `Unnamed repository`), so the landing page never shows that boilerplate.
fn read_description(path: &Path) -> String {
    match std::fs::read_to_string(path.join("description")) {
        Ok(contents) => {
            let text = contents.trim();
            if text.starts_with("Unnamed repository") {
                String::new()
            } else {
                text.to_owned()
            }
        }
        Err(_) => String::new(),
    }
}
/// Name of the branch HEAD points at.
///
/// HEAD is normally a symbolic ref such as `refs/heads/master`; the
/// `refs/heads/` prefix is removed when present, otherwise the full ref name
/// is returned. Falls back to `"master"` when HEAD cannot be read or has no
/// referent name.
fn read_default_branch(repo: &gix::Repository) -> String {
    let Ok(head) = repo.head() else {
        return "master".to_string();
    };
    let Some(referent) = head.referent_name() else {
        return "master".to_string();
    };
    let full_name = referent.as_bstr().to_string();
    if let Some(branch) = full_name.strip_prefix("refs/heads/") {
        return branch.to_string();
    }
    full_name
}
/// Turn a revspec (branch name, tag, full or short oid) into the id of the
/// commit it ultimately refers to.
///
/// The parsed object is peeled to a commit, so the returned id is always a
/// commit id when this succeeds.
///
/// # Errors
///
/// Fails with `revspec <rev>: ...` when the spec cannot be parsed, or with
/// the underlying gix error when the object is missing or cannot be peeled
/// to a commit.
pub fn resolve_rev(repo: &gix::Repository, rev: &str) -> Result<gix::ObjectId> {
    let parsed = match repo.rev_parse_single(rev) {
        Ok(found) => found.detach(),
        Err(e) => return Err(anyhow!("revspec {rev}: {e}")),
    };
    let commit = repo
        .find_object(parsed)?
        .peel_to_kind(gix::object::Kind::Commit)?;
    Ok(commit.id)
}
/// Load the commit `oid` and flatten it into a [`CommitInfo`].
///
/// `short_id` is the first 8 characters of the id, `message` is the raw
/// message converted lossily to UTF-8, and `time` is the seconds value of
/// the author signature.
///
/// # Errors
///
/// Propagates gix errors when the commit cannot be found or its message or
/// author signature cannot be decoded.
pub fn commit_info(repo: &gix::Repository, oid: gix::ObjectId) -> Result<CommitInfo> {
    let commit = repo.find_commit(oid)?;
    let id = commit.id().to_string();
    let summary = commit.message()?.summary().to_string();
    let message = String::from_utf8_lossy(commit.message_raw()?.as_ref()).into_owned();
    let signature = commit.author()?;

    Ok(CommitInfo {
        // Computed before `id` is moved into the struct on the next line.
        short_id: id.chars().take(8).collect(),
        id,
        summary,
        message,
        author: signature.name.to_string(),
        author_email: signature.email.to_string(),
        time: signature.time.seconds,
        parents: commit.parent_ids().map(|parent| parent.to_string()).collect(),
    })
}
/// Collect up to `limit` commits reachable from `start`, in the order the
/// gix revision walk yields them, each converted with `commit_info`.
///
/// # Errors
///
/// Propagates any error from starting or advancing the revision walk, or
/// from loading an individual commit.
pub fn recent_commits(
    repo: &gix::Repository,
    start: gix::ObjectId,
    limit: usize,
) -> Result<Vec<CommitInfo>> {
    // `limit` is only an upper bound and may come from a caller-controlled
    // value; reserving it verbatim would allocate (or panic on capacity
    // overflow) before a single commit is read. Reserve a bounded amount and
    // let the vector grow if the history really is that long.
    const MAX_PREALLOC: usize = 1024;
    let mut out = Vec::with_capacity(limit.min(MAX_PREALLOC));

    let walk = repo.rev_walk([start]).all()?;
    for info in walk.take(limit) {
        out.push(commit_info(repo, info?.id)?);
    }
    Ok(out)
}
/// List the directory at `path` inside the tree of commit `rev`.
///
/// `path` is a `/`-joined string (`""`, `"src"`, `"src/routes"`), not
/// pre-split; an empty path lists the root tree.
///
/// Returns the entries — trees first, then everything else, each group
/// sorted by name — together with the non-empty path components, for use as
/// a breadcrumb.
///
/// # Errors
///
/// Fails when the commit or its tree cannot be loaded, with
/// `path not found` when `path` does not exist, with `not a tree` when it
/// names something other than a directory, or when a tree entry cannot be
/// decoded.
pub fn list_tree(
    repo: &gix::Repository,
    rev: gix::ObjectId,
    path: &str,
) -> Result<(Vec<TreeEntry>, Vec<String>)> {
    let commit = repo.find_commit(rev)?;
    let mut tree = commit.tree()?;
    let breadcrumb: Vec<String> = path
        .split('/')
        .filter(|p| !p.is_empty())
        .map(str::to_string)
        .collect();
    if !breadcrumb.is_empty() {
        let entry = tree
            .peel_to_entry_by_path(std::path::PathBuf::from(path))?
            .ok_or_else(|| anyhow!("path not found: {path}"))?;
        if !entry.mode().is_tree() {
            return Err(anyhow!("not a tree: {path}"));
        }
        tree = entry
            .object()?
            .try_into_tree()
            .map_err(|_| anyhow!("not a tree: {path}"))?;
    }

    let mut entries = Vec::new();
    for entry_ref in tree.iter() {
        let entry_ref = entry_ref?;
        let name = entry_ref.filename().to_string();
        let mode = entry_ref.mode();
        let kind = if mode.is_tree() {
            "tree"
        } else if mode.is_link() {
            "link"
        } else if mode.is_commit() {
            "commit"
        } else {
            "blob"
        };
        // Only the object header is read: loading every blob of a directory
        // just to learn its length would pull large assets into memory for a
        // mere listing (the same concern `read_blob` guards against).
        let size = if kind == "blob" {
            repo.find_header(entry_ref.oid()).ok().map(|h| h.size())
        } else {
            None
        };
        entries.push(TreeEntry {
            name,
            kind,
            mode: format!("{:o}", *mode),
            size,
            id: entry_ref.oid().to_string(),
        });
    }

    // Trees first (`false < true`), then alphabetical within each group.
    entries.sort_by(|a, b| {
        (a.kind != "tree")
            .cmp(&(b.kind != "tree"))
            .then_with(|| a.name.cmp(&b.name))
    });
    Ok((entries, breadcrumb))
}
/// Load the file at `path` inside the tree of commit `rev`.
///
/// The returned [`BlobInfo`] carries the raw bytes, their length and a
/// binary/text guess from `looks_binary`.
///
/// # Errors
///
/// Fails when the commit or tree cannot be loaded, when `path` is empty or
/// does not exist, when the object is larger than [`MAX_BLOB_SIZE`], or when
/// the entry is not a blob.
pub fn read_blob(repo: &gix::Repository, rev: gix::ObjectId, path: &str) -> Result<BlobInfo> {
    let commit = repo.find_commit(rev)?;
    let mut tree = commit.tree()?;
    if path.is_empty() {
        return Err(anyhow!("empty path"));
    }
    let entry = tree
        .peel_to_entry_by_path(std::path::PathBuf::from(path))?
        .ok_or_else(|| anyhow!("path not found: {path}"))?;

    // Peek the object header before loading: try_into_blob() below would
    // otherwise pull the entire blob into memory, so a 500MB asset in a
    // repo could OOM the container under concurrent requests.
    let header = repo.find_header(entry.oid())?;
    if header.size() > MAX_BLOB_SIZE {
        return Err(anyhow!(
            "blob too large to display ({} bytes; cap is {} bytes)",
            header.size(),
            MAX_BLOB_SIZE
        ));
    }

    let mut blob = entry
        .object()?
        .try_into_blob()
        .map_err(|_| anyhow!("not a blob: {path}"))?;
    // Move the buffer out instead of cloning it: a clone would briefly hold
    // two copies of up to MAX_BLOB_SIZE bytes per request.
    let data = std::mem::take(&mut blob.data);
    let size = data.len() as u64;
    let is_binary = looks_binary(&data);
    Ok(BlobInfo { data, size, is_binary })
}
/// Heuristic text/binary test in the spirit of git's own: the data counts as
/// binary when a NUL byte appears within its first 8192 bytes. Good enough
/// to pick between the highlighted file view and a plain download link.
fn looks_binary(data: &[u8]) -> bool {
    let probe_len = data.len().min(8192);
    data[..probe_len].contains(&0)
}
/// Find the README at the root of commit `rev`'s tree and return its file
/// name together with its bytes.
///
/// Names are matched case-insensitively against `README`, `README.md`,
/// `README.markdown`, `README.txt` and `README.rst`; only blob entries are
/// considered. When several match, Markdown wins, then the extension-less
/// file, then `.txt` / `.rst`; ties keep the entry that comes first in the
/// tree. Returns `None` when nothing matches or any lookup fails.
pub fn read_readme(repo: &gix::Repository, rev: gix::ObjectId) -> Option<(String, Vec<u8>)> {
    let commit = repo.find_commit(rev).ok()?;
    let tree = commit.tree().ok()?;

    // Best candidate so far as (rank, name, oid); a lower rank is preferred.
    let mut best: Option<(u8, String, gix::ObjectId)> = None;
    for e in tree.iter() {
        let Ok(e) = e else { continue };
        if !e.mode().is_blob() {
            continue;
        }
        let name = e.filename().to_string();
        let rank = match name.to_ascii_lowercase().as_str() {
            "readme.md" | "readme.markdown" => 0,
            "readme" => 1,
            "readme.txt" | "readme.rst" => 2,
            _ => continue,
        };
        // Strict `<` keeps the earliest entry among equally ranked ones.
        if best.as_ref().map_or(true, |(r, _, _)| rank < *r) {
            best = Some((rank, name, e.oid().into()));
        }
    }

    let (_, name, oid) = best?;
    let mut blob = repo.find_object(oid).ok()?.try_into_blob().ok()?;
    // Take the buffer rather than cloning it; the blob is dropped right after.
    Some((name, std::mem::take(&mut blob.data)))
}
/// Render `git show --format= --patch <oid>` for a single commit and parse
/// the unified-diff output into per-file hunks. We shell out instead of
/// driving `gix-diff` directly: git is the canonical implementation of
/// unified diff and it's already on the runtime image (we need it for
/// `git http-backend`), so the marginal cost is one extra fork per commit
/// view, not a new dependency.
pub fn diff_commit(repo_path: &Path, oid: gix::ObjectId) -> Result<Vec<FileDiff>> {
let output = std::process::Command::new("git")
.arg("-c")
// Don't octal-escape non-ASCII filenames; we parse the unified-diff
// header by string-matching `diff --git a/... b/...` and quoted paths
// would break that (and the resulting `path` would render as gibberish).
.arg("core.quotePath=false")
.arg("-C")
.arg(repo_path)
.arg("show")
.arg("--format=")
.arg("--patch")
.arg("--no-color")
.arg("-M")
.arg(oid.to_string())
.output()
.context("spawn git show")?;
if !output.status.success() {
return Err(anyhow!(
"git show exited {:?}: {}",
output.status.code(),
String::from_utf8_lossy(&output.stderr)
));
}
Ok(parse_unified_diff(&String::from_utf8_lossy(&output.stdout)))
}
/// Parse `git show --patch` / `git diff` output into one [`FileDiff`] per
/// `diff --git` section.
///
/// Recognised per-file header lines:
/// * `new file mode` / `deleted file mode` set the status to `"added"` /
///   `"deleted"`.
/// * `rename from <old>` sets the status to `"renamed"` and records the old
///   path; `rename to <new>` sets the new path.
/// * `Binary files ...` marks the file as binary.
///
/// Each `@@` line opens a new hunk; the lines that follow are classified by
/// their first character (`+` add, `-` del, space ctx) and stored without
/// it. Any other line inside a hunk (e.g. `\ No newline at end of file`) is
/// ignored, as is everything before the first `diff --git` line.
fn parse_unified_diff(text: &str) -> Vec<FileDiff> {
    let mut files: Vec<FileDiff> = Vec::new();
    let mut current_file: Option<FileDiff> = None;
    let mut current_hunk: Option<DiffHunk> = None;

    for line in text.lines() {
        if let Some(rest) = line.strip_prefix("diff --git ") {
            // A new file section closes the previous one (and its open hunk).
            if let Some(mut done) = current_file.take() {
                done.hunks.extend(current_hunk.take());
                files.push(done);
            }
            current_file = Some(FileDiff {
                path: diff_header_new_path(rest),
                old_path: None,
                status: "modified",
                hunks: Vec::new(),
                is_binary: false,
            });
            continue;
        }
        let Some(file) = current_file.as_mut() else { continue };

        if line.starts_with("new file mode") {
            file.status = "added";
        } else if line.starts_with("deleted file mode") {
            file.status = "deleted";
        } else if let Some(old) = line.strip_prefix("rename from ") {
            file.status = "renamed";
            file.old_path = Some(old.to_string());
        } else if let Some(new) = line.strip_prefix("rename to ") {
            // More reliable than the `diff --git` header, which is ambiguous
            // for renames when either path contains a space.
            file.path = new.to_string();
        } else if line.starts_with("Binary files ") {
            file.is_binary = true;
        } else if line.starts_with("@@") {
            file.hunks.extend(current_hunk.take());
            current_hunk = Some(DiffHunk {
                header: line.to_string(),
                lines: Vec::new(),
            });
        } else if let Some(hunk) = current_hunk.as_mut() {
            // The `--- a/..` / `+++ b/..` file headers always precede the
            // first `@@` of a file, so they are never seen here. Inside a
            // hunk a line such as `+++i;` or `--- note` is real content
            // (`++i;` added, `-- note` removed) and must be kept.
            let mut chars = line.chars();
            let kind = match chars.next() {
                Some('+') => "add",
                Some('-') => "del",
                Some(' ') => "ctx",
                _ => continue,
            };
            hunk.lines.push(DiffLine {
                kind,
                text: chars.as_str().to_string(),
            });
        }
    }

    if let Some(mut done) = current_file.take() {
        done.hunks.extend(current_hunk.take());
        files.push(done);
    }
    files
}

/// Extract the new-side path from the text after `diff --git `.
///
/// For everything but renames the header reads `a/<p> b/<p>` with the same
/// path twice, so the text can be cut exactly in the middle; that stays
/// correct for paths containing spaces or a `b/` directory. Otherwise the
/// text after the last ` b/` is used, falling back to the last
/// space-separated token.
fn diff_header_new_path(rest: &str) -> String {
    if let Some(body) = rest.strip_prefix("a/") {
        // body == "<p> b/<p>", so its length is 2 * len(p) + 3.
        if body.len() >= 3 && (body.len() - 3) % 2 == 0 {
            let n = (body.len() - 3) / 2;
            // `get` returns None if `n` is not on a char boundary.
            if let (Some(old), Some(tail)) = (body.get(..n), body.get(n..)) {
                if tail.strip_prefix(" b/") == Some(old) {
                    return old.to_string();
                }
            }
        }
    }
    match rest.rsplit_once(" b/") {
        Some((_, new)) => new.to_string(),
        None => rest.rsplit(' ').next().unwrap_or_default().to_string(),
    }
}