heartwood every commit a ring

Harden Lighthouse runner and surface failures

d7b47000 by Isaac Bythewood · 25 days ago

Harden Lighthouse runner and surface failures

Lighthouse runs were silently stalling the scheduler. subprocess.run had
no timeout and the queue_process thread joined with no timeout, so one
hung Chromium wedged the lighthouse+crawler queue indefinitely while
status pings kept working in their separate queue.

Adds a 180s subprocess timeout, a 300s join timeout, and a LighthouseError
that propagates stderr context instead of being swallowed by a broad
`except Exception`.
Results are persisted to last_lighthouse_error / last_lighthouse_success_at
so failures are visible in the UI and logs instead of silently dropping
the day's data.
modified properties/management/commands/scheduler.py
@@ -44,10 +44,16 @@ class Command(BaseCommand):        q_status.put((property_id, property_type))    def queue_process(self):        # Cap on join() so a wedged lighthouse/crawler can't freeze the queue        # indefinitely. Must exceed status.lighthouse.SUBPROCESS_TIMEOUT_SECONDS        # so a normal slow run still completes inside the window.        JOIN_TIMEOUT = 300        while True:            if not q.empty():                threads = []                for i in range(2):                    if q.empty():                        break                    q_data = q.get()                    if q_data[1] == "lighthouse":                        t = threading.Thread(target=self.thread_target_lighthouse, args=(q_data[0],))
@@ -57,7 +63,11 @@ class Command(BaseCommand):                    t.start()                    threads.append(t)                for t in threads:                    t.join()                    t.join(timeout=JOIN_TIMEOUT)                    if t.is_alive():                        self.stdout.write(                            "[Scheduler] Thread still running after {}s, abandoning".format(JOIN_TIMEOUT)                        )                    q.task_done()            time.sleep(1)
added properties/migrations/0008_lighthouse_error_tracking.py
# @@ -0,0 +1,21 @@
# Adds the two Property columns used to surface Lighthouse outcomes:
# a success timestamp plus the text of the most recent failure, so the
# UI can distinguish "never ran" from "last run failed" and show why.
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('properties', '0007_add_alert_state_tracking'),
    ]

    operations = [
        # Stamped on every successful run; stays NULL until the first one.
        migrations.AddField(
            model_name='property',
            name='last_lighthouse_success_at',
            field=models.DateTimeField(blank=True, null=True),
        ),
        # Holds the LighthouseError message on failure; cleared on success.
        migrations.AddField(
            model_name='property',
            name='last_lighthouse_error',
            field=models.TextField(blank=True, null=True),
        ),
    ]
modified properties/models.py
@@ -14,7 +14,11 @@ from django.utils.functional import cached_propertyfrom django.conf import settingsfrom crawler.runner import run_seo_spiderfrom status.lighthouse import fetch_lighthouse_results, parse_lighthouse_resultsfrom status.lighthouse import (    LighthouseError,    fetch_lighthouse_results,    parse_lighthouse_results,)logger = logging.getLogger(__name__)
@@ -360,6 +364,8 @@ class Property(CrawlerMixin, AlertsMixin, SecurityMixin, models.Model):    lighthouse_scores = models.JSONField(blank=True, null=True)    last_lighthouse_run_at = models.DateTimeField(blank=True, null=True)    last_lighthouse_success_at = models.DateTimeField(blank=True, null=True)    last_lighthouse_error = models.TextField(blank=True, null=True)    next_lighthouse_run_at = models.DateTimeField(blank=True, null=True)    # Alert state tracking
@@ -453,11 +459,27 @@ class Property(CrawlerMixin, AlertsMixin, SecurityMixin, models.Model):        try:            results = fetch_lighthouse_results(self.url)            scores = parse_lighthouse_results(results)            if scores:                self.lighthouse_scores = scores                self.save(update_fields=["lighthouse_scores"])        except Exception:            pass        except LighthouseError as e:            logger.warning("Lighthouse failed for %s: %s", self.url, e)            self.last_lighthouse_error = str(e)            self.save(update_fields=["last_lighthouse_error"])            return        except Exception as e:            logger.exception("Unexpected lighthouse error for %s", self.url)            self.last_lighthouse_error = f"{type(e).__name__}: {e}"            self.save(update_fields=["last_lighthouse_error"])            return        self.lighthouse_scores = scores        self.last_lighthouse_success_at = timezone.now()        self.last_lighthouse_error = None        self.save(            update_fields=[                "lighthouse_scores",                "last_lighthouse_success_at",                "last_lighthouse_error",            ]        )    @property    def total_checks(self):
modified properties/templates/properties/property.html
@@ -129,6 +129,18 @@</div>{% endif %}{% if property.last_lighthouse_error and user.is_authenticated %}<div class="container-fluid mb-4">  <div class="alert alert-warning mb-0" role="alert">    <strong>Last Lighthouse run failed:</strong>    <code>{{ property.last_lighthouse_error }}</code>    {% if property.last_lighthouse_run_at %}      <span class="text-muted small d-block">attempted {{ property.last_lighthouse_run_at|timesince }} ago</span>    {% endif %}  </div></div>{% endif %}<div class="container mt-4">  <div class="row">    <div class="col-12 col-md-8">
modified status/lighthouse.py
@@ -1,27 +1,75 @@"""A basic wrapper around the lighthouse node module.A wrapper around the lighthouse node CLI.Raises LighthouseError with a descriptive message on failure so callers canlog/persist the reason instead of silently dropping the result."""import jsonimport loggingimport shutilimport subprocessfrom django.conf import settingslogger = logging.getLogger(__name__)# Lighthouse's chrome-launcher searches for a browser on its own; we only# pin CHROME_PATH when we can resolve a known name, for determinism in# production (Alpine ships `chromium`). If nothing is found, fall through# and let chrome-launcher do its own lookup.CHROMIUM_BINARY = (    shutil.which("chromium")    or shutil.which("chromium-browser")    or shutil.which("google-chrome"))CHROME_FLAGS = "--headless --no-sandbox --disable-dev-shm-usage --disable-gpu"# Lighthouse itself can take 60-90s on a slow site; the outer timeout is a# backstop against Chromium hangs that would otherwise wedge the scheduler.SUBPROCESS_TIMEOUT_SECONDS = 180class LighthouseError(Exception):    passdef fetch_lighthouse_results(url):    command = [        f"{settings.BASE_DIR}/node_modules/.bin/lighthouse",        url,        '--chrome-flags="--headless --no-sandbox --disable-dev-shm-usage"',        '--output="json"',        '--output-path="stdout"',        '--max-wait-for-load=5000',        f"--chrome-flags={CHROME_FLAGS}",        "--output=json",        "--output-path=stdout",        "--quiet",    ]    env = {"PATH": "/usr/bin:/bin:/usr/local/bin"}    if CHROMIUM_BINARY:        env["CHROME_PATH"] = CHROMIUM_BINARY    try:        process = subprocess.run(command, check=True, stdout=subprocess.PIPE)    except subprocess.CalledProcessError:        return None    return json.loads(process.stdout)        process = subprocess.run(            command,            check=True,            stdout=subprocess.PIPE,            stderr=subprocess.PIPE,            timeout=SUBPROCESS_TIMEOUT_SECONDS,            env=env,        )    except 
subprocess.TimeoutExpired:        raise LighthouseError(f"lighthouse timed out after {SUBPROCESS_TIMEOUT_SECONDS}s")    except subprocess.CalledProcessError as e:        stderr = (e.stderr or b"").decode("utf-8", errors="replace").strip()        raise LighthouseError(f"lighthouse exited {e.returncode}: {stderr[-500:]}")    except FileNotFoundError as e:        raise LighthouseError(f"lighthouse binary missing: {e}")    try:        return json.loads(process.stdout)    except json.JSONDecodeError as e:        raise LighthouseError(f"could not parse lighthouse output: {e}")def parse_lighthouse_results(results):
@@ -32,7 +80,11 @@ def parse_lighthouse_results(results):            "Best practices": results["categories"]["best-practices"]["score"],            "SEO": results["categories"]["seo"]["score"],        }    except KeyError:        return None    scores = {k: round(v * 100) for k, v in scores.items()}    return scores    except KeyError as e:        raise LighthouseError(f"missing category in lighthouse output: {e}")    if any(v is None for v in scores.values()):        missing = [k for k, v in scores.items() if v is None]        raise LighthouseError(f"null score(s) returned by lighthouse: {missing}")    return {k: round(v * 100) for k, v in scores.items()}