heartwood every commit a ring

Commit alert state before firing notifications

cb37029c by Isaac Bythewood · 24 days ago

Commit alert state before firing notifications

The previous ordering sent the notifications (email and Discord) inside
the atomic block before locked.save() ran. When SQLite returned
"database is locked" from a concurrent scheduler writer, the save rolled
back but the notifications had already been queued — so every subsequent
failed check re-entered the up->down branch and re-fired the alert.

Save first, then notify after the transaction exits. If the save
raises, no notification is sent and the next cycle retries from the same
state. Also enable WAL mode, a 30s busy timeout, and IMMEDIATE
transactions on the production SQLite config so concurrent scheduler
writers actually serialize instead of racing into locked errors.
modified properties/models.py
@@ -182,34 +182,40 @@ class AlertsMixin:        """        is_currently_up = current_status_code == 200        # Lock the property row so concurrent checks can't both observe the        # same alert_state and double-fire transitions.        # Commit the state transition inside the atomic block BEFORE firing        # notifications. If the save raises (e.g. SQLite "database is locked"        # from a concurrent writer), the transaction rolls back and nothing        # is emailed — the next check will retry from the same state. Sending        # first would mean a failed save leaves us firing the same alert on        # every subsequent check.        transition = None        with transaction.atomic():            locked = Property.objects.select_for_update().get(pk=self.pk)            if is_currently_up and locked.alert_state == "down":                self.send_recovery_email()                self.send_recovery_discord_message()                locked.alert_state = "up"                locked.last_alert_sent = timezone.now()                locked.save(update_fields=["alert_state", "last_alert_sent"])                self.alert_state = locked.alert_state                self.last_alert_sent = locked.last_alert_sent                transition = "recovery"            elif not is_currently_up and locked.alert_state == "up":                # Require at least 2 consecutive failures to avoid false positives.                
checks = self.statuses.order_by("-created_at")[:2]                if (                    len(checks) >= 2                    and checks[0].status_code != 200                    and checks[1].status_code != 200                ):                    self.send_down_email()                    self.send_down_discord_message()                    locked.alert_state = "down"                    locked.last_alert_sent = timezone.now()                    locked.save(update_fields=["alert_state", "last_alert_sent"])                    self.alert_state = locked.alert_state                    self.last_alert_sent = locked.last_alert_sent                    transition = "down"            if transition is not None:                locked.alert_state = "up" if transition == "recovery" else "down"                locked.last_alert_sent = timezone.now()                locked.save(update_fields=["alert_state", "last_alert_sent"])                self.alert_state = locked.alert_state                self.last_alert_sent = locked.last_alert_sent        if transition == "recovery":            self.send_recovery_email()            self.send_recovery_discord_message()        elif transition == "down":            self.send_down_email()            self.send_down_discord_message()class CrawlerMixin:
modified status/settings/production.py
@@ -48,6 +48,16 @@ DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": "/data/db/db.sqlite3",
        "OPTIONS": {
            # WAL lets readers run concurrently with a writer; the scheduler
            # has several worker threads, so the default rollback journal
            # yields frequent "database is locked" errors. A 30s busy timeout
            # gives contending writers a chance to serialize rather than
            # fail, which previously stranded alert state mid-transition.
            "timeout": 30,
            "init_command": "PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;",
            "transaction_mode": "IMMEDIATE",
        },
    }
}