fix: silent relay state drift when activity bus drops events #53

Merged
hodlbod merged 1 commit from userAdityaa/caravel:relay-state-drift into master 2026-04-29 18:36:36 +00:00
4 changed files with 62 additions and 12 deletions
+35
View File
@@ -120,6 +120,10 @@ impl Billing {
pub async fn start(self) {
let mut rx = self.command.notify.subscribe();
if let Err(error) = self.reconcile_relay_subscriptions("startup").await {
tracing::error!(error = %error, "failed to reconcile relay billing state on startup");
}
loop {
match rx.recv().await {
Ok(activity) => {
@@ -129,12 +133,39 @@ impl Billing {
}
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
tracing::warn!(missed = n, "billing lagged");
if let Err(error) = self.reconcile_relay_subscriptions("lagged").await {
tracing::error!(error = %error, "failed to reconcile relay billing state after lag");
}
}
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
}
}
}
/// Walks every relay returned by `self.query.list_relays()` and runs
/// `sync_relay_subscription_for_relay` on each one.
///
/// `source` is a caller-supplied label (the visible callers pass `"startup"`
/// and `"lagged"`) that is attached to every log line emitted here.
///
/// # Errors
///
/// Only a failure to list the relays is returned to the caller. A failure
/// while syncing an individual relay is logged and the loop moves on to the
/// next relay.
async fn reconcile_relay_subscriptions(&self, source: &str) -> Result<()> {
    let relays = self.query.list_relays().await?;
    let relay_count = relays.len();

    // Nothing to reconcile: return quietly without emitting the info line.
    if relay_count == 0 {
        return Ok(());
    }

    tracing::info!(source, relay_count, "reconciling relay billing state");

    for relay in relays {
        let outcome = self.sync_relay_subscription_for_relay(&relay).await;
        let Err(error) = outcome else {
            continue;
        };

        // Best-effort: record the failure and keep going with the remaining relays.
        tracing::error!(
            source,
            relay = %relay.id,
            error = %error,
            "failed to reconcile relay billing state"
        );
    }

    Ok(())
}
async fn handle_activity(&self, activity: &Activity) -> Result<()> {
let needs_billing_sync = matches!(
activity.activity_type.as_str(),
@@ -158,6 +189,10 @@ impl Billing {
return Ok(());
};
self.sync_relay_subscription_for_relay(&relay).await
}
async fn sync_relay_subscription_for_relay(&self, relay: &Relay) -> Result<()> {
let Some(tenant) = self.query.get_tenant(&relay.tenant).await? else {
return Ok(());
};
+5 -5
View File
@@ -113,12 +113,12 @@ impl Command {
sqlx::query(
"INSERT INTO relay (
id, tenant, schema, subdomain, plan, status, sync_error,
id, tenant, schema, subdomain, plan, status, synced, sync_error,
info_name, info_icon, info_description,
policy_public_join, policy_strip_signatures,
groups_enabled, management_enabled, blossom_enabled,
livekit_enabled, push_enabled
) VALUES (?, ?, ?, ?, ?, 'active', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
) VALUES (?, ?, ?, ?, ?, 'active', 0, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
)
.bind(&relay.id)
.bind(&relay.tenant)
@@ -151,7 +151,7 @@ impl Command {
sqlx::query(
"UPDATE relay
SET tenant = ?, schema = ?, subdomain = ?, plan = ?, status = ?, sync_error = ?,
SET tenant = ?, schema = ?, subdomain = ?, plan = ?, status = ?, sync_error = ?, synced = 0,
info_name = ?, info_icon = ?, info_description = ?,
policy_public_join = ?, policy_strip_signatures = ?,
groups_enabled = ?, management_enabled = ?, blossom_enabled = ?,
@@ -203,7 +203,7 @@ impl Command {
) -> Result<()> {
let mut tx = self.pool.begin().await?;
sqlx::query("UPDATE relay SET status = ? WHERE id = ?")
sqlx::query("UPDATE relay SET status = ?, synced = 0 WHERE id = ?")
.bind(status)
.bind(relay_id)
.execute(&mut *tx)
@@ -224,7 +224,7 @@ impl Command {
pub async fn fail_relay_sync(&self, relay: &Relay, sync_error: String) -> Result<()> {
let mut tx = self.pool.begin().await?;
sqlx::query("UPDATE relay SET sync_error = ? WHERE id = ?")
sqlx::query("UPDATE relay SET synced = 0, sync_error = ? WHERE id = ?")
.bind(&sync_error)
.bind(&relay.id)
.execute(&mut *tx)
+20 -5
View File
@@ -53,8 +53,8 @@ impl Infra {
pub async fn start(self) {
let mut rx = self.command.notify.subscribe();
if let Err(e) = self.schedule_startup_retries().await {
tracing::error!(error = %e, "failed to schedule relay sync retries on startup");
if let Err(error) = self.reconcile_relay_state("startup").await {
tracing::error!(error = %error, "failed to reconcile relay state on startup");
}
loop {
@@ -66,6 +66,10 @@ impl Infra {
}
Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => {
tracing::warn!(missed = n, "infra lagged");
if let Err(error) = self.reconcile_relay_state("lagged").await {
tracing::error!(error = %error, "failed to reconcile relay state after lag");
}
}
Err(tokio::sync::broadcast::error::RecvError::Closed) => break,
}
@@ -95,11 +99,22 @@ impl Infra {
Ok(())
}
async fn schedule_startup_retries(&self) -> Result<()> {
let relays = self.query.list_relays_with_sync_error().await?;
async fn reconcile_relay_state(&self, source: &str) -> Result<()> {
let relays = self.query.list_relays_pending_sync().await?;
if relays.is_empty() {
return Ok(());
}
tracing::info!(source, relay_count = relays.len(), "reconciling pending relay state");
for relay in relays {
self.schedule_relay_sync_retry(&relay.id, "startup").await?;
if relay.sync_error.trim().is_empty() {
let is_new = relay.synced == 0;
self.sync_and_report(&relay, is_new).await;
} else {
self.schedule_relay_sync_retry(&relay.id, source).await?;
}
}
Ok(())
+2 -2
View File
@@ -94,7 +94,7 @@ impl Query {
Ok(rows)
}
pub async fn list_relays_with_sync_error(&self) -> Result<Vec<Relay>> {
pub async fn list_relays_pending_sync(&self) -> Result<Vec<Relay>> {
let rows = sqlx::query_as::<_, Relay>(
"SELECT id, tenant, schema, subdomain, plan, stripe_subscription_item_id,
status, sync_error,
@@ -103,7 +103,7 @@ impl Query {
groups_enabled, management_enabled, blossom_enabled,
livekit_enabled, push_enabled, synced
FROM relay
WHERE TRIM(sync_error) != ''
WHERE synced = 0 OR TRIM(sync_error) != ''
ORDER BY id",
)
.fetch_all(&self.pool)