Wire health probes, host stats, and LLM proxy round-trip

Three things from the latest arcadia-app pull:

- health.ts: client for /api/v1/health{,/:service,/detailed,/host}.
  monitoring.tsx now reads real per-subsystem probe state instead of
  synthesizing it from indirect signals (rate limits, sessions, jobs).
- New Host tab on Monitoring with KPI tiles + per-core CPU bars,
  load-avg cards, memory + swap usage, and per-mount disk bars,
  backed by /api/v1/health/host.
- llm-proxy.ts: typed errors (secret_disabled, ip_not_allowed, etc.)
  and a probeProxy() that round-trips a 1-token chat. settings.tsx's
  "Test connection" in proxy mode now exercises the real endpoint
  instead of just confirming the adapter built. Contract doc flipped
  from "not yet implemented" to "implemented".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
jules
2026-05-02 17:05:22 +10:00
parent 0fcb9e40f1
commit 29030c9e72
5 changed files with 661 additions and 46 deletions

View File

@@ -71,6 +71,15 @@ import {
type RateLimit,
type Space,
} from "~/lib/arcadia/monitoring"
import {
getHealth,
getHostStats,
SUBSYSTEMS,
type HealthSubsystem,
type HostStats,
type OverallHealth,
type SubsystemHealth,
} from "~/lib/arcadia/health"
import { pageTitle } from "~/lib/page-meta"
import { useSession } from "~/lib/session"
import { useRegisterAdminContext } from "~/lib/admin-context"
@@ -86,6 +95,8 @@ interface DashboardData {
spaces: Space[]
droplets: Droplet[]
auditStats: AuditStats | null
health: OverallHealth | null
host: HostStats | null
}
const EMPTY: DashboardData = {
@@ -97,6 +108,8 @@ const EMPTY: DashboardData = {
spaces: [],
droplets: [],
auditStats: null,
health: null,
host: null,
}
export default function MonitoringRoute() {
@@ -121,6 +134,8 @@ export default function MonitoringRoute() {
spaces,
droplets,
auditStats,
health,
host,
] = await Promise.all([
getJobStats(arcadia).catch(() => null),
getRecentJobs(arcadia, { limit: 50 }).catch(() => []),
@@ -132,6 +147,8 @@ export default function MonitoringRoute() {
getAuditStats(arcadia, {
from: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString(),
}).catch(() => null),
getHealth(arcadia).catch(() => null),
getHostStats(arcadia).catch(() => null),
])
setData({
jobStats,
@@ -142,6 +159,8 @@ export default function MonitoringRoute() {
spaces,
droplets,
auditStats,
health,
host,
})
} catch (err) {
setError(err instanceof ArcadiaError ? err.message : "Failed to load monitoring data.")
@@ -241,7 +260,13 @@ export default function MonitoringRoute() {
<CardHeader className="flex flex-row items-center justify-between">
<div>
<CardTitle>Service health</CardTitle>
<CardDescription>Derived from live signals on each subsystem.</CardDescription>
<CardDescription>
{data.health
? `Live probes from /api/v1/health · checked ${new Date(
data.health.checked_at,
).toLocaleTimeString()}`
: "Live probes from /api/v1/health (unavailable — backend may be down or older than the per-subsystem probe rollout)."}
</CardDescription>
</div>
<OverallStatus components={components} />
</CardHeader>
@@ -282,8 +307,72 @@ export default function MonitoringRoute() {
/>
</div>
<Tabs defaultValue="jobs">
{data.host ? (
<div className="grid grid-cols-2 gap-3 md:grid-cols-4">
<KpiTile
label="CPU usage"
value={
data.host.cpu.util_pct != null
? formatPercent(data.host.cpu.util_pct / 100)
: "—"
}
icon={<Cpu className="size-4" />}
tone={
(data.host.cpu.util_pct ?? 0) > 90
? "negative"
: (data.host.cpu.util_pct ?? 0) > 70
? "warning"
: "neutral"
}
/>
<KpiTile
label="Load avg (1m)"
value={
data.host.cpu.load_avg_1 != null
? data.host.cpu.load_avg_1.toFixed(2)
: "—"
}
icon={<Activity className="size-4" />}
tone={
data.host.cpu.load_avg_1 != null &&
data.host.cpu.num_cpus &&
data.host.cpu.load_avg_1 > data.host.cpu.num_cpus
? "warning"
: "neutral"
}
/>
<KpiTile
label="Memory used"
value={memoryUsedLabel(data.host.memory)}
icon={<HardDrive className="size-4" />}
tone={
memoryUsedPct(data.host.memory) > 90
? "negative"
: memoryUsedPct(data.host.memory) > 75
? "warning"
: "neutral"
}
/>
<KpiTile
label="Disk (busiest mount)"
value={busiestDiskLabel(data.host.disks)}
icon={<Database className="size-4" />}
tone={
busiestDiskPct(data.host.disks) > 90
? "negative"
: busiestDiskPct(data.host.disks) > 75
? "warning"
: "neutral"
}
/>
</div>
) : null}
<Tabs defaultValue="host">
<TabsList>
<TabsTrigger value="host" data-action="monitoring-tab-host">
Host
</TabsTrigger>
<TabsTrigger value="jobs" data-action="monitoring-tab-jobs">
Background jobs
</TabsTrigger>
@@ -301,6 +390,10 @@ export default function MonitoringRoute() {
</TabsTrigger>
</TabsList>
<TabsContent value="host" className="pt-4">
<HostPanel host={data.host} />
</TabsContent>
<TabsContent value="jobs" className="pt-4">
<JobsPanel
stats={data.jobStats}
@@ -343,47 +436,292 @@ export default function MonitoringRoute() {
)
}
// Synthesize a status board from the live signals we have.
// Map arcadia /health probe results onto the status-ui component model.
// "ok" → operational, "degraded" → degraded, "error" → major-outage,
// "unconfigured" → operational (storage with no configured backend is ok);
// a missing probe or an unrecognised status → partial-outage.
function buildStatusComponents(d: DashboardData): StatusComponent[] {
const apiOk = d.rateLimits.length > 0
const dbOk = d.sessions !== null
const workersState: ComponentState = (() => {
if (!d.jobStats) return "partial-outage"
const r = d.jobStats.counts.retryable ?? 0
const x = d.jobStats.counts.discarded ?? 0
if (x > 100) return "major-outage"
if (r > 50 || x > 0) return "degraded"
return "operational"
})()
const storageState: ComponentState =
d.spaces.length > 0 || d.infraSummary ? "operational" : "partial-outage"
return [
{
id: "api",
name: "API",
description: "/api/v1 — auth, REST endpoints",
state: apiOk ? "operational" : "partial-outage",
},
{
id: "db",
name: "Database",
description: "Postgres — sessions, audit log",
state: dbOk ? "operational" : "partial-outage",
},
{
id: "workers",
const subsystems = d.health?.subsystems
const meta: Record<HealthSubsystem, { name: string; description: string }> = {
api: { name: "API", description: "/api/v1 — auth, REST endpoints" },
db: { name: "Database", description: "Postgres — sessions, audit log" },
workers: {
name: "Background workers",
description: "Oban — webhook delivery, scheduled tasks",
state: workersState,
},
{
id: "storage",
storage: {
name: "Storage",
description: "DigitalOcean Spaces / S3-compatible object storage",
state: storageState,
description: "S3-compatible object storage (per platform default)",
},
]
}
return SUBSYSTEMS.map((id) => {
const probe = subsystems?.[id]
return {
id,
name: meta[id].name,
description: probe?.message ?? meta[id].description,
state: probe ? mapHealthState(probe) : "partial-outage",
} satisfies StatusComponent
})
}
// Translate one subsystem probe into a status-board component state.
// "ok" and "unconfigured" both read as operational, "degraded" stays
// degraded, "error" becomes a major outage, and any other status value
// falls back to a partial outage.
function mapHealthState(probe: SubsystemHealth): ComponentState {
  const { status } = probe
  if (status === "ok" || status === "unconfigured") return "operational"
  if (status === "degraded") return "degraded"
  if (status === "error") return "major-outage"
  return "partial-outage"
}
// --- Host panel --------------------------------------------------------
// Body of the Host tab: a CPU card, a load-average card, a memory/swap card
// and a per-mount disk card built from `host`. When `host` is null a
// PanelStub is rendered instead, explaining that host stats are unavailable.
function HostPanel({ host }: { host: HostStats | null }) {
  if (host == null) {
    return (
      <PanelStub
        icon={<Cpu className="size-5" />}
        text="Host stats unavailable. The /api/v1/health/host endpoint may not be deployed yet, or os_mon daemons aren't reachable."
      />
    )
  }

  const { cpu, memory, disks } = host

  // CPU header text: the core count is only mentioned when it is truthy.
  const schedulersLine = `${cpu.schedulers_online} BEAM schedulers online`
  const cpuSummary = cpu.num_cpus ? `${cpu.num_cpus} cores · ${schedulersLine}` : schedulersLine
  const cpuUtilText = cpu.util_pct != null ? `${cpu.util_pct.toFixed(1)}%` : "—"

  // Memory figures; "used" comes from memoryUsedBytes / memoryUsedPct.
  const usedBytes = memoryUsedBytes(memory)
  const totalBytes = memory.total_bytes ?? null
  const usedPct = memoryUsedPct(memory)
  const totalLabel = totalBytes != null ? `${formatBytes(totalBytes)} total` : "Total memory unknown"
  const availableLabel =
    memory.available_bytes != null ? ` · ${formatBytes(memory.available_bytes)} available` : ""
  const usedLabel =
    usedBytes != null && totalBytes != null
      ? `${formatBytes(usedBytes)} / ${formatBytes(totalBytes)} (${usedPct.toFixed(1)}%)`
      : "—"
  const hasBufferOrCache = memory.buffered_bytes != null || memory.cached_bytes != null

  // Swap usage is derived as total minus free, and only when both are reported.
  const swapTotal = memory.swap_total_bytes ?? null
  let swapUsed: number | null = null
  if (swapTotal != null && memory.swap_free_bytes != null) {
    swapUsed = swapTotal - memory.swap_free_bytes
  }

  return (
    <div className="flex flex-col gap-4">
      {/* CPU + load */}
      <div className="grid grid-cols-1 gap-3 lg:grid-cols-2">
        <Card>
          <CardHeader>
            <CardTitle className="text-base">CPU</CardTitle>
            <CardDescription>{cpuSummary}</CardDescription>
          </CardHeader>
          <CardContent className="flex flex-col gap-3">
            <UsageBar
              label="Overall utilisation"
              pct={cpu.util_pct ?? null}
              valueText={cpuUtilText}
            />
            {cpu.per_cpu_pct.length > 0 ? (
              <div className="flex flex-col gap-1">
                <span className="text-xs text-muted-foreground">Per core</span>
                <div className="flex flex-wrap gap-1.5">
                  {cpu.per_cpu_pct.map((corePct, coreIdx) => (
                    <div
                      key={coreIdx}
                      className="flex h-6 w-12 items-center justify-center rounded text-[11px] font-mono"
                      style={{
                        // The cell background doubles as a tiny horizontal bar.
                        background: `linear-gradient(to right, var(--primary) ${corePct}%, var(--muted) ${corePct}%)`,
                        color: corePct > 50 ? "var(--primary-foreground)" : "var(--foreground)",
                      }}
                      title={`Core ${coreIdx}: ${corePct.toFixed(1)}%`}
                    >
                      {corePct.toFixed(0)}
                    </div>
                  ))}
                </div>
              </div>
            ) : null}
          </CardContent>
        </Card>
        <Card>
          <CardHeader>
            <CardTitle className="text-base">Load average</CardTitle>
            <CardDescription>
              Unix-style load average. A value above the core count means the
              run-queue is saturated.
            </CardDescription>
          </CardHeader>
          <CardContent>
            <div className="grid grid-cols-3 gap-3">
              <LoadAvgCell label="1 min" value={cpu.load_avg_1} cores={cpu.num_cpus} />
              <LoadAvgCell label="5 min" value={cpu.load_avg_5} cores={cpu.num_cpus} />
              <LoadAvgCell label="15 min" value={cpu.load_avg_15} cores={cpu.num_cpus} />
            </div>
          </CardContent>
        </Card>
      </div>
      {/* Memory */}
      <Card>
        <CardHeader>
          <CardTitle className="text-base">Memory</CardTitle>
          <CardDescription>
            {totalLabel}
            {availableLabel}
          </CardDescription>
        </CardHeader>
        <CardContent className="flex flex-col gap-3">
          <UsageBar label="Used" pct={usedPct} valueText={usedLabel} />
          {hasBufferOrCache && (
            <div className="grid grid-cols-2 gap-2 text-xs text-muted-foreground">
              {memory.buffered_bytes != null && (
                <span>Buffered: {formatBytes(memory.buffered_bytes)}</span>
              )}
              {memory.cached_bytes != null && (
                <span>Cached: {formatBytes(memory.cached_bytes)}</span>
              )}
            </div>
          )}
          {/* The swap bar is hidden when no swap total is reported or it is zero. */}
          {swapTotal != null && swapTotal > 0 ? (
            <UsageBar
              label="Swap"
              pct={swapUsed != null ? (swapUsed / swapTotal) * 100 : null}
              valueText={
                swapUsed != null ? `${formatBytes(swapUsed)} / ${formatBytes(swapTotal)}` : "—"
              }
            />
          ) : null}
        </CardContent>
      </Card>
      {/* Disks */}
      <Card>
        <CardHeader>
          <CardTitle className="text-base">Disks</CardTitle>
          <CardDescription>One row per mount point.</CardDescription>
        </CardHeader>
        <CardContent>
          {disks.length === 0 ? (
            <p className="py-4 text-sm text-muted-foreground">No disks reported.</p>
          ) : (
            <div className="flex flex-col gap-2">
              {disks.map((disk) => (
                <UsageBar
                  key={disk.mount}
                  label={disk.mount}
                  pct={disk.used_pct}
                  valueText={`${disk.used_pct}% of ${formatBytes(disk.total_kb * 1024)}`}
                />
              ))}
            </div>
          )}
        </CardContent>
      </Card>
    </div>
  )
}
// Labelled horizontal usage bar. `pct` sets the fill width (clamped to
// 0–100) and, via barColor, the fill colour. A null `pct` draws an empty bar
// using the muted-foreground colour. `valueText` is shown as-is on the right.
function UsageBar({
  label,
  pct,
  valueText,
}: {
  label: string
  pct: number | null
  valueText: string
}) {
  let fillWidth = 0
  let fillColor = "var(--muted-foreground)"
  if (pct != null) {
    fillWidth = Math.min(100, Math.max(0, pct))
    fillColor = barColor(pct)
  }

  return (
    <div className="flex flex-col gap-1">
      <div className="flex items-baseline justify-between gap-2 text-xs">
        <span className="font-medium">{label}</span>
        <span className="font-mono text-muted-foreground">{valueText}</span>
      </div>
      <div className="h-2 w-full overflow-hidden rounded-full bg-muted">
        <div
          className="h-full rounded-full transition-[width] duration-500"
          style={{ width: `${fillWidth}%`, background: fillColor }}
        />
      </div>
    </div>
  )
}
// One load-average reading (e.g. "1 min"). The number is shown with two
// decimals, or "—" when missing, and is coloured destructive when it exceeds
// the core count. The "/ N cores" footnote only appears for a truthy `cores`.
function LoadAvgCell({
  label,
  value,
  cores,
}: {
  label: string
  value: number | null
  cores: number | null
}) {
  let saturated = false
  if (value != null && cores != null) {
    saturated = value > cores
  }
  const readingColor = saturated ? "var(--destructive)" : "var(--foreground)"
  const readingText = value != null ? value.toFixed(2) : "—"
  const coresNote = cores ? (
    <span className="text-[11px] text-muted-foreground">/ {cores} cores</span>
  ) : null

  return (
    <div className="flex flex-col items-start gap-0.5 rounded-md border bg-card p-3">
      <span className="text-xs text-muted-foreground">{label}</span>
      <span
        className="font-mono text-2xl font-semibold tabular-nums"
        style={{ color: readingColor }}
      >
        {readingText}
      </span>
      {coresNote}
    </div>
  )
}
// Bytes of memory in use, computed as total minus headroom and floored at 0.
// Returns null when the total, or both headroom figures, are missing.
function memoryUsedBytes(m: HostStats["memory"]): number | null {
  const { total_bytes: total, available_bytes, free_bytes } = m
  // "available" wins over "free": on Linux, free leaves out reclaimable
  // buffer/cache memory and so overstates pressure.
  const headroom = available_bytes ?? free_bytes
  if (total == null || headroom == null) return null
  return Math.max(0, total - headroom)
}
// Memory in use as a percentage of total. Returns 0 when usage cannot be
// computed or the total is missing or zero.
function memoryUsedPct(m: HostStats["memory"]): number {
  const total = m.total_bytes
  const used = memoryUsedBytes(m)
  const computable = used != null && total != null && total !== 0
  return computable ? (used / total) * 100 : 0
}
// "used / total" label for the memory KPI tile, formatted with formatBytes.
// Returns "—" when either figure is unavailable.
function memoryUsedLabel(m: HostStats["memory"]): string {
  const total = m.total_bytes
  const used = memoryUsedBytes(m)
  return used == null || total == null ? "—" : `${formatBytes(used)} / ${formatBytes(total)}`
}
// Highest used_pct across all disks; 0 for an empty list.
function busiestDiskPct(disks: HostStats["disks"]): number {
  let peak = 0
  for (const disk of disks) {
    peak = Math.max(peak, disk.used_pct)
  }
  return peak
}
// "<pct>% (<mount>)" label for the disk with the highest used_pct, or "—"
// when no disks are reported. On a tie the earliest disk in the list wins.
function busiestDiskLabel(disks: HostStats["disks"]): string {
  let top: HostStats["disks"][number] | undefined
  for (const disk of disks) {
    // Strict ">" keeps the first disk seen when percentages are equal.
    if (top === undefined || disk.used_pct > top.used_pct) {
      top = disk
    }
  }
  if (top === undefined) return "—"
  return `${top.used_pct}% (${top.mount})`
}
// Fill colour for a usage bar: destructive from 90% up, amber (#f59e0b)
// from 75% up, and the primary colour below that.
function barColor(pct: number): string {
  // Bands are listed from the highest threshold down; the first match wins.
  const bands: ReadonlyArray<readonly [number, string]> = [
    [90, "var(--destructive)"],
    [75, "#f59e0b"],
  ]
  for (const [threshold, color] of bands) {
    if (pct >= threshold) return color
  }
  return "var(--primary)"
}
// --- Jobs panel --------------------------------------------------------