From 29030c9e72ba282dc3a0fa4f24bfc991e2617870 Mon Sep 17 00:00:00 2001
From: jules <cloudtech@juleslive.net>
Date: Sat, 2 May 2026 17:05:22 +1000
Subject: [PATCH] Wire health probes, host stats, and LLM proxy round-trip

Three things from the latest arcadia-app pull:

- health.ts: client for /api/v1/health{,/:service,/detailed,/host}.
  monitoring.tsx now reads real per-subsystem probe state instead of
  synthesizing it from indirect signals (rate limits, sessions, jobs).
- New Host tab on Monitoring with KPI tiles + per-core CPU bars,
  load-avg cards, memory + swap usage, and per-mount disk bars,
  backed by /api/v1/health/host.
- llm-proxy.ts: typed errors (secret_disabled, ip_not_allowed, etc.)
  and a probeProxy() that round-trips a 1-token chat. settings.tsx's
  "Test connection" in proxy mode now exercises the real endpoint
  instead of just confirming the adapter built. Contract doc flipped
  from "not yet implemented" to "implemented".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/lib/arcadia/health.ts    |  94 ++++++++
 app/lib/arcadia/llm-proxy.ts | 182 ++++++++++++++++
 app/routes/monitoring.tsx    | 412 +++++++++++++++++++++++++++++++----
 app/routes/settings.tsx      |  17 +-
 docs/LLM_PROXY_CONTRACT.md   |   2 +-
 5 files changed, 661 insertions(+), 46 deletions(-)
 create mode 100644 app/lib/arcadia/health.ts
 create mode 100644 app/lib/arcadia/llm-proxy.ts
diff --git a/app/lib/arcadia/health.ts b/app/lib/arcadia/health.ts
new file mode 100644
index 0000000..307b6de
--- /dev/null
+++ b/app/lib/arcadia/health.ts
@@ -0,0 +1,94 @@
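+// Arcadia health probes. Backed by /api/v1/health* (public — no auth).
+// Each subsystem is probed independently; the overall endpoint aggregates
+// and returns 503 if any subsystem is not "ok". See arcadia-app commit f427892.
+// NOTE(review): if ArcadiaClient.GET rejects on non-2xx, getHealth() throws
+// exactly when a subsystem is unhealthy — confirm the 503 body reaches callers.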
+
+import type { ArcadiaClient } from "@crema/arcadia-client"
+
+export type HealthSubsystem = "api" | "db" | "workers" | "storage"
+
+export type HealthStatus = "ok" | "degraded" | "error" | "unconfigured"
+
+export interface SubsystemHealth {
+  status: HealthStatus
+  /** Optional human-readable detail. */
+  message?: string
+  /** Free-form metrics — shape is subsystem-specific. */
+  details?: Record<string, unknown>
+}
+
+export interface OverallHealth {
+  status: HealthStatus
+  checked_at: string
+  subsystems: Record<HealthSubsystem, SubsystemHealth>
+}
+
+export interface DetailedHealth extends OverallHealth {
+  /** BEAM info — present on /health/detailed only. */
+  system?: {
+    otp_release?: string
+    elixir_version?: string
+    process_count?: number
+    memory_total_bytes?: number
+    [k: string]: unknown
+  }
+}
+
+export interface HostStats {
+  cpu: {
+    util_pct: number | null
+    per_cpu_pct: number[]
+    load_avg_1: number | null
+    load_avg_5: number | null
+    load_avg_15: number | null
+    schedulers_online: number
+    num_cpus: number | null
+  }
+  memory: {
+    total_bytes: number | null
+    free_bytes: number | null
+    available_bytes: number | null
+    buffered_bytes: number | null
+    cached_bytes: number | null
+    swap_total_bytes: number | null
+    swap_free_bytes: number | null
+  }
+  disks: Array<{ mount: string; total_kb: number; used_pct: number }>
+  checked_at: string
+}
+
+const BASE = "/api/v1/health"
+
+export async function getHealth(arcadia: ArcadiaClient): Promise<OverallHealth> {
+  const res = await arcadia.GET<{ data: OverallHealth } | OverallHealth>(BASE)
+  return unwrap(res)
+}
+
+export async function getServiceHealth(
+  arcadia: ArcadiaClient,
+  service: HealthSubsystem,
+): Promise<SubsystemHealth> {
+  const res = await arcadia.GET<{ data: SubsystemHealth } | SubsystemHealth>(
+    `${BASE}/${service}`,
+  )
+  return unwrap(res)
+}
+
+export async function getHealthDetailed(arcadia: ArcadiaClient): Promise<DetailedHealth> {
+  const res = await arcadia.GET<{ data: DetailedHealth } | DetailedHealth>(`${BASE}/detailed`)
+  return unwrap(res)
+}
+
+export async function getHostStats(arcadia: ArcadiaClient): Promise<HostStats> {
+  const res = await arcadia.GET<{ data: HostStats } | HostStats>(`${BASE}/host`)
+  return unwrap(res)
+}
+
+export const SUBSYSTEMS: HealthSubsystem[] = ["api", "db", "workers", "storage"]
+
+function unwrap<T>(res: { data: T } | T): T {
+  return res && typeof res === "object" && "data" in (res as object)
+    ? (res as { data: T }).data
+    : (res as T)
+}
diff --git a/app/lib/arcadia/llm-proxy.ts b/app/lib/arcadia/llm-proxy.ts
new file mode 100644
index 0000000..f202c2f
--- /dev/null
+++ b/app/lib/arcadia/llm-proxy.ts
@@ -0,0 +1,182 @@
+// Arcadia LLM proxy client.
+//
+// Implements the spec in docs/LLM_PROXY_CONTRACT.md against arcadia-app's
+// POST /api/v1/ai/llm/chat. The lib (@crema/llm-providers-ui buildAdapter)
+// owns the streaming chat path itself; this module exposes a lightweight
+// non-streaming probe so the Settings "Test connection" button can verify
+// the proxy round-trips end-to-end (auth → secret resolution → upstream
+// dispatch → response shape).
+
+import type { ArcadiaClient } from "@crema/arcadia-client"
+
+export type LLMProxyProvider =
+  | "openai"
+  | "anthropic"
+  | "deepseek"
+  | "qwen"
+  | "lmstudio"
+
+export type LLMProxyErrorCode =
+  | "unauthorized"
+  | "secret_disabled"
+  | "secret_expired"
+  | "secret_consumed"
+  | "ip_not_allowed"
+  | "unknown_provider"
+  | "upstream_unavailable"
+  | "rate_limited"
+  | "unknown"
+
+export interface LLMProxyChatRequest {
+  provider: LLMProxyProvider
+  /** Required for every provider except `lmstudio`. */
+  secret_name?: string
+  model: string
+  messages: Array<{ role: "system" | "user" | "assistant"; content: string }>
+  stream?: boolean
+  max_tokens?: number
+  temperature?: number
+}
+
+export interface LLMProxyChatResponse {
+  id: string
+  object: "chat.completion"
+  created: number
+  model: string
+  choices: Array<{
+    index: number
+    finish_reason: string | null
+    message: { role: "assistant"; content: string; tool_calls: unknown }
+  }>
+  usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number }
+}
+
+export class LLMProxyError extends Error {
+  readonly code: LLMProxyErrorCode
+  readonly status: number
+  readonly retryAfter?: number
+
+  constructor(code: LLMProxyErrorCode, message: string, status: number, retryAfter?: number) {
+    super(message)
+    this.name = "LLMProxyError"
+    this.code = code
+    this.status = status
+    this.retryAfter = retryAfter
+  }
+}
+
+/**
+ * Non-streaming chat completion via the proxy. The streaming path is owned
+ * by @crema/llm-providers-ui's buildAdapter; use this for probes and
+ * one-shot calls where SSE is overkill.
+ */
+export async function chat(
+  arcadia: ArcadiaClient,
+  req: LLMProxyChatRequest,
+): Promise<LLMProxyChatResponse> {
+  try {
+    const res = await arcadia.POST<LLMProxyChatResponse>(
+      "/api/v1/ai/llm/chat",
+      { body: { ...req, stream: false } },
+    )
+    return res
+  } catch (e) {
+    throw asProxyError(e)
+  }
+}
+
+/**
+ * Cheap end-to-end probe for the Settings "Test connection" flow in proxy
+ * mode. Sends a 1-token "ping" and reports whether the proxy is wired,
+ * the secret resolves, and the upstream answered. Never throws: every
+ * failure, upstream rejections included, is reported as `ok: false`.
+ */
+export async function probeProxy(
+  arcadia: ArcadiaClient,
+  opts: { provider: LLMProxyProvider; model: string; secretName?: string },
+): Promise<{ ok: boolean; message: string }> {
+  try {
+    const res = await chat(arcadia, {
+      provider: opts.provider,
+      secret_name: opts.secretName,
+      model: opts.model,
+      messages: [{ role: "user", content: "ping" }],
+      max_tokens: 1,
+      stream: false,
+    })
+    const used = res.usage?.total_tokens
+    return {
+      ok: true,
+      message: `Proxy OK — ${res.model}${used != null ? ` · ${used} tokens` : ""}.`,
+    }
+  } catch (e) {
+    if (e instanceof LLMProxyError) {
+      return { ok: false, message: friendly(e) }
+    }
+    return { ok: false, message: e instanceof Error ? e.message : String(e) }
+  }
+}
+
+function asProxyError(e: unknown): LLMProxyError {
+  // ArcadiaClient throws ArcadiaError with a wrapped { error: { code, message } }
+  // body and HTTP status. Best-effort destructure; the class lives in a sibling lib.
+  // The code is cast, not validated — unknown values hit friendly()'s default branch.
+  if (e && typeof e === "object") {
+    const anyE = e as {
+      status?: number
+      code?: string
+      message?: string
+      body?: { error?: { code?: string; message?: string } }
+      headers?: Headers | Record<string, string>
+    }
+    const status = anyE.status ?? 0
+    const code = (anyE.body?.error?.code ?? anyE.code) as LLMProxyErrorCode | undefined
+    const message = anyE.body?.error?.message ?? anyE.message ?? "Proxy request failed."
+    const retryAfter = readRetryAfter(anyE.headers)
+    return new LLMProxyError(code ?? inferCodeFromStatus(status), message, status, retryAfter)
+  }
+  return new LLMProxyError("unknown", String(e), 0)
+}
+
+function inferCodeFromStatus(status: number): LLMProxyErrorCode {
+  if (status === 401) return "unauthorized"
+  if (status === 403) return "ip_not_allowed"
+  if (status === 404) return "unknown_provider"
+  if (status === 410) return "secret_expired"
+  if (status === 429) return "rate_limited"
+  if (status === 502 || status === 503 || status === 504) return "upstream_unavailable"
+  return "unknown"
+}
+
+function readRetryAfter(h: Headers | Record<string, string> | undefined): number | undefined {
+  if (!h) return undefined
+  const raw = h instanceof Headers ? h.get("retry-after") : h["retry-after"] ?? h["Retry-After"]
+  if (!raw) return undefined
+  const n = Number(raw)
+  return Number.isFinite(n) ? n : undefined
+}
+
+export function friendly(err: LLMProxyError): string {
+  switch (err.code) {
+    case "unauthorized":
+      return "Sign in expired — refresh and try again."
+    case "secret_disabled":
+      return "The vault secret is disabled. Re-enable it under /secrets."
+    case "secret_expired":
+      return "The vault secret has expired. Rotate it under /secrets."
+    case "secret_consumed":
+      return "Read-once secret already used. Rotate it under /secrets."
+    case "ip_not_allowed":
+      return "This client's IP is blocked by the secret's allowlist."
+    case "unknown_provider":
+      return "The proxy doesn't recognise this provider. Check the provider id."
+    case "upstream_unavailable":
+      return "The upstream LLM provider returned an error or timed out."
+    case "rate_limited":
+      return err.retryAfter
+        ? `Rate limited. Retry in ${err.retryAfter}s.`
+        : "Rate limited — slow down and try again."
+    default:
+      return err.message
+  }
+}
diff --git a/app/routes/monitoring.tsx b/app/routes/monitoring.tsx
index 1457905..7465080 100644
--- a/app/routes/monitoring.tsx
+++ b/app/routes/monitoring.tsx
@@ -71,6 +71,15 @@ import {
   type RateLimit,
   type Space,
 } from "~/lib/arcadia/monitoring"
+import {
+  getHealth,
+  getHostStats,
+  SUBSYSTEMS,
+  type HealthSubsystem,
+  type HostStats,
+  type OverallHealth,
+  type SubsystemHealth,
+} from "~/lib/arcadia/health"
 import { pageTitle } from "~/lib/page-meta"
 import { useSession } from "~/lib/session"
 import { useRegisterAdminContext } from "~/lib/admin-context"
@@ -86,6 +95,8 @@ interface DashboardData {
   spaces: Space[]
   droplets: Droplet[]
   auditStats: AuditStats | null
+  health: OverallHealth | null
+  host: HostStats | null
 }
 
 const EMPTY: DashboardData = {
@@ -97,6 +108,8 @@ const EMPTY: DashboardData = {
   spaces: [],
   droplets: [],
   auditStats: null,
+  health: null,
+  host: null,
 }
 
 export default function MonitoringRoute() {
@@ -121,6 +134,8 @@ export default function MonitoringRoute() {
         spaces,
         droplets,
         auditStats,
+        health,
+        host,
       ] = await Promise.all([
         getJobStats(arcadia).catch(() => null),
         getRecentJobs(arcadia, { limit: 50 }).catch(() => []),
@@ -132,6 +147,8 @@ export default function MonitoringRoute() {
         getAuditStats(arcadia, {
           from: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString(),
         }).catch(() => null),
+        getHealth(arcadia).catch(() => null),
+        getHostStats(arcadia).catch(() => null),
       ])
       setData({
         jobStats,
@@ -142,6 +159,8 @@ export default function MonitoringRoute() {
         spaces,
         droplets,
         auditStats,
+        health,
+        host,
       })
     } catch (err) {
       setError(err instanceof ArcadiaError ? err.message : "Failed to load monitoring data.")
@@ -241,7 +260,13 @@ export default function MonitoringRoute() {
           <CardHeader className="flex flex-row items-center justify-between">
             <div>
               <CardTitle>Service health</CardTitle>
-              <CardDescription>Derived from live signals on each subsystem.</CardDescription>
+              <CardDescription>
+                {data.health
+                  ? `Live probes from /api/v1/health · checked ${new Date(
+                      data.health.checked_at,
+                    ).toLocaleTimeString()}`
+                  : "Live probes from /api/v1/health (unavailable — backend may be down or older than the per-subsystem probe rollout)."}
+              </CardDescription>
             </div>
             <OverallStatus components={components} />
           </CardHeader>
@@ -282,8 +307,72 @@ export default function MonitoringRoute() {
           />
         </div>
 
-        <Tabs defaultValue="jobs">
+        {data.host ? (
+          <div className="grid grid-cols-2 gap-3 md:grid-cols-4">
+            <KpiTile
+              label="CPU usage"
+              value={
+                data.host.cpu.util_pct != null
+                  ? formatPercent(data.host.cpu.util_pct / 100)
+                  : "—"
+              }
+              icon={<Cpu className="size-4" />}
+              tone={
+                (data.host.cpu.util_pct ?? 0) > 90
+                  ? "negative"
+                  : (data.host.cpu.util_pct ?? 0) > 70
+                    ? "warning"
+                    : "neutral"
+              }
+            />
+            <KpiTile
+              label="Load avg (1m)"
+              value={
+                data.host.cpu.load_avg_1 != null
+                  ? data.host.cpu.load_avg_1.toFixed(2)
+                  : "—"
+              }
+              icon={<Activity className="size-4" />}
+              tone={
+                data.host.cpu.load_avg_1 != null &&
+                data.host.cpu.num_cpus &&
+                data.host.cpu.load_avg_1 > data.host.cpu.num_cpus
+                  ? "warning"
+                  : "neutral"
+              }
+            />
+            <KpiTile
+              label="Memory used"
+              value={memoryUsedLabel(data.host.memory)}
+              icon={<HardDrive className="size-4" />}
+              tone={
+                memoryUsedPct(data.host.memory) > 90
+                  ? "negative"
+                  : memoryUsedPct(data.host.memory) > 75
+                    ? "warning"
+                    : "neutral"
+              }
+            />
+            <KpiTile
+              label="Disk (busiest mount)"
+              value={busiestDiskLabel(data.host.disks)}
+              icon={<Database className="size-4" />}
+              tone={
+                busiestDiskPct(data.host.disks) > 90
+                  ? "negative"
+                  : busiestDiskPct(data.host.disks) > 75
+                    ? "warning"
+                    : "neutral"
+              }
+            />
+          </div>
+        ) : null}
+
+        <Tabs defaultValue="host">
           <TabsList>
+            <TabsTrigger value="host" data-action="monitoring-tab-host">
+              Host
+            </TabsTrigger>
             <TabsTrigger value="jobs" data-action="monitoring-tab-jobs">
               Background jobs
             </TabsTrigger>
@@ -301,6 +390,10 @@ export default function MonitoringRoute() {
             </TabsTrigger>
           </TabsList>
 
+          <TabsContent value="host" className="pt-4">
+            <HostPanel host={data.host} />
+          </TabsContent>
+
           <TabsContent value="jobs" className="pt-4">
             <JobsPanel
               stats={data.jobStats}
@@ -343,47 +436,292 @@ export default function MonitoringRoute() {
   )
 }
 
-// Synthesize a status board from the live signals we have.
+// Map arcadia /health probe results onto the status-ui component model.
+// "ok"/"unconfigured" → operational (storage with no configured backend is ok),
+// "degraded" → degraded, "error" → major-outage, missing probe → partial-outage.
 function buildStatusComponents(d: DashboardData): StatusComponent[] {
-  const apiOk = d.rateLimits.length > 0
-  const dbOk = d.sessions !== null
-  const workersState: ComponentState = (() => {
-    if (!d.jobStats) return "partial-outage"
-    const r = d.jobStats.counts.retryable ?? 0
-    const x = d.jobStats.counts.discarded ?? 0
-    if (x > 100) return "major-outage"
-    if (r > 50 || x > 0) return "degraded"
-    return "operational"
-  })()
-  const storageState: ComponentState =
-    d.spaces.length > 0 || d.infraSummary ? "operational" : "partial-outage"
-
-  return [
-    {
-      id: "api",
-      name: "API",
-      description: "/api/v1 — auth, REST endpoints",
-      state: apiOk ? "operational" : "partial-outage",
-    },
-    {
-      id: "db",
-      name: "Database",
-      description: "Postgres — sessions, audit log",
-      state: dbOk ? "operational" : "partial-outage",
-    },
-    {
-      id: "workers",
+  const subsystems = d.health?.subsystems
+  const meta: Record<HealthSubsystem, { name: string; description: string }> = {
+    api: { name: "API", description: "/api/v1 — auth, REST endpoints" },
+    db: { name: "Database", description: "Postgres — sessions, audit log" },
+    workers: {
       name: "Background workers",
       description: "Oban — webhook delivery, scheduled tasks",
-      state: workersState,
     },
-    {
-      id: "storage",
+    storage: {
       name: "Storage",
-      description: "DigitalOcean Spaces / S3-compatible object storage",
-      state: storageState,
+      description: "S3-compatible object storage (per platform default)",
     },
-  ]
+  }
+
+  return SUBSYSTEMS.map((id) => {
+    const probe = subsystems?.[id]
+    return {
+      id,
+      name: meta[id].name,
+      description: probe?.message ?? meta[id].description,
+      state: probe ? mapHealthState(probe) : "partial-outage",
+    } satisfies StatusComponent
+  })
+}
+
+function mapHealthState(probe: SubsystemHealth): ComponentState {
+  switch (probe.status) {
+    case "ok":
+    case "unconfigured":
+      return "operational"
+    case "degraded":
+      return "degraded"
+    case "error":
+      return "major-outage"
+    default:
+      return "partial-outage"
+  }
+}
+
+// --- Host panel --------------------------------------------------------
+
+function HostPanel({ host }: { host: HostStats | null }) {
+  if (!host) {
+    return (
+      <PanelStub
+        icon={<Cpu className="size-5" />}
+        text="Host stats unavailable. The /api/v1/health/host endpoint may not be deployed yet, or os_mon daemons aren't reachable."
+      />
+    )
+  }
+
+  const memUsed = memoryUsedBytes(host.memory)
+  const memTotal = host.memory.total_bytes ?? null
+  const memPct = memoryUsedPct(host.memory)
+  const swapTotal = host.memory.swap_total_bytes ?? null
+  const swapUsed =
+    swapTotal != null && host.memory.swap_free_bytes != null
+      ? swapTotal - host.memory.swap_free_bytes
+      : null
+
+  return (
+    <div className="flex flex-col gap-4">
+      {/* CPU + load */}
+      <div className="grid grid-cols-1 gap-3 lg:grid-cols-2">
+        <Card>
+          <CardHeader>
+            <CardTitle className="text-base">CPU</CardTitle>
+            <CardDescription>
+              {host.cpu.num_cpus
+                ? `${host.cpu.num_cpus} cores · ${host.cpu.schedulers_online} BEAM schedulers online`
+                : `${host.cpu.schedulers_online} BEAM schedulers online`}
+            </CardDescription>
+          </CardHeader>
+          <CardContent className="flex flex-col gap-3">
+            <UsageBar
+              label="Overall utilisation"
+              pct={host.cpu.util_pct ?? null}
+              valueText={
+                host.cpu.util_pct != null ? `${host.cpu.util_pct.toFixed(1)}%` : "—"
+              }
+            />
+            {host.cpu.per_cpu_pct.length > 0 ? (
+              <div className="flex flex-col gap-1">
+                <span className="text-xs text-muted-foreground">Per core</span>
+                <div className="flex flex-wrap gap-1.5">
+                  {host.cpu.per_cpu_pct.map((p, i) => (
+                    <div
+                      key={i}
+                      className="flex h-6 w-12 items-center justify-center rounded text-[11px] font-mono"
+                      style={{
+                        background: `linear-gradient(to right, var(--primary) ${p}%, var(--muted) ${p}%)`,
+                        color: p > 50 ? "var(--primary-foreground)" : "var(--foreground)",
+                      }}
+                      title={`Core ${i}: ${p.toFixed(1)}%`}
+                    >
+                      {p.toFixed(0)}
+                    </div>
+                  ))}
+                </div>
+              </div>
+            ) : null}
+          </CardContent>
+        </Card>
+
+        <Card>
+          <CardHeader>
+            <CardTitle className="text-base">Load average</CardTitle>
+            <CardDescription>
+              Unix-style load average. A value above the core count means the
+              run-queue is saturated.
+            </CardDescription>
+          </CardHeader>
+          <CardContent>
+            <div className="grid grid-cols-3 gap-3">
+              <LoadAvgCell label="1 min" value={host.cpu.load_avg_1} cores={host.cpu.num_cpus} />
+              <LoadAvgCell label="5 min" value={host.cpu.load_avg_5} cores={host.cpu.num_cpus} />
+              <LoadAvgCell label="15 min" value={host.cpu.load_avg_15} cores={host.cpu.num_cpus} />
+            </div>
+          </CardContent>
+        </Card>
+      </div>
+
+      {/* Memory */}
+      <Card>
+        <CardHeader>
+          <CardTitle className="text-base">Memory</CardTitle>
+          <CardDescription>
+            {memTotal != null ? `${formatBytes(memTotal)} total` : "Total memory unknown"}
+            {host.memory.available_bytes != null
+              ? ` · ${formatBytes(host.memory.available_bytes)} available`
+              : ""}
+          </CardDescription>
+        </CardHeader>
+        <CardContent className="flex flex-col gap-3">
+          <UsageBar
+            label="Used"
+            pct={memPct}
+            valueText={
+              memUsed != null && memTotal != null
+                ? `${formatBytes(memUsed)} / ${formatBytes(memTotal)} (${memPct.toFixed(1)}%)`
+                : "—"
+            }
+          />
+          {(host.memory.buffered_bytes != null || host.memory.cached_bytes != null) && (
+            <div className="grid grid-cols-2 gap-2 text-xs text-muted-foreground">
+              {host.memory.buffered_bytes != null && (
+                <span>Buffered: {formatBytes(host.memory.buffered_bytes)}</span>
+              )}
+              {host.memory.cached_bytes != null && (
+                <span>Cached: {formatBytes(host.memory.cached_bytes)}</span>
+              )}
+            </div>
+          )}
+          {swapTotal != null && swapTotal > 0 ? (
+            <UsageBar
+              label="Swap"
+              pct={swapUsed != null ? (swapUsed / swapTotal) * 100 : null}
+              valueText={
+                swapUsed != null
+                  ? `${formatBytes(swapUsed)} / ${formatBytes(swapTotal)}`
+                  : "—"
+              }
+            />
+          ) : null}
+        </CardContent>
+      </Card>
+
+      {/* Disks */}
+      <Card>
+        <CardHeader>
+          <CardTitle className="text-base">Disks</CardTitle>
+          <CardDescription>One row per mount point.</CardDescription>
+        </CardHeader>
+        <CardContent>
+          {host.disks.length === 0 ? (
+            <p className="py-4 text-sm text-muted-foreground">No disks reported.</p>
+          ) : (
+            <div className="flex flex-col gap-2">
+              {host.disks.map((d) => (
+                <UsageBar
+                  key={d.mount}
+                  label={d.mount}
+                  pct={d.used_pct}
+                  valueText={`${d.used_pct}% of ${formatBytes(d.total_kb * 1024)}`}
+                />
+              ))}
+            </div>
+          )}
+        </CardContent>
+      </Card>
+    </div>
+  )
+}
+
+function UsageBar({
+  label,
+  pct,
+  valueText,
+}: {
+  label: string
+  pct: number | null
+  valueText: string
+}) {
+  const clamped = pct == null ? 0 : Math.max(0, Math.min(100, pct))
+  const tone = pct == null ? "var(--muted-foreground)" : barColor(pct)
+  return (
+    <div className="flex flex-col gap-1">
+      <div className="flex items-baseline justify-between gap-2 text-xs">
+        <span className="font-medium">{label}</span>
+        <span className="font-mono text-muted-foreground">{valueText}</span>
+      </div>
+      <div className="h-2 w-full overflow-hidden rounded-full bg-muted">
+        <div
+          className="h-full rounded-full transition-[width] duration-500"
+          style={{ width: `${clamped}%`, background: tone }}
+        />
+      </div>
+    </div>
+  )
+}
+
+function LoadAvgCell({
+  label,
+  value,
+  cores,
+}: {
+  label: string
+  value: number | null
+  cores: number | null
+}) {
+  const saturated = value != null && cores != null && value > cores
+  return (
+    <div className="flex flex-col items-start gap-0.5 rounded-md border bg-card p-3">
+      <span className="text-xs text-muted-foreground">{label}</span>
+      <span
+        className="font-mono text-2xl font-semibold tabular-nums"
+        style={{ color: saturated ? "var(--destructive)" : "var(--foreground)" }}
+      >
+        {value != null ? value.toFixed(2) : "—"}
+      </span>
+      {cores ? (
+        <span className="text-[11px] text-muted-foreground">/ {cores} cores</span>
+      ) : null}
+    </div>
+  )
+}
+
+function memoryUsedBytes(m: HostStats["memory"]): number | null {
+  if (m.total_bytes == null) return null
+  // Prefer "available" over "free" — on Linux, free excludes reclaimable
+  // buffer/cache memory and overstates pressure.
+  const available = m.available_bytes ?? m.free_bytes
+  if (available == null) return null
+  return Math.max(0, m.total_bytes - available)
+}
+
+function memoryUsedPct(m: HostStats["memory"]): number {
+  const used = memoryUsedBytes(m)
+  if (used == null || m.total_bytes == null || m.total_bytes === 0) return 0
+  return (used / m.total_bytes) * 100
+}
+
+function memoryUsedLabel(m: HostStats["memory"]): string {
+  const used = memoryUsedBytes(m)
+  if (used == null || m.total_bytes == null) return "—"
+  return `${formatBytes(used)} / ${formatBytes(m.total_bytes)}`
+}
+
+function busiestDiskPct(disks: HostStats["disks"]): number {
+  return disks.reduce((m, d) => Math.max(m, d.used_pct), 0)
+}
+
+function busiestDiskLabel(disks: HostStats["disks"]): string {
+  if (disks.length === 0) return "—"
+  const busiest = disks.reduce((a, b) => (b.used_pct > a.used_pct ? b : a))
+  return `${busiest.used_pct}% (${busiest.mount})`
+}
+
+function barColor(pct: number): string {
+  if (pct >= 90) return "var(--destructive)"
+  if (pct >= 75) return "#f59e0b"
+  return "var(--primary)"
 }
 
 // --- Jobs panel --------------------------------------------------------
diff --git a/app/routes/settings.tsx b/app/routes/settings.tsx
index 256997b..bd9b8f8 100644
--- a/app/routes/settings.tsx
+++ b/app/routes/settings.tsx
@@ -18,6 +18,7 @@ import {
 } from "@crema/llm-providers-ui"
 import { useArcadiaClient } from "@crema/arcadia-client"
 
+import { probeProxy, type LLMProxyProvider } from "~/lib/arcadia/llm-proxy"
 import { AppShell } from "~/components/layout/app-shell"
 import { Button } from "~/components/ui/button"
 import {
@@ -98,15 +99,15 @@ export default function SettingsRoute() {
         arcadiaTenantId,
       })
 
-      // In proxy mode the adapter just being built is the strongest signal we
-      // can get without actually firing a chat request — the proxy endpoint
-      // doesn't exist on the backend yet, so any /models probe would 404.
+      // Proxy mode: round-trip a 1-token chat to verify auth → secret
+      // resolution → upstream dispatch; contract error codes map to user-facing text.
+      // NOTE(review): the gpt-4o-mini fallback is presumably wrong for deepseek/qwen/lmstudio.
       if (s.mode === "proxy") {
-        return {
-          ok: true,
-          message:
-            "Adapter built. Note: the backend proxy (/api/v1/ai/llm/chat) isn't deployed yet — see docs/LLM_PROXY_CONTRACT.md.",
-        }
+        return probeProxy(arcadia, {
+          provider: s.providerId as LLMProxyProvider,
+          model: s.model || (s.providerId === "anthropic" ? "claude-opus-4-7" : "gpt-4o-mini"),
+          secretName: s.secretName || undefined,
+        })
       }
 
       // Direct mode — for OpenAI-compatible endpoints, /models is a cheap probe.
diff --git a/docs/LLM_PROXY_CONTRACT.md b/docs/LLM_PROXY_CONTRACT.md
index 5a0c9d0..6014eb1 100644
--- a/docs/LLM_PROXY_CONTRACT.md
+++ b/docs/LLM_PROXY_CONTRACT.md
@@ -1,6 +1,6 @@
 # LLM Proxy Contract
 
-> **Status: not yet implemented on the backend.** This document is the contract that `lib-llm-providers-ui` expects from arcadia. Implement `POST /api/v1/ai/llm/chat` server-side to make `mode: "proxy"` work in the client.
+> **Status: implemented.** Backend lives in `arcadia-app` at `apps/arcadia_core/lib/arcadia/ai/llm_proxy*` (see commit `75669f1`). This document remains the contract that `lib-llm-providers-ui` and `app/lib/arcadia/llm-proxy.ts` expect from arcadia — keep it in sync if either side changes.
 
 ## Why a proxy?