From 29030c9e72ba282dc3a0fa4f24bfc991e2617870 Mon Sep 17 00:00:00 2001 From: jules Date: Sat, 2 May 2026 17:05:22 +1000 Subject: [PATCH] Wire health probes, host stats, and LLM proxy round-trip Three things from the latest arcadia-app pull: - health.ts: client for /api/v1/health{,/:service,/detailed,/host}. monitoring.tsx now reads real per-subsystem probe state instead of synthesizing it from indirect signals (rate limits, sessions, jobs). - New Host tab on Monitoring with KPI tiles + per-core CPU bars, load-avg cards, memory + swap usage, and per-mount disk bars, backed by /api/v1/health/host. - llm-proxy.ts: typed errors (secret_disabled, ip_not_allowed, etc.) and a probeProxy() that round-trips a 1-token chat. settings.tsx's "Test connection" in proxy mode now exercises the real endpoint instead of just confirming the adapter built. Contract doc flipped from "not yet implemented" to "implemented". Co-Authored-By: Claude Opus 4.7 (1M context) --- app/lib/arcadia/health.ts | 94 ++++++++ app/lib/arcadia/llm-proxy.ts | 182 ++++++++++++++++ app/routes/monitoring.tsx | 412 +++++++++++++++++++++++++++++++---- app/routes/settings.tsx | 17 +- docs/LLM_PROXY_CONTRACT.md | 2 +- 5 files changed, 661 insertions(+), 46 deletions(-) create mode 100644 app/lib/arcadia/health.ts create mode 100644 app/lib/arcadia/llm-proxy.ts diff --git a/app/lib/arcadia/health.ts b/app/lib/arcadia/health.ts new file mode 100644 index 0000000..307b6de --- /dev/null +++ b/app/lib/arcadia/health.ts @@ -0,0 +1,94 @@ +// Arcadia health probes. +// +// Backed by /api/v1/health* (public — no auth). Each subsystem is probed +// independently; the overall endpoint aggregates and returns 503 if any +// subsystem is not "ok". See arcadia-app commit f427892. + +import type { ArcadiaClient } from "@crema/arcadia-client" + +export type HealthSubsystem = "api" | "db" | "workers" | "storage" + +export type HealthStatus = "ok" | "degraded" | "error" | "unconfigured" + +export interface SubsystemHealth { + status: HealthStatus + /** Optional human-readable detail. */ + message?: string + /** Free-form metrics — shape is subsystem-specific. */ + details?: Record +} + +export interface OverallHealth { + status: HealthStatus + checked_at: string + subsystems: Record +} + +export interface DetailedHealth extends OverallHealth { + /** BEAM info — present on /health/detailed only. */ + system?: { + otp_release?: string + elixir_version?: string + process_count?: number + memory_total_bytes?: number + [k: string]: unknown + } +} + +export interface HostStats { + cpu: { + util_pct: number | null + per_cpu_pct: number[] + load_avg_1: number | null + load_avg_5: number | null + load_avg_15: number | null + schedulers_online: number + num_cpus: number | null + } + memory: { + total_bytes: number | null + free_bytes: number | null + available_bytes: number | null + buffered_bytes: number | null + cached_bytes: number | null + swap_total_bytes: number | null + swap_free_bytes: number | null + } + disks: Array<{ mount: string; total_kb: number; used_pct: number }> + checked_at: string +} + +const BASE = "/api/v1/health" + +export async function getHealth(arcadia: ArcadiaClient): Promise { + const res = await arcadia.GET<{ data: OverallHealth } | OverallHealth>(BASE) + return unwrap(res) +} + +export async function getServiceHealth( + arcadia: ArcadiaClient, + service: HealthSubsystem, +): Promise { + const res = await arcadia.GET<{ data: SubsystemHealth } | SubsystemHealth>( + `${BASE}/${service}`, + ) + return unwrap(res) +} + +export async function getHealthDetailed(arcadia: ArcadiaClient): Promise { + const res = await arcadia.GET<{ data: DetailedHealth } | DetailedHealth>(`${BASE}/detailed`) + return unwrap(res) +} + +export async function getHostStats(arcadia: ArcadiaClient): Promise { + const res = await arcadia.GET<{ data: HostStats } | HostStats>(`${BASE}/host`) + return unwrap(res) +} + +export const SUBSYSTEMS: HealthSubsystem[] = ["api", "db", "workers", "storage"] + +function unwrap(res: { data: T } | T): T { + return res && typeof res === "object" && "data" in (res as object) + ? (res as { data: T }).data + : (res as T) +} diff --git a/app/lib/arcadia/llm-proxy.ts b/app/lib/arcadia/llm-proxy.ts new file mode 100644 index 0000000..f202c2f --- /dev/null +++ b/app/lib/arcadia/llm-proxy.ts @@ -0,0 +1,182 @@ +// Arcadia LLM proxy client. +// +// Implements the spec in docs/LLM_PROXY_CONTRACT.md against arcadia-app's +// POST /api/v1/ai/llm/chat. The lib (@crema/llm-providers-ui buildAdapter) +// owns the streaming chat path itself; this module exposes a lightweight +// non-streaming probe so the Settings "Test connection" button can verify +// the proxy round-trips end-to-end (auth → secret resolution → upstream +// dispatch → response shape). + +import type { ArcadiaClient } from "@crema/arcadia-client" + +export type LLMProxyProvider = + | "openai" + | "anthropic" + | "deepseek" + | "qwen" + | "lmstudio" + +export type LLMProxyErrorCode = + | "unauthorized" + | "secret_disabled" + | "secret_expired" + | "secret_consumed" + | "ip_not_allowed" + | "unknown_provider" + | "upstream_unavailable" + | "rate_limited" + | "unknown" + +export interface LLMProxyChatRequest { + provider: LLMProxyProvider + /** Required for every provider except `lmstudio`. */ + secret_name?: string + model: string + messages: Array<{ role: "system" | "user" | "assistant"; content: string }> + stream?: boolean + max_tokens?: number + temperature?: number +} + +export interface LLMProxyChatResponse { + id: string + object: "chat.completion" + created: number + model: string + choices: Array<{ + index: number + finish_reason: string | null + message: { role: "assistant"; content: string; tool_calls: unknown } + }> + usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number } +} + +export class LLMProxyError extends Error { + readonly code: LLMProxyErrorCode + readonly status: number + readonly retryAfter?: number + + constructor(code: LLMProxyErrorCode, message: string, status: number, retryAfter?: number) { + super(message) + this.name = "LLMProxyError" + this.code = code + this.status = status + this.retryAfter = retryAfter + } +} + +/** + * Non-streaming chat completion via the proxy. The streaming path is owned + * by @crema/llm-providers-ui's buildAdapter; use this for probes and + * one-shot calls where SSE is overkill. + */ +export async function chat( + arcadia: ArcadiaClient, + req: LLMProxyChatRequest, +): Promise { + try { + const res = await arcadia.POST( + "/api/v1/ai/llm/chat", + { body: { ...req, stream: false } }, + ) + return res + } catch (e) { + throw asProxyError(e) + } +} + +/** + * Cheap end-to-end probe for the Settings "Test connection" flow in proxy + * mode. Sends a 1-token "ping" and reports whether the proxy is wired, + * the secret resolves, and the upstream answered. Intentionally tolerant + * of token-budget rejections — those still prove the round-trip works. + */ +export async function probeProxy( + arcadia: ArcadiaClient, + opts: { provider: LLMProxyProvider; model: string; secretName?: string }, +): Promise<{ ok: boolean; message: string }> { + try { + const res = await chat(arcadia, { + provider: opts.provider, + secret_name: opts.secretName, + model: opts.model, + messages: [{ role: "user", content: "ping" }], + max_tokens: 1, + stream: false, + }) + const used = res.usage?.total_tokens + return { + ok: true, + message: `Proxy OK — ${res.model}${used != null ? ` · ${used} tokens` : ""}.`, + } + } catch (e) { + if (e instanceof LLMProxyError) { + return { ok: false, message: friendly(e) } + } + return { ok: false, message: e instanceof Error ? e.message : String(e) } + } +} + +function asProxyError(e: unknown): LLMProxyError { + // ArcadiaClient throws ArcadiaError with a wrapped { error: { code, message } } + // body and HTTP status. Best-effort destructure without coupling to the + // class shape (it lives in a sibling lib). + if (e && typeof e === "object") { + const anyE = e as { + status?: number + code?: string + message?: string + body?: { error?: { code?: string; message?: string } } + headers?: Headers | Record + } + const status = anyE.status ?? 0 + const code = (anyE.body?.error?.code ?? anyE.code) as LLMProxyErrorCode | undefined + const message = anyE.body?.error?.message ?? anyE.message ?? "Proxy request failed." + const retryAfter = readRetryAfter(anyE.headers) + return new LLMProxyError(code ?? inferCodeFromStatus(status), message, status, retryAfter) + } + return new LLMProxyError("unknown", String(e), 0) +} + +function inferCodeFromStatus(status: number): LLMProxyErrorCode { + if (status === 401) return "unauthorized" + if (status === 403) return "ip_not_allowed" + if (status === 404) return "unknown_provider" + if (status === 410) return "secret_expired" + if (status === 429) return "rate_limited" + if (status === 502 || status === 503 || status === 504) return "upstream_unavailable" + return "unknown" +} + +function readRetryAfter(h: Headers | Record | undefined): number | undefined { + if (!h) return undefined + const raw = h instanceof Headers ? h.get("retry-after") : h["retry-after"] ?? h["Retry-After"] + if (!raw) return undefined + const n = Number(raw) + return Number.isFinite(n) ? n : undefined +} + +export function friendly(err: LLMProxyError): string { + switch (err.code) { + case "unauthorized": + return "Sign in expired — refresh and try again." + case "secret_disabled": + return "The vault secret is disabled. Re-enable it under /secrets." + case "secret_expired": + return "The vault secret has expired. Rotate it under /secrets." + case "secret_consumed": + return "Read-once secret already used. Rotate it under /secrets." + case "ip_not_allowed": + return "This client's IP is blocked by the secret's allowlist." + case "unknown_provider": + return "The proxy doesn't recognise this provider. Check the provider id." + case "upstream_unavailable": + return "The upstream LLM provider returned an error or timed out." + case "rate_limited": + return err.retryAfter + ? `Rate limited. Retry in ${err.retryAfter}s.` + : "Rate limited — slow down and try again." + default: + return err.message + } +} diff --git a/app/routes/monitoring.tsx b/app/routes/monitoring.tsx index 1457905..7465080 100644 --- a/app/routes/monitoring.tsx +++ b/app/routes/monitoring.tsx @@ -71,6 +71,15 @@ import { type RateLimit, type Space, } from "~/lib/arcadia/monitoring" +import { + getHealth, + getHostStats, + SUBSYSTEMS, + type HealthSubsystem, + type HostStats, + type OverallHealth, + type SubsystemHealth, +} from "~/lib/arcadia/health" import { pageTitle } from "~/lib/page-meta" import { useSession } from "~/lib/session" import { useRegisterAdminContext } from "~/lib/admin-context" @@ -86,6 +95,8 @@ interface DashboardData { spaces: Space[] droplets: Droplet[] auditStats: AuditStats | null + health: OverallHealth | null + host: HostStats | null } const EMPTY: DashboardData = { @@ -97,6 +108,8 @@ const EMPTY: DashboardData = { spaces: [], droplets: [], auditStats: null, + health: null, + host: null, } export default function MonitoringRoute() { @@ -121,6 +134,8 @@ export default function MonitoringRoute() { spaces, droplets, auditStats, + health, + host, ] = await Promise.all([ getJobStats(arcadia).catch(() => null), getRecentJobs(arcadia, { limit: 50 }).catch(() => []), @@ -132,6 +147,8 @@ export default function MonitoringRoute() { getAuditStats(arcadia, { from: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString(), }).catch(() => null), + getHealth(arcadia).catch(() => null), + getHostStats(arcadia).catch(() => null), ]) setData({ jobStats, @@ -142,6 +159,8 @@ export default function MonitoringRoute() { spaces, droplets, auditStats, + health, + host, }) } catch (err) { setError(err instanceof ArcadiaError ? err.message : "Failed to load monitoring data.") @@ -241,7 +260,13 @@ export default function MonitoringRoute() {
Service health - Derived from live signals on each subsystem. + + {data.health + ? `Live probes from /api/v1/health · checked ${new Date( + data.health.checked_at, + ).toLocaleTimeString()}` + : "Live probes from /api/v1/health (unavailable — backend may be down or older than the per-subsystem probe rollout)."} +
@@ -282,8 +307,72 @@ export default function MonitoringRoute() { /> - + {data.host ? ( +
+ } + tone={ + (data.host.cpu.util_pct ?? 0) > 90 + ? "negative" + : (data.host.cpu.util_pct ?? 0) > 70 + ? "warning" + : "neutral" + } + /> + } + tone={ + data.host.cpu.load_avg_1 != null && + data.host.cpu.num_cpus && + data.host.cpu.load_avg_1 > data.host.cpu.num_cpus + ? "warning" + : "neutral" + } + /> + } + tone={ + memoryUsedPct(data.host.memory) > 90 + ? "negative" + : memoryUsedPct(data.host.memory) > 75 + ? "warning" + : "neutral" + } + /> + } + tone={ + busiestDiskPct(data.host.disks) > 90 + ? "negative" + : busiestDiskPct(data.host.disks) > 75 + ? "warning" + : "neutral" + } + /> +
+ ) : null} + + + + Host + Background jobs @@ -301,6 +390,10 @@ export default function MonitoringRoute() { + + + + 0 - const dbOk = d.sessions !== null - const workersState: ComponentState = (() => { - if (!d.jobStats) return "partial-outage" - const r = d.jobStats.counts.retryable ?? 0 - const x = d.jobStats.counts.discarded ?? 0 - if (x > 100) return "major-outage" - if (r > 50 || x > 0) return "degraded" - return "operational" - })() - const storageState: ComponentState = - d.spaces.length > 0 || d.infraSummary ? "operational" : "partial-outage" - - return [ - { - id: "api", - name: "API", - description: "/api/v1 — auth, REST endpoints", - state: apiOk ? "operational" : "partial-outage", - }, - { - id: "db", - name: "Database", - description: "Postgres — sessions, audit log", - state: dbOk ? "operational" : "partial-outage", - }, - { - id: "workers", + const subsystems = d.health?.subsystems + const meta: Record = { + api: { name: "API", description: "/api/v1 — auth, REST endpoints" }, + db: { name: "Database", description: "Postgres — sessions, audit log" }, + workers: { name: "Background workers", description: "Oban — webhook delivery, scheduled tasks", - state: workersState, }, - { - id: "storage", + storage: { name: "Storage", - description: "DigitalOcean Spaces / S3-compatible object storage", - state: storageState, + description: "S3-compatible object storage (per platform default)", }, - ] + } + + return SUBSYSTEMS.map((id) => { + const probe = subsystems?.[id] + return { + id, + name: meta[id].name, + description: probe?.message ?? meta[id].description, + state: probe ? mapHealthState(probe) : "partial-outage", + } satisfies StatusComponent + }) +} + +function mapHealthState(probe: SubsystemHealth): ComponentState { + switch (probe.status) { + case "ok": + case "unconfigured": + return "operational" + case "degraded": + return "degraded" + case "error": + return "major-outage" + default: + return "partial-outage" + } +} + +// --- Host panel -------------------------------------------------------- + +function HostPanel({ host }: { host: HostStats | null }) { + if (!host) { + return ( + } + text="Host stats unavailable. The /api/v1/health/host endpoint may not be deployed yet, or os_mon daemons aren't reachable." + /> + ) + } + + const memUsed = memoryUsedBytes(host.memory) + const memTotal = host.memory.total_bytes ?? null + const memPct = memoryUsedPct(host.memory) + const swapTotal = host.memory.swap_total_bytes ?? null + const swapUsed = + swapTotal != null && host.memory.swap_free_bytes != null + ? swapTotal - host.memory.swap_free_bytes + : null + + return ( +
+ {/* CPU + load */} +
+ + + CPU + + {host.cpu.num_cpus + ? `${host.cpu.num_cpus} cores · ${host.cpu.schedulers_online} BEAM schedulers online` + : `${host.cpu.schedulers_online} BEAM schedulers online`} + + + + + {host.cpu.per_cpu_pct.length > 0 ? ( +
+ Per core +
+ {host.cpu.per_cpu_pct.map((p, i) => ( +
50 ? "var(--primary-foreground)" : "var(--foreground)", + }} + title={`Core ${i}: ${p.toFixed(1)}%`} + > + {p.toFixed(0)} +
+ ))} +
+
+ ) : null} +
+
+ + + + Load average + + Unix-style load average. A value above the core count means the + run-queue is saturated. + + + +
+ + + +
+
+
+
+ + {/* Memory */} + + + Memory + + {memTotal != null ? `${formatBytes(memTotal)} total` : "Total memory unknown"} + {host.memory.available_bytes != null + ? ` · ${formatBytes(host.memory.available_bytes)} available` + : ""} + + + + + {(host.memory.buffered_bytes != null || host.memory.cached_bytes != null) && ( +
+ {host.memory.buffered_bytes != null && ( + Buffered: {formatBytes(host.memory.buffered_bytes)} + )} + {host.memory.cached_bytes != null && ( + Cached: {formatBytes(host.memory.cached_bytes)} + )} +
+ )} + {swapTotal != null && swapTotal > 0 ? ( + + ) : null} +
+
+ + {/* Disks */} + + + Disks + One row per mount point. + + + {host.disks.length === 0 ? ( +

No disks reported.

+ ) : ( +
+ {host.disks.map((d) => ( + + ))} +
+ )} +
+
+
+ ) +} + +function UsageBar({ + label, + pct, + valueText, +}: { + label: string + pct: number | null + valueText: string +}) { + const clamped = pct == null ? 0 : Math.max(0, Math.min(100, pct)) + const tone = pct == null ? "var(--muted-foreground)" : barColor(pct) + return ( +
+
+ {label} + {valueText} +
+
+
+
+
+ ) +} + +function LoadAvgCell({ + label, + value, + cores, +}: { + label: string + value: number | null + cores: number | null +}) { + const saturated = value != null && cores != null && value > cores + return ( +
+ {label} + + {value != null ? value.toFixed(2) : "—"} + + {cores ? ( + / {cores} cores + ) : null} +
+ ) +} + +function memoryUsedBytes(m: HostStats["memory"]): number | null { + if (m.total_bytes == null) return null + // Prefer "available" over "free" — on Linux, free excludes reclaimable + // buffer/cache memory and overstates pressure. + const available = m.available_bytes ?? m.free_bytes + if (available == null) return null + return Math.max(0, m.total_bytes - available) +} + +function memoryUsedPct(m: HostStats["memory"]): number { + const used = memoryUsedBytes(m) + if (used == null || m.total_bytes == null || m.total_bytes === 0) return 0 + return (used / m.total_bytes) * 100 +} + +function memoryUsedLabel(m: HostStats["memory"]): string { + const used = memoryUsedBytes(m) + if (used == null || m.total_bytes == null) return "—" + return `${formatBytes(used)} / ${formatBytes(m.total_bytes)}` +} + +function busiestDiskPct(disks: HostStats["disks"]): number { + return disks.reduce((m, d) => Math.max(m, d.used_pct), 0) +} + +function busiestDiskLabel(disks: HostStats["disks"]): string { + if (disks.length === 0) return "—" + const busiest = disks.reduce((a, b) => (b.used_pct > a.used_pct ? b : a)) + return `${busiest.used_pct}% (${busiest.mount})` +} + +function barColor(pct: number): string { + if (pct >= 90) return "var(--destructive)" + if (pct >= 75) return "#f59e0b" + return "var(--primary)" } // --- Jobs panel -------------------------------------------------------- diff --git a/app/routes/settings.tsx b/app/routes/settings.tsx index 256997b..bd9b8f8 100644 --- a/app/routes/settings.tsx +++ b/app/routes/settings.tsx @@ -18,6 +18,7 @@ import { } from "@crema/llm-providers-ui" import { useArcadiaClient } from "@crema/arcadia-client" +import { probeProxy, type LLMProxyProvider } from "~/lib/arcadia/llm-proxy" import { AppShell } from "~/components/layout/app-shell" import { Button } from "~/components/ui/button" import { @@ -98,15 +99,15 @@ export default function SettingsRoute() { arcadiaTenantId, }) - // In proxy mode the adapter just being built is the strongest signal we - // can get without actually firing a chat request — the proxy endpoint - // doesn't exist on the backend yet, so any /models probe would 404. + // Proxy mode: round-trip a 1-token chat to verify auth → secret + // resolution → upstream dispatch end-to-end. Maps the contract's + // specific error codes to user-facing messages. if (s.mode === "proxy") { - return { - ok: true, - message: - "Adapter built. Note: the backend proxy (/api/v1/ai/llm/chat) isn't deployed yet — see docs/LLM_PROXY_CONTRACT.md.", - } + return probeProxy(arcadia, { + provider: s.providerId as LLMProxyProvider, + model: s.model || (s.providerId === "anthropic" ? "claude-opus-4-7" : "gpt-4o-mini"), + secretName: s.secretName || undefined, + }) } // Direct mode — for OpenAI-compatible endpoints, /models is a cheap probe. diff --git a/docs/LLM_PROXY_CONTRACT.md b/docs/LLM_PROXY_CONTRACT.md index 5a0c9d0..6014eb1 100644 --- a/docs/LLM_PROXY_CONTRACT.md +++ b/docs/LLM_PROXY_CONTRACT.md @@ -1,6 +1,6 @@ # LLM Proxy Contract -> **Status: not yet implemented on the backend.** This document is the contract that `lib-llm-providers-ui` expects from arcadia. Implement `POST /api/v1/ai/llm/chat` server-side to make `mode: "proxy"` work in the client. +> **Status: implemented.** Backend lives in `arcadia-app` at `apps/arcadia_core/lib/arcadia/ai/llm_proxy*` (see commit `75669f1`). This document remains the contract that `lib-llm-providers-ui` and `app/lib/arcadia/llm-proxy.ts` expect from arcadia — keep it in sync if either side changes. ## Why a proxy?