Wire health probes, host stats, and LLM proxy round-trip

Three things from the latest arcadia-app pull:

- health.ts: client for /api/v1/health{,/:service,/detailed,/host}. monitoring.tsx now reads real per-subsystem probe state instead of synthesizing it from indirect signals (rate limits, sessions, jobs).
- New Host tab on Monitoring with KPI tiles, per-core CPU bars, load-average cards, memory and swap usage, and per-mount disk bars, backed by /api/v1/health/host.
- llm-proxy.ts: typed errors (secret_disabled, ip_not_allowed, etc.) and a probeProxy() that round-trips a 1-token chat. settings.tsx's "Test connection" in proxy mode now exercises the real endpoint instead of just confirming the adapter built.

Contract doc flipped from "not yet implemented" to "implemented".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 17:05:22 +10:00
parent 0fcb9e40f1
commit 29030c9e72
5 changed files with 661 additions and 46 deletions
--- a/app/lib/arcadia/health.ts
+++ b/app/lib/arcadia/health.ts
@@ -0,0 +1,94 @@
+// Arcadia health probes.
+//
+// Backed by /api/v1/health* (public — no auth). Each subsystem is probed
+// independently; the overall endpoint aggregates and returns 503 if any
+// subsystem is not "ok". See arcadia-app commit f427892.
+
+import type { ArcadiaClient } from "@crema/arcadia-client"
+
+/**
+ * Name of a probed subsystem. Used as the key of `OverallHealth.subsystems`
+ * and as the `:service` path segment in `getServiceHealth`.
+ */
+export type HealthSubsystem = "api" | "db" | "workers" | "storage"
+
+/** Status value carried by both the aggregate result and each subsystem result. */
+export type HealthStatus = "ok" | "degraded" | "error" | "unconfigured"
+
+/**
+ * Probe result for one subsystem. Returned directly by `getServiceHealth`
+ * and nested under `OverallHealth.subsystems`.
+ */
+export interface SubsystemHealth {
+  /** Status reported for this subsystem. */
+  status: HealthStatus
+  /** Optional human-readable detail. */
+  message?: string
+  /** Free-form metrics — shape is subsystem-specific. */
+  details?: Record<string, unknown>
+}
+
+/** Aggregate health payload: one overall status plus a per-subsystem breakdown. */
+export interface OverallHealth {
+  /** Overall status across all subsystems. */
+  status: HealthStatus
+  /** When the probe ran. NOTE(review): presumably an ISO-8601 timestamp — confirm against the API. */
+  checked_at: string
+  /** Probe result keyed by subsystem name. */
+  subsystems: Record<HealthSubsystem, SubsystemHealth>
+}
+
+/** `OverallHealth` plus optional runtime information, as returned by `getHealthDetailed`. */
+export interface DetailedHealth extends OverallHealth {
+  /** BEAM info — present on /health/detailed only. */
+  system?: {
+    otp_release?: string
+    elixir_version?: string
+    process_count?: number
+    memory_total_bytes?: number
+    // Open-ended: the server may send additional keys not listed above.
+    [k: string]: unknown
+  }
+}
+
+/**
+ * Host-level resource snapshot returned by `getHostStats`.
+ *
+ * Many fields are typed `number | null`.
+ * NOTE(review): null presumably means the host could not report that metric — confirm.
+ * NOTE(review): `*_pct` fields are presumably percentages in the 0–100 range — confirm.
+ */
+export interface HostStats {
+  /** CPU utilisation, load averages, and core counts. */
+  cpu: {
+    util_pct: number | null
+    /** One entry per CPU. */
+    per_cpu_pct: number[]
+    load_avg_1: number | null
+    load_avg_5: number | null
+    load_avg_15: number | null
+    schedulers_online: number
+    num_cpus: number | null
+  }
+  /** Memory and swap figures, in bytes per the field names. */
+  memory: {
+    total_bytes: number | null
+    free_bytes: number | null
+    available_bytes: number | null
+    buffered_bytes: number | null
+    cached_bytes: number | null
+    swap_total_bytes: number | null
+    swap_free_bytes: number | null
+  }
+  /** One entry per mount point; note the size is in kilobytes (`total_kb`), unlike the byte-based memory fields. */
+  disks: Array<{ mount: string; total_kb: number; used_pct: number }>
+  /** When the snapshot was taken. NOTE(review): presumably an ISO-8601 timestamp — confirm. */
+  checked_at: string
+}
+
+/** Path prefix shared by every health endpoint called from this module. */
+const BASE = "/api/v1/health"
+
+/**
+ * Fetches the aggregate health report from the base health endpoint.
+ *
+ * The response may arrive either bare or wrapped in a `{ data }` envelope;
+ * both shapes are accepted.
+ *
+ * NOTE(review): the module header says this endpoint answers 503 when any
+ * subsystem is not "ok" — confirm how `ArcadiaClient.GET` surfaces non-2xx
+ * responses to callers of this function.
+ *
+ * @param arcadia Client used to issue the GET request.
+ * @returns The aggregate health payload.
+ */
+export async function getHealth(arcadia: ArcadiaClient): Promise<OverallHealth> {
+  type Payload = { data: OverallHealth } | OverallHealth
+  const payload = await arcadia.GET<Payload>(BASE)
+  return unwrap(payload)
+}
+
+/**
+ * Fetches the probe result for a single subsystem.
+ *
+ * The subsystem name is appended to the health base path as a path segment.
+ * The response may arrive either bare or wrapped in a `{ data }` envelope.
+ *
+ * @param arcadia Client used to issue the GET request.
+ * @param service Subsystem to query.
+ * @returns The probe result for that subsystem.
+ */
+export async function getServiceHealth(
+  arcadia: ArcadiaClient,
+  service: HealthSubsystem,
+): Promise<SubsystemHealth> {
+  type Payload = { data: SubsystemHealth } | SubsystemHealth
+  const path = `${BASE}/${service}`
+  const payload = await arcadia.GET<Payload>(path)
+  return unwrap(payload)
+}
+
+/**
+ * Fetches the detailed health report (`<base>/detailed`), which may include
+ * the optional `system` section on top of the aggregate fields.
+ *
+ * The response may arrive either bare or wrapped in a `{ data }` envelope.
+ *
+ * @param arcadia Client used to issue the GET request.
+ * @returns The detailed health payload.
+ */
+export async function getHealthDetailed(arcadia: ArcadiaClient): Promise<DetailedHealth> {
+  type Payload = { data: DetailedHealth } | DetailedHealth
+  const payload = await arcadia.GET<Payload>(`${BASE}/detailed`)
+  return unwrap(payload)
+}
+
+/**
+ * Fetches the host resource snapshot (`<base>/host`): CPU, memory, and disks.
+ *
+ * The response may arrive either bare or wrapped in a `{ data }` envelope.
+ *
+ * @param arcadia Client used to issue the GET request.
+ * @returns The host statistics payload.
+ */
+export async function getHostStats(arcadia: ArcadiaClient): Promise<HostStats> {
+  type Payload = { data: HostStats } | HostStats
+  const payload = await arcadia.GET<Payload>(`${BASE}/host`)
+  return unwrap(payload)
+}
+
+/** Runtime list containing each `HealthSubsystem` value. */
+export const SUBSYSTEMS: HealthSubsystem[] = ["api", "db", "workers", "storage"]
+
+/**
+ * Returns the payload from a response that may or may not be wrapped in a
+ * `{ data }` envelope.
+ *
+ * Any non-null object that has a `data` key is treated as an envelope and
+ * its `data` value is returned; every other value (primitives, null,
+ * objects without `data`) is returned unchanged.
+ */
+function unwrap<T>(res: { data: T } | T): T {
+  // Primitives and null can never be envelopes.
+  if (res === null || typeof res !== "object") {
+    return res as T
+  }
+  const looksLikeEnvelope = "data" in (res as object)
+  return looksLikeEnvelope ? (res as { data: T }).data : (res as T)
+}
--- a/app/lib/arcadia/llm-proxy.ts
+++ b/app/lib/arcadia/llm-proxy.ts
@@ -0,0 +1,182 @@
+// Arcadia LLM proxy client.
+//
+// Implements the spec in docs/LLM_PROXY_CONTRACT.md against arcadia-app's
+// POST /api/v1/ai/llm/chat. The lib (@crema/llm-providers-ui buildAdapter)
+// owns the streaming chat path itself; this module exposes a lightweight
+// non-streaming probe so the Settings "Test connection" button can verify
+// the proxy round-trips end-to-end (auth → secret resolution → upstream
+// dispatch → response shape).
+
+import type { ArcadiaClient } from "@crema/arcadia-client"
+
+/** Provider ids accepted in the `provider` field of a proxy chat request. */
+export type LLMProxyProvider =
+  | "openai"
+  | "anthropic"
+  | "deepseek"
+  | "qwen"
+  | "lmstudio"
+
+/**
+ * Error codes carried by `LLMProxyError`. Each code except "unknown" has a
+ * dedicated user-facing message in `friendly`; "unknown" is the fallback
+ * used when no more specific code can be determined.
+ */
+export type LLMProxyErrorCode =
+  | "unauthorized"
+  | "secret_disabled"
+  | "secret_expired"
+  | "secret_consumed"
+  | "ip_not_allowed"
+  | "unknown_provider"
+  | "upstream_unavailable"
+  | "rate_limited"
+  | "unknown"
+
+/** Request body for the proxy chat endpoint. */
+export interface LLMProxyChatRequest {
+  /** Which upstream provider the proxy should dispatch to. */
+  provider: LLMProxyProvider
+  /** Required for every provider except `lmstudio`. */
+  secret_name?: string
+  /** Model identifier passed through to the provider. */
+  model: string
+  /** Conversation so far, in order. */
+  messages: Array<{ role: "system" | "user" | "assistant"; content: string }>
+  /** Note: `chat` in this module always sends `false`, regardless of this value. */
+  stream?: boolean
+  max_tokens?: number
+  temperature?: number
+}
+
+/**
+ * Non-streaming response body from the proxy chat endpoint.
+ *
+ * NOTE(review): the shape looks like an OpenAI-style chat completion —
+ * confirm against docs/LLM_PROXY_CONTRACT.md.
+ */
+export interface LLMProxyChatResponse {
+  id: string
+  object: "chat.completion"
+  created: number
+  /** Model name as reported in the response; `probeProxy` echoes it in its success message. */
+  model: string
+  choices: Array<{
+    index: number
+    finish_reason: string | null
+    message: { role: "assistant"; content: string; tool_calls: unknown }
+  }>
+  /** Token accounting; optional, so consumers must handle its absence. */
+  usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number }
+}
+
+/**
+ * Typed error thrown by `chat` for any failed proxy request.
+ *
+ * Carries a machine-readable `code`, the HTTP `status` (0 when none was
+ * available), and an optional `retryAfter` read from the Retry-After header.
+ */
+export class LLMProxyError extends Error {
+  /** Machine-readable failure category. */
+  readonly code: LLMProxyErrorCode
+  /** HTTP status of the failed response; callers in this module pass 0 when unknown. */
+  readonly status: number
+  /** Value derived from the Retry-After header, when one was present and numeric. */
+  readonly retryAfter?: number
+
+  /**
+   * @param code Failure category.
+   * @param message Human-readable description; becomes `Error.message`.
+   * @param status HTTP status code.
+   * @param retryAfter Optional retry delay.
+   */
+  constructor(code: LLMProxyErrorCode, message: string, status: number, retryAfter?: number) {
+    super(message)
+    // Override the default "Error" name so logs and stack traces identify this class.
+    this.name = "LLMProxyError"
+    this.code = code
+    this.status = status
+    this.retryAfter = retryAfter
+  }
+}
+
+/**
+ * Performs a single non-streaming chat completion through the proxy.
+ *
+ * Streaming is handled elsewhere (@crema/llm-providers-ui's buildAdapter);
+ * this function is meant for probes and one-shot calls where SSE is
+ * overkill. `stream` is always sent as `false`, overriding any value on
+ * `req`.
+ *
+ * @param arcadia Client used to issue the POST request.
+ * @param req Chat request; every field is forwarded in the request body.
+ * @returns The response as returned by the client.
+ * @throws LLMProxyError Any error raised while posting is converted with
+ *   `asProxyError` before being rethrown.
+ */
+export async function chat(
+  arcadia: ArcadiaClient,
+  req: LLMProxyChatRequest,
+): Promise<LLMProxyChatResponse> {
+  try {
+    const payload = { ...req, stream: false }
+    return await arcadia.POST<LLMProxyChatResponse>("/api/v1/ai/llm/chat", { body: payload })
+  } catch (cause) {
+    throw asProxyError(cause)
+  }
+}
+
+/**
+ * Cheap end-to-end probe for the Settings "Test connection" flow in proxy
+ * mode. Sends a single user message ("ping") with `max_tokens: 1` through
+ * `chat` and turns the outcome into a displayable result.
+ *
+ * This function never rejects because of a failed request: every error
+ * raised while probing — including an upstream rejection of the 1-token
+ * budget — is reported as `ok: false`. There is no special-casing of
+ * particular failures.
+ *
+ * @param arcadia Client used for the request.
+ * @param opts Provider, model, and optional vault secret name to probe with.
+ * @returns `ok: true` with the responding model (and total token count when
+ *   the response reports usage), or `ok: false` with a user-facing message.
+ */
+export async function probeProxy(
+  arcadia: ArcadiaClient,
+  opts: { provider: LLMProxyProvider; model: string; secretName?: string },
+): Promise<{ ok: boolean; message: string }> {
+  try {
+    const request: LLMProxyChatRequest = {
+      provider: opts.provider,
+      secret_name: opts.secretName,
+      model: opts.model,
+      messages: [{ role: "user", content: "ping" }],
+      max_tokens: 1,
+      stream: false,
+    }
+    const reply = await chat(arcadia, request)
+    const totalTokens = reply.usage?.total_tokens
+    const tokenSuffix = totalTokens != null ? ` · ${totalTokens} tokens` : ""
+    return { ok: true, message: `Proxy OK — ${reply.model}${tokenSuffix}.` }
+  } catch (err) {
+    // Typed proxy errors get a tailored message; anything else falls back
+    // to the raw error text.
+    let message: string
+    if (err instanceof LLMProxyError) {
+      message = friendly(err)
+    } else if (err instanceof Error) {
+      message = err.message
+    } else {
+      message = String(err)
+    }
+    return { ok: false, message }
+  }
+}
+
+/** Every code in `LLMProxyErrorCode`, used to validate codes received at runtime. */
+const KNOWN_PROXY_ERROR_CODES: readonly LLMProxyErrorCode[] = [
+  "unauthorized",
+  "secret_disabled",
+  "secret_expired",
+  "secret_consumed",
+  "ip_not_allowed",
+  "unknown_provider",
+  "upstream_unavailable",
+  "rate_limited",
+  "unknown",
+]
+
+/**
+ * Converts anything thrown by the client into an `LLMProxyError`.
+ *
+ * - An existing `LLMProxyError` is returned as-is, so its `retryAfter` is
+ *   not lost by re-wrapping.
+ * - For other objects, `status`, `code`, `message`, and `headers` are read
+ *   best-effort. The code comes from `body.error.code` first, then `code`.
+ *   A string code that is not part of `LLMProxyErrorCode` (for example a
+ *   runtime code such as "ECONNREFUSED") is reported as "unknown" rather
+ *   than being cast into the union; when no string code is present at all,
+ *   the code is inferred from the HTTP status.
+ * - Non-object values are stringified into the message with code "unknown".
+ *
+ * @param e The caught value.
+ * @returns An `LLMProxyError` describing the failure; status is 0 when unavailable.
+ */
+function asProxyError(e: unknown): LLMProxyError {
+  if (e instanceof LLMProxyError) return e
+
+  // ArcadiaClient throws ArcadiaError with a wrapped { error: { code, message } }
+  // body and HTTP status. Best-effort destructure without coupling to the
+  // class shape (it lives in a sibling lib).
+  if (e && typeof e === "object") {
+    const anyE = e as {
+      status?: unknown
+      code?: unknown
+      message?: unknown
+      body?: { error?: { code?: unknown; message?: unknown } }
+      headers?: Headers | Record<string, string>
+    }
+    const status = typeof anyE.status === "number" ? anyE.status : 0
+
+    const rawCode = anyE.body?.error?.code ?? anyE.code
+    let code: LLMProxyErrorCode
+    if (typeof rawCode === "string" && rawCode !== "") {
+      // An explicit but unrecognised code must not be replaced by a status
+      // guess; "unknown" makes `friendly` fall back to the server's message.
+      code = KNOWN_PROXY_ERROR_CODES.find((known) => known === rawCode) ?? "unknown"
+    } else {
+      code = inferCodeFromStatus(status)
+    }
+
+    const rawMessage = anyE.body?.error?.message ?? anyE.message
+    const message =
+      typeof rawMessage === "string" && rawMessage !== "" ? rawMessage : "Proxy request failed."
+
+    return new LLMProxyError(code, message, status, readRetryAfter(anyE.headers))
+  }
+  return new LLMProxyError("unknown", String(e), 0)
+}
+
+/**
+ * Maps an HTTP status to the closest proxy error code. Used when the error
+ * itself did not carry a code.
+ *
+ * @param status HTTP status code.
+ * @returns The matching code, or "unknown" for any status without a mapping.
+ */
+function inferCodeFromStatus(status: number): LLMProxyErrorCode {
+  switch (status) {
+    case 401:
+      return "unauthorized"
+    case 403:
+      return "ip_not_allowed"
+    case 404:
+      return "unknown_provider"
+    case 410:
+      return "secret_expired"
+    case 429:
+      return "rate_limited"
+    // Gateway-style failures all mean the upstream did not answer properly.
+    case 502:
+    case 503:
+    case 504:
+      return "upstream_unavailable"
+    default:
+      return "unknown"
+  }
+}
+
+/**
+ * Extracts the Retry-After delay, in seconds, from response headers.
+ *
+ * Accepts either a `Headers` instance or a plain record; record keys are
+ * matched case-insensitively. Both header forms are supported:
+ * a non-negative number of seconds, or an HTTP-date, which is converted to
+ * the number of whole seconds from now (never below 0).
+ *
+ * @param h Response headers, if any.
+ * @returns The delay in seconds, or `undefined` when the header is absent,
+ *   blank, negative, or unparseable.
+ */
+function readRetryAfter(h: Headers | Record<string, string> | undefined): number | undefined {
+  if (!h) return undefined
+
+  let raw: unknown
+  // Guard the global: `Headers` is not defined in every runtime.
+  if (typeof Headers !== "undefined" && h instanceof Headers) {
+    raw = h.get("retry-after")
+  } else {
+    const record = h as Record<string, string>
+    const key = Object.keys(record).find((name) => name.toLowerCase() === "retry-after")
+    raw = key === undefined ? undefined : record[key]
+  }
+  if (raw == null) return undefined
+
+  const text = String(raw).trim()
+  // Number("") is 0, so a blank header must be rejected explicitly.
+  if (text === "") return undefined
+
+  const seconds = Number(text)
+  if (Number.isFinite(seconds)) return seconds >= 0 ? seconds : undefined
+
+  // Not numeric: try the HTTP-date form.
+  const when = Date.parse(text)
+  if (Number.isNaN(when)) return undefined
+  return Math.max(0, Math.ceil((when - Date.now()) / 1000))
+}
+
+/** User-facing text for every code whose message does not depend on the error instance. */
+const FIXED_PROXY_MESSAGES: ReadonlyMap<string, string> = new Map([
+  ["unauthorized", "Sign in expired — refresh and try again."],
+  ["secret_disabled", "The vault secret is disabled. Re-enable it under /secrets."],
+  ["secret_expired", "The vault secret has expired. Rotate it under /secrets."],
+  ["secret_consumed", "Read-once secret already used. Rotate it under /secrets."],
+  ["ip_not_allowed", "This client's IP is blocked by the secret's allowlist."],
+  ["unknown_provider", "The proxy doesn't recognise this provider. Check the provider id."],
+  ["upstream_unavailable", "The upstream LLM provider returned an error or timed out."],
+])
+
+/**
+ * Turns a proxy error into a message suitable for showing to the user.
+ *
+ * Rate-limit errors mention the retry delay when one is known (a missing or
+ * zero delay gets the generic wording). Codes without a dedicated message
+ * fall back to the error's own `message`.
+ *
+ * @param err The proxy error to describe.
+ * @returns The user-facing message.
+ */
+export function friendly(err: LLMProxyError): string {
+  if (err.code === "rate_limited") {
+    if (err.retryAfter) {
+      return `Rate limited. Retry in ${err.retryAfter}s.`
+    }
+    return "Rate limited — slow down and try again."
+  }
+  const fixed = FIXED_PROXY_MESSAGES.get(err.code)
+  return fixed !== undefined ? fixed : err.message
+}