Wire health probes, host stats, and LLM proxy round-trip
Three things from the latest arcadia-app pull:
- health.ts: client for /api/v1/health{,/:service,/detailed,/host}.
monitoring.tsx now reads real per-subsystem probe state instead of
synthesizing it from indirect signals (rate limits, sessions, jobs).
- New Host tab on Monitoring with KPI tiles + per-core CPU bars,
load-avg cards, memory + swap usage, and per-mount disk bars,
backed by /api/v1/health/host.
- llm-proxy.ts: typed errors (secret_disabled, ip_not_allowed, etc.)
and a probeProxy() that round-trips a 1-token chat. settings.tsx's
"Test connection" in proxy mode now exercises the real endpoint
instead of just confirming the adapter built. Contract doc flipped
from "not yet implemented" to "implemented".
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
94
app/lib/arcadia/health.ts
Normal file
94
app/lib/arcadia/health.ts
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
// Arcadia health probes.
//
// Backed by /api/v1/health* (public — no auth). Each subsystem is probed
// independently; the overall endpoint aggregates and returns 503 if any
// subsystem is not "ok". See arcadia-app commit f427892.
//
// Because a non-"ok" report is delivered with a 503, the health getters
// below recover the report from a rejected request when the error carries
// it, instead of losing the data exactly when it is most interesting.

import type { ArcadiaClient } from "@crema/arcadia-client"

/** Subsystems that have their own probe under /api/v1/health/:service. */
export type HealthSubsystem = "api" | "db" | "workers" | "storage"

/** Probe verdict reported by the backend. */
export type HealthStatus = "ok" | "degraded" | "error" | "unconfigured"

/** Result of probing a single subsystem. */
export interface SubsystemHealth {
  status: HealthStatus
  /** Optional human-readable detail. */
  message?: string
  /** Free-form metrics — shape is subsystem-specific. */
  details?: Record<string, unknown>
}

/** Aggregated report returned by the overall health endpoint. */
export interface OverallHealth {
  status: HealthStatus
  checked_at: string
  subsystems: Record<HealthSubsystem, SubsystemHealth>
}

/** Overall report plus runtime information. */
export interface DetailedHealth extends OverallHealth {
  /** BEAM info — present on /health/detailed only. */
  system?: {
    otp_release?: string
    elixir_version?: string
    process_count?: number
    memory_total_bytes?: number
    [k: string]: unknown
  }
}

/** Host-level CPU, memory and disk snapshot from /health/host. */
export interface HostStats {
  cpu: {
    util_pct: number | null
    per_cpu_pct: number[]
    load_avg_1: number | null
    load_avg_5: number | null
    load_avg_15: number | null
    schedulers_online: number
    num_cpus: number | null
  }
  memory: {
    total_bytes: number | null
    free_bytes: number | null
    available_bytes: number | null
    buffered_bytes: number | null
    cached_bytes: number | null
    swap_total_bytes: number | null
    swap_free_bytes: number | null
  }
  disks: Array<{ mount: string; total_kb: number; used_pct: number }>
  checked_at: string
}

const BASE = "/api/v1/health"

/**
 * Fetches the aggregated health report.
 *
 * @returns The report, including one recovered from a 503 response body.
 * @throws Whatever the client threw when no report could be recovered.
 */
export async function getHealth(arcadia: ArcadiaClient): Promise<OverallHealth> {
  return fetchReport<OverallHealth>(arcadia, BASE)
}

/**
 * Fetches the probe result for a single subsystem.
 *
 * @param service Subsystem id appended to the health base path.
 */
export async function getServiceHealth(
  arcadia: ArcadiaClient,
  service: HealthSubsystem,
): Promise<SubsystemHealth> {
  return fetchReport<SubsystemHealth>(arcadia, `${BASE}/${service}`)
}

/** Fetches the aggregated report together with runtime (`system`) info. */
export async function getHealthDetailed(arcadia: ArcadiaClient): Promise<DetailedHealth> {
  return fetchReport<DetailedHealth>(arcadia, `${BASE}/detailed`)
}

/** Fetches the host CPU / memory / disk snapshot. No 503 recovery is applied. */
export async function getHostStats(arcadia: ArcadiaClient): Promise<HostStats> {
  const res = await arcadia.GET<{ data: HostStats } | HostStats>(`${BASE}/host`)
  return unwrap(res)
}

/** All subsystem ids, in display order. */
export const SUBSYSTEMS: HealthSubsystem[] = ["api", "db", "workers", "storage"]

/**
 * GETs a health endpoint and unwraps the payload. When the request rejects,
 * a report carried by a 503 error body is returned instead of rethrowing.
 */
async function fetchReport<T>(arcadia: ArcadiaClient, path: string): Promise<T> {
  try {
    return unwrap(await arcadia.GET<{ data: T } | T>(path))
  } catch (e) {
    const recovered = reportFromError<T>(e)
    if (recovered !== undefined) return recovered
    throw e
  }
}

/**
 * Extracts a health report from a thrown client error, if there is one.
 *
 * NOTE(review): assumes the client rejects on non-2xx with an error exposing
 * `status` and the parsed response `body` — confirm against ArcadiaClient.
 * Anything that does not match that shape yields `undefined`.
 */
function reportFromError<T>(e: unknown): T | undefined {
  if (!e || typeof e !== "object") return undefined
  const { status, body } = e as { status?: unknown; body?: unknown }
  if (status !== 503) return undefined
  const payload = unwrap(body)
  // A report always has a string `status`; an error envelope does not.
  const looksLikeReport =
    !!payload &&
    typeof payload === "object" &&
    typeof (payload as { status?: unknown }).status === "string"
  return looksLikeReport ? (payload as T) : undefined
}

/** Accepts both `{ data: T }` envelopes and bare payloads and returns the payload. */
function unwrap<T>(res: { data: T } | T): T {
  return res && typeof res === "object" && "data" in (res as object)
    ? (res as { data: T }).data
    : (res as T)
}
|
||||||
182
app/lib/arcadia/llm-proxy.ts
Normal file
182
app/lib/arcadia/llm-proxy.ts
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
// Arcadia LLM proxy client.
//
// Implements the spec in docs/LLM_PROXY_CONTRACT.md against arcadia-app's
// POST /api/v1/ai/llm/chat. The lib (@crema/llm-providers-ui buildAdapter)
// owns the streaming chat path itself; this module exposes a lightweight
// non-streaming probe so the Settings "Test connection" button can verify
// the proxy round-trips end-to-end (auth → secret resolution → upstream
// dispatch → response shape).

import type { ArcadiaClient } from "@crema/arcadia-client"

/** Provider ids accepted in a proxy chat request. */
export type LLMProxyProvider =
  | "openai"
  | "anthropic"
  | "deepseek"
  | "qwen"
  | "lmstudio"

/** Error codes this client distinguishes; anything else is `"unknown"`. */
export type LLMProxyErrorCode =
  | "unauthorized"
  | "secret_disabled"
  | "secret_expired"
  | "secret_consumed"
  | "ip_not_allowed"
  | "unknown_provider"
  | "upstream_unavailable"
  | "rate_limited"
  | "unknown"

// Runtime mirror of LLMProxyErrorCode, used to validate codes coming off the wire.
const KNOWN_ERROR_CODES: ReadonlySet<string> = new Set<LLMProxyErrorCode>([
  "unauthorized",
  "secret_disabled",
  "secret_expired",
  "secret_consumed",
  "ip_not_allowed",
  "unknown_provider",
  "upstream_unavailable",
  "rate_limited",
  "unknown",
])

/** Request body for the proxy chat endpoint. */
export interface LLMProxyChatRequest {
  provider: LLMProxyProvider
  /** Required for every provider except `lmstudio`. */
  secret_name?: string
  model: string
  messages: Array<{ role: "system" | "user" | "assistant"; content: string }>
  stream?: boolean
  max_tokens?: number
  temperature?: number
}

/** Non-streaming chat completion as returned by the proxy. */
export interface LLMProxyChatResponse {
  id: string
  object: "chat.completion"
  created: number
  model: string
  choices: Array<{
    index: number
    finish_reason: string | null
    message: { role: "assistant"; content: string; tool_calls: unknown }
  }>
  usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number }
}

/** Typed failure raised by {@link chat}. */
export class LLMProxyError extends Error {
  readonly code: LLMProxyErrorCode
  /** HTTP status, or 0 when none was available. */
  readonly status: number
  /** Seconds to wait before retrying, when the response said so. */
  readonly retryAfter?: number

  constructor(code: LLMProxyErrorCode, message: string, status: number, retryAfter?: number) {
    super(message)
    // Keeps `instanceof LLMProxyError` working if the build down-levels classes to ES5.
    Object.setPrototypeOf(this, new.target.prototype)
    this.name = "LLMProxyError"
    this.code = code
    this.status = status
    this.retryAfter = retryAfter
  }
}

/**
 * Non-streaming chat completion via the proxy. The streaming path is owned
 * by @crema/llm-providers-ui's buildAdapter; use this for probes and
 * one-shot calls where SSE is overkill.
 *
 * `stream` is always sent as `false`, whatever the caller passed.
 *
 * @throws LLMProxyError for every failure of the underlying request.
 */
export async function chat(
  arcadia: ArcadiaClient,
  req: LLMProxyChatRequest,
): Promise<LLMProxyChatResponse> {
  try {
    const res = await arcadia.POST<LLMProxyChatResponse>(
      "/api/v1/ai/llm/chat",
      { body: { ...req, stream: false } },
    )
    return res
  } catch (e) {
    throw asProxyError(e)
  }
}

/**
 * Cheap end-to-end probe for the Settings "Test connection" flow in proxy
 * mode. Sends a 1-token "ping" and reports whether the call succeeded.
 *
 * Never throws: any failure is returned as `{ ok: false }` with a
 * user-facing message.
 *
 * NOTE(review): an earlier comment said token-budget rejections are
 * tolerated; the code has no such branch, so every error reports `ok: false`.
 */
export async function probeProxy(
  arcadia: ArcadiaClient,
  opts: { provider: LLMProxyProvider; model: string; secretName?: string },
): Promise<{ ok: boolean; message: string }> {
  try {
    const res = await chat(arcadia, {
      provider: opts.provider,
      secret_name: opts.secretName,
      model: opts.model,
      messages: [{ role: "user", content: "ping" }],
      max_tokens: 1,
      stream: false,
    })
    const used = res.usage?.total_tokens
    return {
      ok: true,
      message: `Proxy OK — ${res.model}${used != null ? ` · ${used} tokens` : ""}.`,
    }
  } catch (e) {
    if (e instanceof LLMProxyError) {
      return { ok: false, message: friendly(e) }
    }
    return { ok: false, message: e instanceof Error ? e.message : String(e) }
  }
}

/**
 * Normalises anything thrown by the client into an LLMProxyError.
 *
 * Code resolution: a recognised code from the error body (or the error
 * itself) is used as-is; an unrecognised code becomes `"unknown"` so the
 * server's own message is still what `friendly` shows; with no code at all
 * the HTTP status is used to infer one.
 */
function asProxyError(e: unknown): LLMProxyError {
  if (e instanceof LLMProxyError) return e
  // ArcadiaClient throws ArcadiaError with a wrapped { error: { code, message } }
  // body and HTTP status. Best-effort destructure without coupling to the
  // class shape (it lives in a sibling lib).
  if (e && typeof e === "object") {
    const anyE = e as {
      status?: number
      code?: string
      message?: string
      body?: { error?: { code?: string; message?: string } }
      headers?: Headers | Record<string, string>
    }
    const status = anyE.status ?? 0
    const rawCode: unknown = anyE.body?.error?.code ?? anyE.code
    let code: LLMProxyErrorCode
    if (rawCode == null) {
      code = inferCodeFromStatus(status)
    } else if (typeof rawCode === "string" && KNOWN_ERROR_CODES.has(rawCode)) {
      code = rawCode as LLMProxyErrorCode
    } else {
      code = "unknown"
    }
    const message = anyE.body?.error?.message ?? anyE.message ?? "Proxy request failed."
    const retryAfter = readRetryAfter(anyE.headers)
    return new LLMProxyError(code, message, status, retryAfter)
  }
  return new LLMProxyError("unknown", String(e), 0)
}

/** Fallback mapping from HTTP status to error code when the body carried none. */
function inferCodeFromStatus(status: number): LLMProxyErrorCode {
  if (status === 401) return "unauthorized"
  if (status === 403) return "ip_not_allowed"
  if (status === 404) return "unknown_provider"
  if (status === 410) return "secret_expired"
  if (status === 429) return "rate_limited"
  if (status === 502 || status === 503 || status === 504) return "upstream_unavailable"
  return "unknown"
}

/**
 * Reads Retry-After as a number of seconds.
 *
 * Handles both forms the header may take: delay-seconds and an HTTP-date
 * (converted to whole seconds from now, never negative). Header names in a
 * plain record are matched case-insensitively.
 *
 * @returns Seconds to wait, or `undefined` when absent or unparseable.
 */
function readRetryAfter(h: Headers | Record<string, string> | undefined): number | undefined {
  if (!h) return undefined
  let raw: string | null | undefined
  if (typeof (h as Headers).get === "function") {
    raw = (h as Headers).get("retry-after")
  } else {
    const match = Object.entries(h as Record<string, string>).find(
      ([name]) => name.toLowerCase() === "retry-after",
    )
    raw = match?.[1]
  }
  if (!raw) return undefined

  const seconds = Number(raw)
  if (Number.isFinite(seconds)) return seconds >= 0 ? seconds : undefined

  const at = Date.parse(raw)
  if (!Number.isFinite(at)) return undefined
  return Math.max(0, Math.ceil((at - Date.now()) / 1000))
}

/** Maps an LLMProxyError to the sentence shown to the user. */
export function friendly(err: LLMProxyError): string {
  switch (err.code) {
    case "unauthorized":
      return "Sign in expired — refresh and try again."
    case "secret_disabled":
      return "The vault secret is disabled. Re-enable it under /secrets."
    case "secret_expired":
      return "The vault secret has expired. Rotate it under /secrets."
    case "secret_consumed":
      return "Read-once secret already used. Rotate it under /secrets."
    case "ip_not_allowed":
      return "This client's IP is blocked by the secret's allowlist."
    case "unknown_provider":
      return "The proxy doesn't recognise this provider. Check the provider id."
    case "upstream_unavailable":
      return "The upstream LLM provider returned an error or timed out."
    case "rate_limited":
      // A missing or zero delay falls back to the generic wording.
      return err.retryAfter
        ? `Rate limited. Retry in ${err.retryAfter}s.`
        : "Rate limited — slow down and try again."
    default:
      return err.message
  }
}
|
||||||
@@ -71,6 +71,15 @@ import {
|
|||||||
type RateLimit,
|
type RateLimit,
|
||||||
type Space,
|
type Space,
|
||||||
} from "~/lib/arcadia/monitoring"
|
} from "~/lib/arcadia/monitoring"
|
||||||
|
import {
|
||||||
|
getHealth,
|
||||||
|
getHostStats,
|
||||||
|
SUBSYSTEMS,
|
||||||
|
type HealthSubsystem,
|
||||||
|
type HostStats,
|
||||||
|
type OverallHealth,
|
||||||
|
type SubsystemHealth,
|
||||||
|
} from "~/lib/arcadia/health"
|
||||||
import { pageTitle } from "~/lib/page-meta"
|
import { pageTitle } from "~/lib/page-meta"
|
||||||
import { useSession } from "~/lib/session"
|
import { useSession } from "~/lib/session"
|
||||||
import { useRegisterAdminContext } from "~/lib/admin-context"
|
import { useRegisterAdminContext } from "~/lib/admin-context"
|
||||||
@@ -86,6 +95,8 @@ interface DashboardData {
|
|||||||
spaces: Space[]
|
spaces: Space[]
|
||||||
droplets: Droplet[]
|
droplets: Droplet[]
|
||||||
auditStats: AuditStats | null
|
auditStats: AuditStats | null
|
||||||
|
health: OverallHealth | null
|
||||||
|
host: HostStats | null
|
||||||
}
|
}
|
||||||
|
|
||||||
const EMPTY: DashboardData = {
|
const EMPTY: DashboardData = {
|
||||||
@@ -97,6 +108,8 @@ const EMPTY: DashboardData = {
|
|||||||
spaces: [],
|
spaces: [],
|
||||||
droplets: [],
|
droplets: [],
|
||||||
auditStats: null,
|
auditStats: null,
|
||||||
|
health: null,
|
||||||
|
host: null,
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function MonitoringRoute() {
|
export default function MonitoringRoute() {
|
||||||
@@ -121,6 +134,8 @@ export default function MonitoringRoute() {
|
|||||||
spaces,
|
spaces,
|
||||||
droplets,
|
droplets,
|
||||||
auditStats,
|
auditStats,
|
||||||
|
health,
|
||||||
|
host,
|
||||||
] = await Promise.all([
|
] = await Promise.all([
|
||||||
getJobStats(arcadia).catch(() => null),
|
getJobStats(arcadia).catch(() => null),
|
||||||
getRecentJobs(arcadia, { limit: 50 }).catch(() => []),
|
getRecentJobs(arcadia, { limit: 50 }).catch(() => []),
|
||||||
@@ -132,6 +147,8 @@ export default function MonitoringRoute() {
|
|||||||
getAuditStats(arcadia, {
|
getAuditStats(arcadia, {
|
||||||
from: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString(),
|
from: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString(),
|
||||||
}).catch(() => null),
|
}).catch(() => null),
|
||||||
|
getHealth(arcadia).catch(() => null),
|
||||||
|
getHostStats(arcadia).catch(() => null),
|
||||||
])
|
])
|
||||||
setData({
|
setData({
|
||||||
jobStats,
|
jobStats,
|
||||||
@@ -142,6 +159,8 @@ export default function MonitoringRoute() {
|
|||||||
spaces,
|
spaces,
|
||||||
droplets,
|
droplets,
|
||||||
auditStats,
|
auditStats,
|
||||||
|
health,
|
||||||
|
host,
|
||||||
})
|
})
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
setError(err instanceof ArcadiaError ? err.message : "Failed to load monitoring data.")
|
setError(err instanceof ArcadiaError ? err.message : "Failed to load monitoring data.")
|
||||||
@@ -241,7 +260,13 @@ export default function MonitoringRoute() {
|
|||||||
<CardHeader className="flex flex-row items-center justify-between">
|
<CardHeader className="flex flex-row items-center justify-between">
|
||||||
<div>
|
<div>
|
||||||
<CardTitle>Service health</CardTitle>
|
<CardTitle>Service health</CardTitle>
|
||||||
<CardDescription>Derived from live signals on each subsystem.</CardDescription>
|
<CardDescription>
|
||||||
|
{data.health
|
||||||
|
? `Live probes from /api/v1/health · checked ${new Date(
|
||||||
|
data.health.checked_at,
|
||||||
|
).toLocaleTimeString()}`
|
||||||
|
: "Live probes from /api/v1/health (unavailable — backend may be down or older than the per-subsystem probe rollout)."}
|
||||||
|
</CardDescription>
|
||||||
</div>
|
</div>
|
||||||
<OverallStatus components={components} />
|
<OverallStatus components={components} />
|
||||||
</CardHeader>
|
</CardHeader>
|
||||||
@@ -282,8 +307,72 @@ export default function MonitoringRoute() {
|
|||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<Tabs defaultValue="jobs">
|
{data.host ? (
|
||||||
|
<div className="grid grid-cols-2 gap-3 md:grid-cols-4">
|
||||||
|
<KpiTile
|
||||||
|
label="CPU usage"
|
||||||
|
value={
|
||||||
|
data.host.cpu.util_pct != null
|
||||||
|
? formatPercent(data.host.cpu.util_pct / 100)
|
||||||
|
: "—"
|
||||||
|
}
|
||||||
|
icon={<Cpu className="size-4" />}
|
||||||
|
tone={
|
||||||
|
(data.host.cpu.util_pct ?? 0) > 90
|
||||||
|
? "negative"
|
||||||
|
: (data.host.cpu.util_pct ?? 0) > 70
|
||||||
|
? "warning"
|
||||||
|
: "neutral"
|
||||||
|
}
|
||||||
|
/>
|
||||||
|
<KpiTile
|
||||||
|
label="Load avg (1m)"
|
||||||
|
value={
|
||||||
|
data.host.cpu.load_avg_1 != null
|
||||||
|
? data.host.cpu.load_avg_1.toFixed(2)
|
||||||
|
: "—"
|
||||||
|
}
|
||||||
|
icon={<Activity className="size-4" />}
|
||||||
|
tone={
|
||||||
|
data.host.cpu.load_avg_1 != null &&
|
||||||
|
data.host.cpu.num_cpus &&
|
||||||
|
data.host.cpu.load_avg_1 > data.host.cpu.num_cpus
|
||||||
|
? "warning"
|
||||||
|
: "neutral"
|
||||||
|
}
|
||||||
|
/>
|
||||||
|
<KpiTile
|
||||||
|
label="Memory used"
|
||||||
|
value={memoryUsedLabel(data.host.memory)}
|
||||||
|
icon={<HardDrive className="size-4" />}
|
||||||
|
tone={
|
||||||
|
memoryUsedPct(data.host.memory) > 90
|
||||||
|
? "negative"
|
||||||
|
: memoryUsedPct(data.host.memory) > 75
|
||||||
|
? "warning"
|
||||||
|
: "neutral"
|
||||||
|
}
|
||||||
|
/>
|
||||||
|
<KpiTile
|
||||||
|
label="Disk (busiest mount)"
|
||||||
|
value={busiestDiskLabel(data.host.disks)}
|
||||||
|
icon={<Database className="size-4" />}
|
||||||
|
tone={
|
||||||
|
busiestDiskPct(data.host.disks) > 90
|
||||||
|
? "negative"
|
||||||
|
: busiestDiskPct(data.host.disks) > 75
|
||||||
|
? "warning"
|
||||||
|
: "neutral"
|
||||||
|
}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
) : null}
|
||||||
|
|
||||||
|
<Tabs defaultValue="host">
|
||||||
<TabsList>
|
<TabsList>
|
||||||
|
<TabsTrigger value="host" data-action="monitoring-tab-host">
|
||||||
|
Host
|
||||||
|
</TabsTrigger>
|
||||||
<TabsTrigger value="jobs" data-action="monitoring-tab-jobs">
|
<TabsTrigger value="jobs" data-action="monitoring-tab-jobs">
|
||||||
Background jobs
|
Background jobs
|
||||||
</TabsTrigger>
|
</TabsTrigger>
|
||||||
@@ -301,6 +390,10 @@ export default function MonitoringRoute() {
|
|||||||
</TabsTrigger>
|
</TabsTrigger>
|
||||||
</TabsList>
|
</TabsList>
|
||||||
|
|
||||||
|
<TabsContent value="host" className="pt-4">
|
||||||
|
<HostPanel host={data.host} />
|
||||||
|
</TabsContent>
|
||||||
|
|
||||||
<TabsContent value="jobs" className="pt-4">
|
<TabsContent value="jobs" className="pt-4">
|
||||||
<JobsPanel
|
<JobsPanel
|
||||||
stats={data.jobStats}
|
stats={data.jobStats}
|
||||||
@@ -343,47 +436,292 @@ export default function MonitoringRoute() {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Synthesize a status board from the live signals we have.
|
// Map arcadia /health probe results onto the status-ui component model.
|
||||||
|
// "ok" → operational, "degraded" → degraded, "error" → major-outage,
|
||||||
|
// "unconfigured" → operational (storage with no configured backend is ok).
|
||||||
function buildStatusComponents(d: DashboardData): StatusComponent[] {
|
function buildStatusComponents(d: DashboardData): StatusComponent[] {
|
||||||
const apiOk = d.rateLimits.length > 0
|
const subsystems = d.health?.subsystems
|
||||||
const dbOk = d.sessions !== null
|
const meta: Record<HealthSubsystem, { name: string; description: string }> = {
|
||||||
const workersState: ComponentState = (() => {
|
api: { name: "API", description: "/api/v1 — auth, REST endpoints" },
|
||||||
if (!d.jobStats) return "partial-outage"
|
db: { name: "Database", description: "Postgres — sessions, audit log" },
|
||||||
const r = d.jobStats.counts.retryable ?? 0
|
workers: {
|
||||||
const x = d.jobStats.counts.discarded ?? 0
|
|
||||||
if (x > 100) return "major-outage"
|
|
||||||
if (r > 50 || x > 0) return "degraded"
|
|
||||||
return "operational"
|
|
||||||
})()
|
|
||||||
const storageState: ComponentState =
|
|
||||||
d.spaces.length > 0 || d.infraSummary ? "operational" : "partial-outage"
|
|
||||||
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
id: "api",
|
|
||||||
name: "API",
|
|
||||||
description: "/api/v1 — auth, REST endpoints",
|
|
||||||
state: apiOk ? "operational" : "partial-outage",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "db",
|
|
||||||
name: "Database",
|
|
||||||
description: "Postgres — sessions, audit log",
|
|
||||||
state: dbOk ? "operational" : "partial-outage",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "workers",
|
|
||||||
name: "Background workers",
|
name: "Background workers",
|
||||||
description: "Oban — webhook delivery, scheduled tasks",
|
description: "Oban — webhook delivery, scheduled tasks",
|
||||||
state: workersState,
|
|
||||||
},
|
},
|
||||||
{
|
storage: {
|
||||||
id: "storage",
|
|
||||||
name: "Storage",
|
name: "Storage",
|
||||||
description: "DigitalOcean Spaces / S3-compatible object storage",
|
description: "S3-compatible object storage (per platform default)",
|
||||||
state: storageState,
|
|
||||||
},
|
},
|
||||||
]
|
}
|
||||||
|
|
||||||
|
return SUBSYSTEMS.map((id) => {
|
||||||
|
const probe = subsystems?.[id]
|
||||||
|
return {
|
||||||
|
id,
|
||||||
|
name: meta[id].name,
|
||||||
|
description: probe?.message ?? meta[id].description,
|
||||||
|
state: probe ? mapHealthState(probe) : "partial-outage",
|
||||||
|
} satisfies StatusComponent
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Translates a probe verdict into a status-board component state.
 * "ok" and "unconfigured" both read as operational; any status outside the
 * known set falls back to "partial-outage".
 */
function mapHealthState(probe: SubsystemHealth): ComponentState {
  const { status } = probe
  if (status === "ok" || status === "unconfigured") return "operational"
  if (status === "degraded") return "degraded"
  if (status === "error") return "major-outage"
  return "partial-outage"
}
|
||||||
|
|
||||||
|
// --- Host panel --------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Body of the Host tab: CPU, load-average, memory/swap and per-mount disk
 * cards rendered from one host stats snapshot. Renders a stub when `host`
 * is null.
 */
function HostPanel({ host }: { host: HostStats | null }) {
  if (!host) {
    return (
      <PanelStub
        icon={<Cpu className="size-5" />}
        text="Host stats unavailable. The /api/v1/health/host endpoint may not be deployed yet, or os_mon daemons aren't reachable."
      />
    )
  }

  const { cpu, memory, disks } = host

  // CPU card labels.
  const schedulersText = `${cpu.schedulers_online} BEAM schedulers online`
  const cpuSummary = cpu.num_cpus ? `${cpu.num_cpus} cores · ${schedulersText}` : schedulersText
  const utilText = cpu.util_pct != null ? `${cpu.util_pct.toFixed(1)}%` : "—"

  // Memory card figures; each stays null when the backend did not report it.
  const usedBytes = memoryUsedBytes(memory)
  const totalBytes = memory.total_bytes ?? null
  const usedPct = memoryUsedPct(memory)
  const totalText = totalBytes != null ? `${formatBytes(totalBytes)} total` : "Total memory unknown"
  const availableText =
    memory.available_bytes != null ? ` · ${formatBytes(memory.available_bytes)} available` : ""
  const usedText =
    usedBytes != null && totalBytes != null
      ? `${formatBytes(usedBytes)} / ${formatBytes(totalBytes)} (${usedPct.toFixed(1)}%)`
      : "—"
  const hasBufferInfo = memory.buffered_bytes != null || memory.cached_bytes != null

  const swapTotalBytes = memory.swap_total_bytes ?? null
  const swapUsedBytes =
    swapTotalBytes != null && memory.swap_free_bytes != null
      ? swapTotalBytes - memory.swap_free_bytes
      : null

  return (
    <div className="flex flex-col gap-4">
      {/* CPU + load */}
      <div className="grid grid-cols-1 gap-3 lg:grid-cols-2">
        <Card>
          <CardHeader>
            <CardTitle className="text-base">CPU</CardTitle>
            <CardDescription>{cpuSummary}</CardDescription>
          </CardHeader>
          <CardContent className="flex flex-col gap-3">
            <UsageBar label="Overall utilisation" pct={cpu.util_pct ?? null} valueText={utilText} />
            {cpu.per_cpu_pct.length > 0 ? (
              <div className="flex flex-col gap-1">
                <span className="text-xs text-muted-foreground">Per core</span>
                <div className="flex flex-wrap gap-1.5">
                  {cpu.per_cpu_pct.map((corePct, core) => (
                    <div
                      key={core}
                      className="flex h-6 w-12 items-center justify-center rounded text-[11px] font-mono"
                      style={{
                        // Hard colour stop at the utilisation value: the cell doubles as a mini bar.
                        background: `linear-gradient(to right, var(--primary) ${corePct}%, var(--muted) ${corePct}%)`,
                        color: corePct > 50 ? "var(--primary-foreground)" : "var(--foreground)",
                      }}
                      title={`Core ${core}: ${corePct.toFixed(1)}%`}
                    >
                      {corePct.toFixed(0)}
                    </div>
                  ))}
                </div>
              </div>
            ) : null}
          </CardContent>
        </Card>

        <Card>
          <CardHeader>
            <CardTitle className="text-base">Load average</CardTitle>
            <CardDescription>
              Unix-style load average. A value above the core count means the
              run-queue is saturated.
            </CardDescription>
          </CardHeader>
          <CardContent>
            <div className="grid grid-cols-3 gap-3">
              <LoadAvgCell label="1 min" value={cpu.load_avg_1} cores={cpu.num_cpus} />
              <LoadAvgCell label="5 min" value={cpu.load_avg_5} cores={cpu.num_cpus} />
              <LoadAvgCell label="15 min" value={cpu.load_avg_15} cores={cpu.num_cpus} />
            </div>
          </CardContent>
        </Card>
      </div>

      {/* Memory */}
      <Card>
        <CardHeader>
          <CardTitle className="text-base">Memory</CardTitle>
          <CardDescription>
            {totalText}
            {availableText}
          </CardDescription>
        </CardHeader>
        <CardContent className="flex flex-col gap-3">
          <UsageBar label="Used" pct={usedPct} valueText={usedText} />
          {hasBufferInfo && (
            <div className="grid grid-cols-2 gap-2 text-xs text-muted-foreground">
              {memory.buffered_bytes != null && (
                <span>Buffered: {formatBytes(memory.buffered_bytes)}</span>
              )}
              {memory.cached_bytes != null && (
                <span>Cached: {formatBytes(memory.cached_bytes)}</span>
              )}
            </div>
          )}
          {/* The swap row is hidden when there is no swap configured. */}
          {swapTotalBytes != null && swapTotalBytes > 0 ? (
            <UsageBar
              label="Swap"
              pct={swapUsedBytes != null ? (swapUsedBytes / swapTotalBytes) * 100 : null}
              valueText={
                swapUsedBytes != null
                  ? `${formatBytes(swapUsedBytes)} / ${formatBytes(swapTotalBytes)}`
                  : "—"
              }
            />
          ) : null}
        </CardContent>
      </Card>

      {/* Disks */}
      <Card>
        <CardHeader>
          <CardTitle className="text-base">Disks</CardTitle>
          <CardDescription>One row per mount point.</CardDescription>
        </CardHeader>
        <CardContent>
          {disks.length === 0 ? (
            <p className="py-4 text-sm text-muted-foreground">No disks reported.</p>
          ) : (
            <div className="flex flex-col gap-2">
              {disks.map((disk) => (
                <UsageBar
                  key={disk.mount}
                  label={disk.mount}
                  pct={disk.used_pct}
                  valueText={`${disk.used_pct}% of ${formatBytes(disk.total_kb * 1024)}`}
                />
              ))}
            </div>
          )}
        </CardContent>
      </Card>
    </div>
  )
}
|
||||||
|
|
||||||
|
/**
 * Labelled horizontal meter. `pct` is clamped to 0–100 for the fill width;
 * a null `pct` draws an empty bar in the muted colour. `valueText` is shown
 * verbatim on the right.
 */
function UsageBar({
  label,
  pct,
  valueText,
}: {
  label: string
  pct: number | null
  valueText: string
}) {
  let fillWidth = 0
  let fillColor = "var(--muted-foreground)"
  if (pct != null) {
    fillWidth = Math.min(100, Math.max(0, pct))
    // The colour follows the raw value, not the clamped one.
    fillColor = barColor(pct)
  }

  return (
    <div className="flex flex-col gap-1">
      <div className="flex items-baseline justify-between gap-2 text-xs">
        <span className="font-medium">{label}</span>
        <span className="font-mono text-muted-foreground">{valueText}</span>
      </div>
      <div className="h-2 w-full overflow-hidden rounded-full bg-muted">
        <div
          className="h-full rounded-full transition-[width] duration-500"
          style={{ width: `${fillWidth}%`, background: fillColor }}
        />
      </div>
    </div>
  )
}
|
||||||
|
|
||||||
|
/**
 * One load-average figure. The number turns destructive-coloured when it
 * exceeds `cores`; the "/ N cores" footnote is shown only for a truthy
 * core count.
 */
function LoadAvgCell({
  label,
  value,
  cores,
}: {
  label: string
  value: number | null
  cores: number | null
}) {
  let overloaded = false
  if (value != null && cores != null) {
    overloaded = value > cores
  }
  const figureColor = overloaded ? "var(--destructive)" : "var(--foreground)"
  const figure = value != null ? value.toFixed(2) : "—"

  return (
    <div className="flex flex-col items-start gap-0.5 rounded-md border bg-card p-3">
      <span className="text-xs text-muted-foreground">{label}</span>
      <span
        className="font-mono text-2xl font-semibold tabular-nums"
        style={{ color: figureColor }}
      >
        {figure}
      </span>
      {cores ? (
        <span className="text-[11px] text-muted-foreground">/ {cores} cores</span>
      ) : null}
    </div>
  )
}
|
||||||
|
|
||||||
|
/**
 * Bytes in use, computed as total minus headroom and floored at zero.
 * Returns null when the total, or both headroom figures, are missing.
 */
function memoryUsedBytes(m: HostStats["memory"]): number | null {
  const total = m.total_bytes
  if (total == null) return null
  // "available" wins over "free": on Linux, free leaves out reclaimable
  // buffer/cache memory and so makes pressure look worse than it is.
  const headroom = m.available_bytes ?? m.free_bytes
  return headroom == null ? null : Math.max(0, total - headroom)
}

/** Used memory as a percentage of total; 0 when it cannot be computed. */
function memoryUsedPct(m: HostStats["memory"]): number {
  const inUse = memoryUsedBytes(m)
  const total = m.total_bytes
  const computable = inUse != null && total != null && total !== 0
  return computable ? (inUse / total) * 100 : 0
}

/** "used / total" label for the KPI tile, or an em dash when unknown. */
function memoryUsedLabel(m: HostStats["memory"]): string {
  const inUse = memoryUsedBytes(m)
  const total = m.total_bytes
  if (inUse != null && total != null) {
    return `${formatBytes(inUse)} / ${formatBytes(total)}`
  }
  return "—"
}
|
||||||
|
|
||||||
|
/** Highest `used_pct` across all mounts; 0 for an empty list. */
function busiestDiskPct(disks: HostStats["disks"]): number {
  let peak = 0
  for (const disk of disks) {
    peak = Math.max(peak, disk.used_pct)
  }
  return peak
}
|
||||||
|
|
||||||
|
/**
 * "<pct>% (<mount>)" for the fullest mount, or an em dash when no disks are
 * reported. On a tie the earliest mount in the list wins.
 */
function busiestDiskLabel(disks: HostStats["disks"]): string {
  if (disks.length === 0) return "—"
  let fullest = disks[0]
  for (let i = 1; i < disks.length; i++) {
    if (disks[i].used_pct > fullest.used_pct) fullest = disks[i]
  }
  return `${fullest.used_pct}% (${fullest.mount})`
}
|
||||||
|
|
||||||
|
/** Fill colour for a usage bar: destructive from 90%, amber from 75%, primary below. */
function barColor(pct: number): string {
  return pct >= 90 ? "var(--destructive)" : pct >= 75 ? "#f59e0b" : "var(--primary)"
}
|
||||||
|
|
||||||
// --- Jobs panel --------------------------------------------------------
|
// --- Jobs panel --------------------------------------------------------
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ import {
|
|||||||
} from "@crema/llm-providers-ui"
|
} from "@crema/llm-providers-ui"
|
||||||
import { useArcadiaClient } from "@crema/arcadia-client"
|
import { useArcadiaClient } from "@crema/arcadia-client"
|
||||||
|
|
||||||
|
import { probeProxy, type LLMProxyProvider } from "~/lib/arcadia/llm-proxy"
|
||||||
import { AppShell } from "~/components/layout/app-shell"
|
import { AppShell } from "~/components/layout/app-shell"
|
||||||
import { Button } from "~/components/ui/button"
|
import { Button } from "~/components/ui/button"
|
||||||
import {
|
import {
|
||||||
@@ -98,15 +99,15 @@ export default function SettingsRoute() {
|
|||||||
arcadiaTenantId,
|
arcadiaTenantId,
|
||||||
})
|
})
|
||||||
|
|
||||||
// In proxy mode the adapter just being built is the strongest signal we
|
// Proxy mode: round-trip a 1-token chat to verify auth → secret
|
||||||
// can get without actually firing a chat request — the proxy endpoint
|
// resolution → upstream dispatch end-to-end. Maps the contract's
|
||||||
// doesn't exist on the backend yet, so any /models probe would 404.
|
// specific error codes to user-facing messages.
|
||||||
if (s.mode === "proxy") {
|
if (s.mode === "proxy") {
|
||||||
return {
|
return probeProxy(arcadia, {
|
||||||
ok: true,
|
provider: s.providerId as LLMProxyProvider,
|
||||||
message:
|
model: s.model || (s.providerId === "anthropic" ? "claude-opus-4-7" : "gpt-4o-mini"),
|
||||||
"Adapter built. Note: the backend proxy (/api/v1/ai/llm/chat) isn't deployed yet — see docs/LLM_PROXY_CONTRACT.md.",
|
secretName: s.secretName || undefined,
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Direct mode — for OpenAI-compatible endpoints, /models is a cheap probe.
|
// Direct mode — for OpenAI-compatible endpoints, /models is a cheap probe.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# LLM Proxy Contract
|
# LLM Proxy Contract
|
||||||
|
|
||||||
> **Status: not yet implemented on the backend.** This document is the contract that `lib-llm-providers-ui` expects from arcadia. Implement `POST /api/v1/ai/llm/chat` server-side to make `mode: "proxy"` work in the client.
|
> **Status: implemented.** Backend lives in `arcadia-app` at `apps/arcadia_core/lib/arcadia/ai/llm_proxy*` (see commit `75669f1`). This document remains the contract that `lib-llm-providers-ui` and `app/lib/arcadia/llm-proxy.ts` expect from arcadia — keep it in sync if either side changes.
|
||||||
|
|
||||||
## Why a proxy?
|
## Why a proxy?
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user