Phase 4a: deployment-provisioning choreography saga

Wire a full tenant deployment as one orchestrated, compensating saga:
mark → create droplet → wait active → register in inventory → link to
deployment → point DNS → activate. A failure anywhere rolls the whole
thing back — droplet destroyed, DNS reverted, deployment moved to
cancelled.

- New lifecycle state `provisioning`; deployments created via the
  provision path enter here and only reach `active` once the saga's
  ActivateDeployment step runs.
- Four new steps: MarkDeploymentProvisioning (owns the deployment's
  failure state), LinkDeploymentResource, PointDeploymentDns,
  ActivateDeployment.
- Provisioning.provision_deployment/2 assembles + starts the saga.
- DeploymentController: POST /deployments with provision:true creates
  in `provisioning` and kicks the saga (202); GET /deployments/:id now
  returns the provisioning saga + per-step progress.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 20:44:49 +10:00
parent c10b847324
commit 29f4ad97d6
8 changed files with 480 additions and 8 deletions

View File

@@ -24,6 +24,7 @@ defmodule ArcadiaCloud.Deployments do
# from_state => [allowed to_states] # from_state => [allowed to_states]
@transitions %{ @transitions %{
"provisioning" => ~w(active cancelled),
"trial" => ~w(active cancelled), "trial" => ~w(active cancelled),
"active" => ~w(paused past_due cancelled), "active" => ~w(paused past_due cancelled),
"paused" => ~w(active cancelled), "paused" => ~w(active cancelled),

View File

@@ -5,7 +5,7 @@ defmodule ArcadiaCloud.Deployments.CloudDeployment do
@primary_key {:id, :binary_id, autogenerate: true} @primary_key {:id, :binary_id, autogenerate: true}
@foreign_key_type :binary_id @foreign_key_type :binary_id
@states ~w(trial active past_due paused suspended cancelled archived) @states ~w(provisioning trial active past_due paused suspended cancelled archived)
@llm_modes ~w(managed byo none) @llm_modes ~w(managed byo none)
def states, do: @states def states, do: @states

View File

@@ -148,6 +148,54 @@ defmodule ArcadiaCloud.Provisioning do
}) })
end end
alias ArcadiaCloud.Provisioning.Steps
@doc """
Assembles + starts the full deployment-provisioning choreography saga
for a deployment that was created in the `provisioning` state.
Steps: mark → create droplet → wait active → register in inventory →
link to deployment → point DNS → activate. A failure anywhere rolls
the whole thing back (droplet destroyed, DNS reverted, deployment
moved to `cancelled`).
Required opts: :size, :image. Optional: :region (falls back to the
deployment's region), :dns_domain, :dns_record_name (falls back to the
deployment slug), :triggered_by.
"""
def provision_deployment(deployment, opts \\ []) do
region = opts[:region] || deployment.region
inputs = %{
droplet_name: opts[:droplet_name] || "dep-#{deployment.slug}",
droplet_region: region,
droplet_size: opts[:size],
droplet_image: opts[:image],
droplet_tags: [
"deployment:#{deployment.id}",
"tenant:#{deployment.tenant_id}"
],
dns_domain: opts[:dns_domain],
dns_record_name: opts[:dns_record_name] || deployment.slug
}
start_saga(%{
kind: "provision",
deployment_id: deployment.id,
triggered_by: opts[:triggered_by] || "manual",
step_modules: [
Steps.MarkDeploymentProvisioning,
Steps.CreateDroplet,
Steps.WaitDropletActive,
Steps.RegisterDroplet,
Steps.LinkDeploymentResource,
Steps.PointDeploymentDns,
Steps.ActivateDeployment
],
inputs: inputs
})
end
def get_saga(id), do: Repo.get(SagaRun, id) def get_saga(id), do: Repo.get(SagaRun, id)
def get_saga!(id), do: Repo.get!(SagaRun, id) def get_saga!(id), do: Repo.get!(SagaRun, id)

View File

@@ -0,0 +1,48 @@
defmodule ArcadiaCloud.Provisioning.Steps.ActivateDeployment do
@moduledoc """
Final step of the deployment-provisioning choreography: moves the
deployment from `provisioning` to `active` now that its infra is up,
registered, linked, and reachable by DNS.
Idempotent: `transition_state/3` returns `{:ok, deployment}` when the
deployment is already `active`.
No compensation: this is the last step, so the saga only ever rolls
back from a step BEFORE this one — meaning this step never ran and the
deployment is still `provisioning` (cancelled by
MarkDeploymentProvisioning's compensate).
"""
@behaviour ArcadiaCloud.Provisioning.Step
alias ArcadiaCloud.Deployments
@impl true
def name, do: "activate_deployment"
@impl true
def execute(state) do
with {:ok, deployment} <- fetch_deployment(state) do
case Deployments.transition_state(deployment, "active",
reason: "provisioning_complete",
actor: "saga:#{state.saga_id}"
) do
{:ok, _} -> {:ok, state}
{:error, reason} -> {:error, {:activate_failed, reason}}
end
end
end
defp fetch_deployment(state) do
case state.saga && state.saga.deployment_id do
nil ->
{:error, :saga_has_no_deployment}
id ->
case Deployments.get_deployment(id) do
nil -> {:error, :deployment_not_found}
deployment -> {:ok, deployment}
end
end
end
end

View File

@@ -0,0 +1,80 @@
defmodule ArcadiaCloud.Provisioning.Steps.LinkDeploymentResource do
@moduledoc """
Attaches the freshly-registered cloud resource (set by RegisterDroplet
as `cloud_resource_id` in context) to the saga's deployment: stamps
`deployment_id` and `tenant_id` onto the `cloud_resources` row.
This is what makes the resource show up under the deployment in
inventory and bill against the right tenant.
Idempotent: re-running just re-writes the same two columns.
Compensation: clears `deployment_id` and `tenant_id` back to nil. The
resource row itself (and the droplet) are undone by RegisterDroplet /
CreateDroplet compensation.
"""
@behaviour ArcadiaCloud.Provisioning.Step
import Ecto.Query
alias ArcadiaCloud.Deployments
alias ArcadiaCloud.Cloud.CloudResource
alias ArcadiaCloud.Provisioning.SagaState
alias ArcadiaCloud.Repo
@impl true
def name, do: "link_deployment_resource"
@impl true
def execute(state) do
with {:ok, resource_id} <- fetch(state, :cloud_resource_id),
{:ok, deployment} <- fetch_deployment(state) do
{_, _} =
from(r in CloudResource, where: r.id == ^resource_id)
|> Repo.update_all(
set: [
deployment_id: deployment.id,
tenant_id: deployment.tenant_id,
updated_at: DateTime.utc_now() |> DateTime.truncate(:second)
]
)
{:ok, state}
end
end
@impl true
def compensate(state) do
case SagaState.get_output(state, :cloud_resource_id) do
nil ->
:ok
resource_id ->
from(r in CloudResource, where: r.id == ^resource_id)
|> Repo.update_all(set: [deployment_id: nil, tenant_id: nil])
:ok
end
end
defp fetch(state, key) do
case SagaState.get_output(state, key) do
nil -> {:error, {:missing_context, key}}
value -> {:ok, value}
end
end
defp fetch_deployment(state) do
case state.saga && state.saga.deployment_id do
nil ->
{:error, :saga_has_no_deployment}
id ->
case Deployments.get_deployment(id) do
nil -> {:error, :deployment_not_found}
deployment -> {:ok, deployment}
end
end
end
end

View File

@@ -0,0 +1,57 @@
defmodule ArcadiaCloud.Provisioning.Steps.MarkDeploymentProvisioning do
@moduledoc """
First step of the deployment-provisioning choreography.
Forward: a no-op — `Provisioning.provision_deployment/2` already
created the deployment row in the `provisioning` state. Having this
step at index 0 gives the saga a compensation hook that owns the
deployment's failure state: if ANY later step fails, the runner walks
compensation back to here and we move the deployment to `cancelled`.
Without this step a mid-saga failure would leave the deployment stuck
in `provisioning` forever.
"""
@behaviour ArcadiaCloud.Provisioning.Step
require Logger
alias ArcadiaCloud.Deployments
@impl true
def name, do: "mark_deployment_provisioning"
@impl true
def execute(state), do: {:ok, state}
@impl true
def compensate(state) do
case deployment(state) do
nil ->
:ok
deployment ->
case Deployments.transition_state(deployment, "cancelled",
reason: "provision_failed",
actor: "saga:#{state.saga_id}"
) do
{:ok, _} ->
:ok
{:error, reason} ->
Logger.warning(
"[saga #{state.saga_id}] could not cancel deployment on rollback: #{inspect(reason)}"
)
:ok
end
end
end
defp deployment(state) do
case state.saga && state.saga.deployment_id do
nil -> nil
id -> Deployments.get_deployment(id)
end
end
end

View File

@@ -0,0 +1,161 @@
defmodule ArcadiaCloud.Provisioning.Steps.PointDeploymentDns do
@moduledoc """
Points a deployment's hostname at its droplet by upserting an A record.
Reads the droplet's public IPv4 from context (`droplet_public_ip`, set
by WaitDropletActive) — the IP isn't known at saga-start, so this can't
be a generic UpsertDnsRecord with the data baked into inputs.
Saga inputs:
dns_domain — the zone (e.g. "sky-ai.com"); when absent the
step is a no-op (deployment opted out of DNS)
dns_record_name — subdomain; defaults handled by the assembler
dns_record_ttl — optional, defaults 1800
Behaviour mirrors UpsertDnsRecord: create / no-op / update, with the
prior record stashed for compensation.
"""
@behaviour ArcadiaCloud.Provisioning.Step
alias ArcadiaCloud.DigitalOcean.Client
alias ArcadiaCloud.Provisioning.SagaState
@default_ttl 1800
@type_a "A"
@impl true
def name, do: "point_deployment_dns"
@impl true
def execute(state) do
domain = SagaState.get_input(state, :dns_domain)
rname = SagaState.get_input(state, :dns_record_name)
ip = SagaState.get_output(state, :droplet_public_ip)
ttl = SagaState.get_input(state, :dns_record_ttl) || @default_ttl
cond do
is_nil(domain) or domain == "" ->
{:ok, SagaState.put_output(state, :dns_outcome, "skipped")}
is_nil(rname) or rname == "" ->
{:error, :missing_dns_record_name}
is_nil(ip) ->
{:error, :no_droplet_public_ip_in_context}
true ->
upsert(state, domain, rname, ip, ttl)
end
end
@impl true
def compensate(state) do
domain = SagaState.get_input(state, :dns_domain)
outcome = SagaState.get_output(state, :dns_outcome)
record_id = SagaState.get_output(state, :dns_record_id)
case outcome do
"created" -> delete_record(domain, record_id)
"updated" -> restore_record(state, domain, record_id)
_ -> :ok
end
end
# ---- execute helpers ------------------------------------------------------
defp upsert(state, domain, rname, ip, ttl) do
case find_record(domain, rname) do
{:ok, nil} ->
create(state, domain, rname, ip, ttl)
{:ok, existing} ->
if normalize(existing["data"]) == normalize(ip) and existing["ttl"] == ttl do
{:ok, record_outcome(state, existing["id"], "noop", nil)}
else
update(state, domain, existing, rname, ip, ttl)
end
{:error, reason} ->
{:error, reason}
end
end
defp create(state, domain, rname, ip, ttl) do
case Client.create_domain_record(domain, %{type: @type_a, name: rname, data: ip, ttl: ttl}) do
{:ok, %{"id" => id}} -> {:ok, record_outcome(state, id, "created", nil)}
{:error, reason} -> {:error, reason}
end
end
defp update(state, domain, existing, rname, ip, ttl) do
prior = %{
"type" => existing["type"],
"name" => existing["name"],
"data" => existing["data"],
"ttl" => existing["ttl"]
}
case Client.update_domain_record(domain, existing["id"], %{
type: @type_a,
name: rname,
data: ip,
ttl: ttl
}) do
{:ok, %{"id" => id}} -> {:ok, record_outcome(state, id, "updated", prior)}
{:error, reason} -> {:error, reason}
end
end
defp record_outcome(state, record_id, outcome, prior) do
state
|> SagaState.put_output(:dns_record_id, record_id)
|> SagaState.put_output(:dns_outcome, outcome)
|> SagaState.put_output(:dns_prior, prior)
end
# ---- compensate helpers ---------------------------------------------------
defp delete_record(domain, record_id) do
case Client.delete_domain_record(domain, record_id) do
{:ok, _} -> :ok
{:error, {:http, 404, _}} -> :ok
{:error, reason} -> {:error, reason}
end
end
defp restore_record(state, domain, record_id) do
case SagaState.get_output(state, :dns_prior) do
nil ->
:ok
prior ->
case Client.update_domain_record(domain, record_id, %{
type: prior["type"],
name: prior["name"],
data: prior["data"],
ttl: prior["ttl"]
}) do
{:ok, _} -> :ok
{:error, {:http, 404, _}} -> :ok
{:error, reason} -> {:error, reason}
end
end
end
# ---- shared ---------------------------------------------------------------
defp find_record(domain, rname) do
case Client.list_domain_records(domain) do
{:ok, records} ->
{:ok, Enum.find(records, &(&1["type"] == @type_a and &1["name"] == rname))}
{:error, reason} ->
{:error, reason}
end
end
defp normalize(nil), do: ""
defp normalize(v) when is_binary(v), do: v |> String.trim() |> String.trim_trailing(".")
defp normalize(v), do: to_string(v)
end

View File

@@ -8,7 +8,7 @@ defmodule ArcadiaCloudWeb.DeploymentController do
use ArcadiaCloudWeb, :controller use ArcadiaCloudWeb, :controller
alias ArcadiaCloud.{Catalog, Deployments, Subscriptions} alias ArcadiaCloud.{Catalog, Deployments, Provisioning, Subscriptions}
def index(conn, params) do def index(conn, params) do
identity = conn.assigns.current_identity identity = conn.assigns.current_identity
@@ -30,7 +30,8 @@ defmodule ArcadiaCloudWeb.DeploymentController do
json(conn, %{ json(conn, %{
deployment: shape(deployment), deployment: shape(deployment),
subscription: shape_subscription(sub), subscription: shape_subscription(sub),
events: Enum.map(events, &shape_event/1) events: Enum.map(events, &shape_event/1),
provisioning: shape_provisioning(deployment.id)
}) })
else else
{:halt, conn} -> conn {:halt, conn} -> conn
@@ -52,6 +53,9 @@ defmodule ArcadiaCloudWeb.DeploymentController do
|> Map.put("tenant_id", tenant_id) |> Map.put("tenant_id", tenant_id)
|> Map.put("actor", identity.email) |> Map.put("actor", identity.email)
if truthy(params["provision"]) do
create_and_provision(conn, params, attrs, identity)
else
case Deployments.create_deployment(attrs) do case Deployments.create_deployment(attrs) do
{:ok, deployment} -> {:ok, deployment} ->
conn |> put_status(:created) |> json(%{deployment: shape(deployment)}) conn |> put_status(:created) |> json(%{deployment: shape(deployment)})
@@ -60,6 +64,48 @@ defmodule ArcadiaCloudWeb.DeploymentController do
conn |> put_status(:unprocessable_entity) |> json(%{error: errors(changeset)}) conn |> put_status(:unprocessable_entity) |> json(%{error: errors(changeset)})
end end
end end
end
# Creates the deployment row in `provisioning` and kicks off the
# choreography saga. The deployment only reaches `active` once the
# saga's ActivateDeployment step runs; a saga failure rolls it to
# `cancelled`. Returns 202 — provisioning is asynchronous.
defp create_and_provision(conn, params, attrs, identity) do
size = params["size"]
image = params["image"]
cond do
is_nil(size) or is_nil(image) ->
conn
|> put_status(:unprocessable_entity)
|> json(%{error: "size and image are required to provision"})
true ->
case Deployments.create_deployment(Map.put(attrs, "state", "provisioning")) do
{:ok, deployment} ->
{:ok, saga} =
Provisioning.provision_deployment(deployment,
size: size,
image: image,
region: params["region"] || deployment.region,
dns_domain: params["dns_domain"],
dns_record_name: params["dns_record_name"],
triggered_by: identity.email
)
conn
|> put_status(:accepted)
|> json(%{deployment: shape(deployment), saga_id: saga.id})
{:error, changeset} ->
conn |> put_status(:unprocessable_entity) |> json(%{error: errors(changeset)})
end
end
end
defp truthy(true), do: true
defp truthy("true"), do: true
defp truthy(_), do: false
def transition(conn, %{"id" => id, "to_state" => to_state} = params) do def transition(conn, %{"id" => id, "to_state" => to_state} = params) do
with {:ok, deployment} <- fetch_scoped(conn, id) do with {:ok, deployment} <- fetch_scoped(conn, id) do
@@ -166,6 +212,37 @@ defmodule ArcadiaCloudWeb.DeploymentController do
} }
end end
defp shape_provisioning(deployment_id) do
case Provisioning.list_sagas(deployment_id: deployment_id, limit: 1) do
[saga | _] ->
%{
saga_id: saga.id,
kind: saga.kind,
status: saga.status,
current_step_idx: saga.current_step_idx,
started_at: saga.started_at,
completed_at: saga.completed_at,
error: saga.error,
steps:
saga.id
|> Provisioning.list_step_results()
|> Enum.map(fn r ->
%{
step_idx: r.step_idx,
step_name: r.step_name,
status: r.status,
error: r.error,
started_at: r.started_at,
completed_at: r.completed_at
}
end)
}
[] ->
nil
end
end
defp shape_event(e) do defp shape_event(e) do
%{ %{
from_state: e.from_state, from_state: e.from_state,