Phase 4a: deployment-provisioning choreography saga
Wire a full tenant deployment as one orchestrated, compensating saga: mark → create droplet → wait active → register in inventory → link to deployment → point DNS → activate. A failure anywhere rolls the whole thing back — droplet destroyed, DNS reverted, deployment moved to cancelled. - New lifecycle state `provisioning`; deployments created via the provision path enter here and only reach `active` once the saga's ActivateDeployment step runs. - Four new steps: MarkDeploymentProvisioning (owns the deployment's failure state), LinkDeploymentResource, PointDeploymentDns, ActivateDeployment. - Provisioning.provision_deployment/2 assembles + starts the saga. - DeploymentController: POST /deployments with provision:true creates in `provisioning` and kicks the saga (202); GET /deployments/:id now returns the provisioning saga + per-step progress. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -24,6 +24,7 @@ defmodule ArcadiaCloud.Deployments do
|
||||
|
||||
# from_state => [allowed to_states]
|
||||
@transitions %{
|
||||
"provisioning" => ~w(active cancelled),
|
||||
"trial" => ~w(active cancelled),
|
||||
"active" => ~w(paused past_due cancelled),
|
||||
"paused" => ~w(active cancelled),
|
||||
|
||||
@@ -5,7 +5,7 @@ defmodule ArcadiaCloud.Deployments.CloudDeployment do
|
||||
@primary_key {:id, :binary_id, autogenerate: true}
|
||||
@foreign_key_type :binary_id
|
||||
|
||||
@states ~w(trial active past_due paused suspended cancelled archived)
|
||||
@states ~w(provisioning trial active past_due paused suspended cancelled archived)
|
||||
@llm_modes ~w(managed byo none)
|
||||
|
||||
def states, do: @states
|
||||
|
||||
@@ -148,6 +148,54 @@ defmodule ArcadiaCloud.Provisioning do
|
||||
})
|
||||
end
|
||||
|
||||
alias ArcadiaCloud.Provisioning.Steps
|
||||
|
||||
@doc """
|
||||
Assembles + starts the full deployment-provisioning choreography saga
|
||||
for a deployment that was created in the `provisioning` state.
|
||||
|
||||
Steps: mark → create droplet → wait active → register in inventory →
|
||||
link to deployment → point DNS → activate. A failure anywhere rolls
|
||||
the whole thing back (droplet destroyed, DNS reverted, deployment
|
||||
moved to `cancelled`).
|
||||
|
||||
Required opts: :size, :image. Optional: :region (falls back to the
|
||||
deployment's region), :dns_domain, :dns_record_name (falls back to the
|
||||
deployment slug), :triggered_by.
|
||||
"""
|
||||
def provision_deployment(deployment, opts \\ []) do
|
||||
region = opts[:region] || deployment.region
|
||||
|
||||
inputs = %{
|
||||
droplet_name: opts[:droplet_name] || "dep-#{deployment.slug}",
|
||||
droplet_region: region,
|
||||
droplet_size: opts[:size],
|
||||
droplet_image: opts[:image],
|
||||
droplet_tags: [
|
||||
"deployment:#{deployment.id}",
|
||||
"tenant:#{deployment.tenant_id}"
|
||||
],
|
||||
dns_domain: opts[:dns_domain],
|
||||
dns_record_name: opts[:dns_record_name] || deployment.slug
|
||||
}
|
||||
|
||||
start_saga(%{
|
||||
kind: "provision",
|
||||
deployment_id: deployment.id,
|
||||
triggered_by: opts[:triggered_by] || "manual",
|
||||
step_modules: [
|
||||
Steps.MarkDeploymentProvisioning,
|
||||
Steps.CreateDroplet,
|
||||
Steps.WaitDropletActive,
|
||||
Steps.RegisterDroplet,
|
||||
Steps.LinkDeploymentResource,
|
||||
Steps.PointDeploymentDns,
|
||||
Steps.ActivateDeployment
|
||||
],
|
||||
inputs: inputs
|
||||
})
|
||||
end
|
||||
|
||||
def get_saga(id), do: Repo.get(SagaRun, id)
|
||||
def get_saga!(id), do: Repo.get!(SagaRun, id)
|
||||
|
||||
|
||||
48
lib/arcadia_cloud/provisioning/steps/activate_deployment.ex
Normal file
48
lib/arcadia_cloud/provisioning/steps/activate_deployment.ex
Normal file
@@ -0,0 +1,48 @@
|
||||
defmodule ArcadiaCloud.Provisioning.Steps.ActivateDeployment do
|
||||
@moduledoc """
|
||||
Final step of the deployment-provisioning choreography: moves the
|
||||
deployment from `provisioning` to `active` now that its infra is up,
|
||||
registered, linked, and reachable by DNS.
|
||||
|
||||
Idempotent: `transition_state/3` returns `{:ok, deployment}` when the
|
||||
deployment is already `active`.
|
||||
|
||||
No compensation: this is the last step, so the saga only ever rolls
|
||||
back from a step BEFORE this one — meaning this step never ran and the
|
||||
deployment is still `provisioning` (cancelled by
|
||||
MarkDeploymentProvisioning's compensate).
|
||||
"""
|
||||
|
||||
@behaviour ArcadiaCloud.Provisioning.Step
|
||||
|
||||
alias ArcadiaCloud.Deployments
|
||||
|
||||
@impl true
|
||||
def name, do: "activate_deployment"
|
||||
|
||||
@impl true
|
||||
def execute(state) do
|
||||
with {:ok, deployment} <- fetch_deployment(state) do
|
||||
case Deployments.transition_state(deployment, "active",
|
||||
reason: "provisioning_complete",
|
||||
actor: "saga:#{state.saga_id}"
|
||||
) do
|
||||
{:ok, _} -> {:ok, state}
|
||||
{:error, reason} -> {:error, {:activate_failed, reason}}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
defp fetch_deployment(state) do
|
||||
case state.saga && state.saga.deployment_id do
|
||||
nil ->
|
||||
{:error, :saga_has_no_deployment}
|
||||
|
||||
id ->
|
||||
case Deployments.get_deployment(id) do
|
||||
nil -> {:error, :deployment_not_found}
|
||||
deployment -> {:ok, deployment}
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -0,0 +1,80 @@
|
||||
defmodule ArcadiaCloud.Provisioning.Steps.LinkDeploymentResource do
|
||||
@moduledoc """
|
||||
Attaches the freshly-registered cloud resource (set by RegisterDroplet
|
||||
as `cloud_resource_id` in context) to the saga's deployment: stamps
|
||||
`deployment_id` and `tenant_id` onto the `cloud_resources` row.
|
||||
|
||||
This is what makes the resource show up under the deployment in
|
||||
inventory and bill against the right tenant.
|
||||
|
||||
Idempotent: re-running just re-writes the same two columns.
|
||||
|
||||
Compensation: clears `deployment_id` and `tenant_id` back to nil. The
|
||||
resource row itself (and the droplet) are undone by RegisterDroplet /
|
||||
CreateDroplet compensation.
|
||||
"""
|
||||
|
||||
@behaviour ArcadiaCloud.Provisioning.Step
|
||||
|
||||
import Ecto.Query
|
||||
|
||||
alias ArcadiaCloud.Deployments
|
||||
alias ArcadiaCloud.Cloud.CloudResource
|
||||
alias ArcadiaCloud.Provisioning.SagaState
|
||||
alias ArcadiaCloud.Repo
|
||||
|
||||
@impl true
|
||||
def name, do: "link_deployment_resource"
|
||||
|
||||
@impl true
|
||||
def execute(state) do
|
||||
with {:ok, resource_id} <- fetch(state, :cloud_resource_id),
|
||||
{:ok, deployment} <- fetch_deployment(state) do
|
||||
{_, _} =
|
||||
from(r in CloudResource, where: r.id == ^resource_id)
|
||||
|> Repo.update_all(
|
||||
set: [
|
||||
deployment_id: deployment.id,
|
||||
tenant_id: deployment.tenant_id,
|
||||
updated_at: DateTime.utc_now() |> DateTime.truncate(:second)
|
||||
]
|
||||
)
|
||||
|
||||
{:ok, state}
|
||||
end
|
||||
end
|
||||
|
||||
@impl true
|
||||
def compensate(state) do
|
||||
case SagaState.get_output(state, :cloud_resource_id) do
|
||||
nil ->
|
||||
:ok
|
||||
|
||||
resource_id ->
|
||||
from(r in CloudResource, where: r.id == ^resource_id)
|
||||
|> Repo.update_all(set: [deployment_id: nil, tenant_id: nil])
|
||||
|
||||
:ok
|
||||
end
|
||||
end
|
||||
|
||||
defp fetch(state, key) do
|
||||
case SagaState.get_output(state, key) do
|
||||
nil -> {:error, {:missing_context, key}}
|
||||
value -> {:ok, value}
|
||||
end
|
||||
end
|
||||
|
||||
defp fetch_deployment(state) do
|
||||
case state.saga && state.saga.deployment_id do
|
||||
nil ->
|
||||
{:error, :saga_has_no_deployment}
|
||||
|
||||
id ->
|
||||
case Deployments.get_deployment(id) do
|
||||
nil -> {:error, :deployment_not_found}
|
||||
deployment -> {:ok, deployment}
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -0,0 +1,57 @@
|
||||
defmodule ArcadiaCloud.Provisioning.Steps.MarkDeploymentProvisioning do
|
||||
@moduledoc """
|
||||
First step of the deployment-provisioning choreography.
|
||||
|
||||
Forward: a no-op — `Provisioning.provision_deployment/2` already
|
||||
created the deployment row in the `provisioning` state. Having this
|
||||
step at index 0 gives the saga a compensation hook that owns the
|
||||
deployment's failure state: if ANY later step fails, the runner walks
|
||||
compensation back to here and we move the deployment to `cancelled`.
|
||||
|
||||
Without this step a mid-saga failure would leave the deployment stuck
|
||||
in `provisioning` forever.
|
||||
"""
|
||||
|
||||
@behaviour ArcadiaCloud.Provisioning.Step
|
||||
|
||||
require Logger
|
||||
|
||||
alias ArcadiaCloud.Deployments
|
||||
|
||||
@impl true
|
||||
def name, do: "mark_deployment_provisioning"
|
||||
|
||||
@impl true
|
||||
def execute(state), do: {:ok, state}
|
||||
|
||||
@impl true
|
||||
def compensate(state) do
|
||||
case deployment(state) do
|
||||
nil ->
|
||||
:ok
|
||||
|
||||
deployment ->
|
||||
case Deployments.transition_state(deployment, "cancelled",
|
||||
reason: "provision_failed",
|
||||
actor: "saga:#{state.saga_id}"
|
||||
) do
|
||||
{:ok, _} ->
|
||||
:ok
|
||||
|
||||
{:error, reason} ->
|
||||
Logger.warning(
|
||||
"[saga #{state.saga_id}] could not cancel deployment on rollback: #{inspect(reason)}"
|
||||
)
|
||||
|
||||
:ok
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
defp deployment(state) do
|
||||
case state.saga && state.saga.deployment_id do
|
||||
nil -> nil
|
||||
id -> Deployments.get_deployment(id)
|
||||
end
|
||||
end
|
||||
end
|
||||
161
lib/arcadia_cloud/provisioning/steps/point_deployment_dns.ex
Normal file
161
lib/arcadia_cloud/provisioning/steps/point_deployment_dns.ex
Normal file
@@ -0,0 +1,161 @@
|
||||
defmodule ArcadiaCloud.Provisioning.Steps.PointDeploymentDns do
|
||||
@moduledoc """
|
||||
Points a deployment's hostname at its droplet by upserting an A record.
|
||||
|
||||
Reads the droplet's public IPv4 from context (`droplet_public_ip`, set
|
||||
by WaitDropletActive) — the IP isn't known at saga-start, so this can't
|
||||
be a generic UpsertDnsRecord with the data baked into inputs.
|
||||
|
||||
Saga inputs:
|
||||
dns_domain — the zone (e.g. "sky-ai.com"); when absent the
|
||||
step is a no-op (deployment opted out of DNS)
|
||||
dns_record_name — subdomain; defaults handled by the assembler
|
||||
dns_record_ttl — optional, defaults 1800
|
||||
|
||||
Behaviour mirrors UpsertDnsRecord: create / no-op / update, with the
|
||||
prior record stashed for compensation.
|
||||
"""
|
||||
|
||||
@behaviour ArcadiaCloud.Provisioning.Step
|
||||
|
||||
alias ArcadiaCloud.DigitalOcean.Client
|
||||
alias ArcadiaCloud.Provisioning.SagaState
|
||||
|
||||
@default_ttl 1800
|
||||
@type_a "A"
|
||||
|
||||
@impl true
|
||||
def name, do: "point_deployment_dns"
|
||||
|
||||
@impl true
|
||||
def execute(state) do
|
||||
domain = SagaState.get_input(state, :dns_domain)
|
||||
rname = SagaState.get_input(state, :dns_record_name)
|
||||
ip = SagaState.get_output(state, :droplet_public_ip)
|
||||
ttl = SagaState.get_input(state, :dns_record_ttl) || @default_ttl
|
||||
|
||||
cond do
|
||||
is_nil(domain) or domain == "" ->
|
||||
{:ok, SagaState.put_output(state, :dns_outcome, "skipped")}
|
||||
|
||||
is_nil(rname) or rname == "" ->
|
||||
{:error, :missing_dns_record_name}
|
||||
|
||||
is_nil(ip) ->
|
||||
{:error, :no_droplet_public_ip_in_context}
|
||||
|
||||
true ->
|
||||
upsert(state, domain, rname, ip, ttl)
|
||||
end
|
||||
end
|
||||
|
||||
@impl true
|
||||
def compensate(state) do
|
||||
domain = SagaState.get_input(state, :dns_domain)
|
||||
outcome = SagaState.get_output(state, :dns_outcome)
|
||||
record_id = SagaState.get_output(state, :dns_record_id)
|
||||
|
||||
case outcome do
|
||||
"created" -> delete_record(domain, record_id)
|
||||
"updated" -> restore_record(state, domain, record_id)
|
||||
_ -> :ok
|
||||
end
|
||||
end
|
||||
|
||||
# ---- execute helpers ------------------------------------------------------
|
||||
|
||||
defp upsert(state, domain, rname, ip, ttl) do
|
||||
case find_record(domain, rname) do
|
||||
{:ok, nil} ->
|
||||
create(state, domain, rname, ip, ttl)
|
||||
|
||||
{:ok, existing} ->
|
||||
if normalize(existing["data"]) == normalize(ip) and existing["ttl"] == ttl do
|
||||
{:ok, record_outcome(state, existing["id"], "noop", nil)}
|
||||
else
|
||||
update(state, domain, existing, rname, ip, ttl)
|
||||
end
|
||||
|
||||
{:error, reason} ->
|
||||
{:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp create(state, domain, rname, ip, ttl) do
|
||||
case Client.create_domain_record(domain, %{type: @type_a, name: rname, data: ip, ttl: ttl}) do
|
||||
{:ok, %{"id" => id}} -> {:ok, record_outcome(state, id, "created", nil)}
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp update(state, domain, existing, rname, ip, ttl) do
|
||||
prior = %{
|
||||
"type" => existing["type"],
|
||||
"name" => existing["name"],
|
||||
"data" => existing["data"],
|
||||
"ttl" => existing["ttl"]
|
||||
}
|
||||
|
||||
case Client.update_domain_record(domain, existing["id"], %{
|
||||
type: @type_a,
|
||||
name: rname,
|
||||
data: ip,
|
||||
ttl: ttl
|
||||
}) do
|
||||
{:ok, %{"id" => id}} -> {:ok, record_outcome(state, id, "updated", prior)}
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp record_outcome(state, record_id, outcome, prior) do
|
||||
state
|
||||
|> SagaState.put_output(:dns_record_id, record_id)
|
||||
|> SagaState.put_output(:dns_outcome, outcome)
|
||||
|> SagaState.put_output(:dns_prior, prior)
|
||||
end
|
||||
|
||||
# ---- compensate helpers ---------------------------------------------------
|
||||
|
||||
defp delete_record(domain, record_id) do
|
||||
case Client.delete_domain_record(domain, record_id) do
|
||||
{:ok, _} -> :ok
|
||||
{:error, {:http, 404, _}} -> :ok
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp restore_record(state, domain, record_id) do
|
||||
case SagaState.get_output(state, :dns_prior) do
|
||||
nil ->
|
||||
:ok
|
||||
|
||||
prior ->
|
||||
case Client.update_domain_record(domain, record_id, %{
|
||||
type: prior["type"],
|
||||
name: prior["name"],
|
||||
data: prior["data"],
|
||||
ttl: prior["ttl"]
|
||||
}) do
|
||||
{:ok, _} -> :ok
|
||||
{:error, {:http, 404, _}} -> :ok
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# ---- shared ---------------------------------------------------------------
|
||||
|
||||
defp find_record(domain, rname) do
|
||||
case Client.list_domain_records(domain) do
|
||||
{:ok, records} ->
|
||||
{:ok, Enum.find(records, &(&1["type"] == @type_a and &1["name"] == rname))}
|
||||
|
||||
{:error, reason} ->
|
||||
{:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp normalize(nil), do: ""
|
||||
defp normalize(v) when is_binary(v), do: v |> String.trim() |> String.trim_trailing(".")
|
||||
defp normalize(v), do: to_string(v)
|
||||
end
|
||||
Reference in New Issue
Block a user