diff --git a/lib/arcadia_cloud/deployments.ex b/lib/arcadia_cloud/deployments.ex index 0bf5bce..7cab6a0 100644 --- a/lib/arcadia_cloud/deployments.ex +++ b/lib/arcadia_cloud/deployments.ex @@ -24,6 +24,7 @@ defmodule ArcadiaCloud.Deployments do # from_state => [allowed to_states] @transitions %{ + "provisioning" => ~w(active cancelled), "trial" => ~w(active cancelled), "active" => ~w(paused past_due cancelled), "paused" => ~w(active cancelled), diff --git a/lib/arcadia_cloud/deployments/cloud_deployment.ex b/lib/arcadia_cloud/deployments/cloud_deployment.ex index b02da5f..22c1c18 100644 --- a/lib/arcadia_cloud/deployments/cloud_deployment.ex +++ b/lib/arcadia_cloud/deployments/cloud_deployment.ex @@ -5,7 +5,7 @@ defmodule ArcadiaCloud.Deployments.CloudDeployment do @primary_key {:id, :binary_id, autogenerate: true} @foreign_key_type :binary_id - @states ~w(trial active past_due paused suspended cancelled archived) + @states ~w(provisioning trial active past_due paused suspended cancelled archived) @llm_modes ~w(managed byo none) def states, do: @states diff --git a/lib/arcadia_cloud/provisioning.ex b/lib/arcadia_cloud/provisioning.ex index 9d31b8a..553a83f 100644 --- a/lib/arcadia_cloud/provisioning.ex +++ b/lib/arcadia_cloud/provisioning.ex @@ -148,6 +148,54 @@ defmodule ArcadiaCloud.Provisioning do }) end + alias ArcadiaCloud.Provisioning.Steps + + @doc """ + Assembles + starts the full deployment-provisioning choreography saga + for a deployment that was created in the `provisioning` state. + + Steps: mark → create droplet → wait active → register in inventory → + link to deployment → point DNS → activate. A failure anywhere rolls + the whole thing back (droplet destroyed, DNS reverted, deployment + moved to `cancelled`). + + Required opts: :size, :image. Optional: :region (falls back to the + deployment's region), :dns_domain, :dns_record_name (falls back to the + deployment slug), :triggered_by. + """ + def provision_deployment(deployment, opts \\ []) do + region = opts[:region] || deployment.region + + inputs = %{ + droplet_name: opts[:droplet_name] || "dep-#{deployment.slug}", + droplet_region: region, + droplet_size: opts[:size], + droplet_image: opts[:image], + droplet_tags: [ + "deployment:#{deployment.id}", + "tenant:#{deployment.tenant_id}" + ], + dns_domain: opts[:dns_domain], + dns_record_name: opts[:dns_record_name] || deployment.slug + } + + start_saga(%{ + kind: "provision", + deployment_id: deployment.id, + triggered_by: opts[:triggered_by] || "manual", + step_modules: [ + Steps.MarkDeploymentProvisioning, + Steps.CreateDroplet, + Steps.WaitDropletActive, + Steps.RegisterDroplet, + Steps.LinkDeploymentResource, + Steps.PointDeploymentDns, + Steps.ActivateDeployment + ], + inputs: inputs + }) + end + def get_saga(id), do: Repo.get(SagaRun, id) def get_saga!(id), do: Repo.get!(SagaRun, id) diff --git a/lib/arcadia_cloud/provisioning/steps/activate_deployment.ex b/lib/arcadia_cloud/provisioning/steps/activate_deployment.ex new file mode 100644 index 0000000..6f0766d --- /dev/null +++ b/lib/arcadia_cloud/provisioning/steps/activate_deployment.ex @@ -0,0 +1,48 @@ +defmodule ArcadiaCloud.Provisioning.Steps.ActivateDeployment do + @moduledoc """ + Final step of the deployment-provisioning choreography: moves the + deployment from `provisioning` to `active` now that its infra is up, + registered, linked, and reachable by DNS. + + Idempotent: `transition_state/3` returns `{:ok, deployment}` when the + deployment is already `active`. + + No compensation: this is the last step, so the saga only ever rolls + back from a step BEFORE this one — meaning this step never ran and the + deployment is still `provisioning` (cancelled by + MarkDeploymentProvisioning's compensate). + """ + + @behaviour ArcadiaCloud.Provisioning.Step + + alias ArcadiaCloud.Deployments + + @impl true + def name, do: "activate_deployment" + + @impl true + def execute(state) do + with {:ok, deployment} <- fetch_deployment(state) do + case Deployments.transition_state(deployment, "active", + reason: "provisioning_complete", + actor: "saga:#{state.saga_id}" + ) do + {:ok, _} -> {:ok, state} + {:error, reason} -> {:error, {:activate_failed, reason}} + end + end + end + + defp fetch_deployment(state) do + case state.saga && state.saga.deployment_id do + nil -> + {:error, :saga_has_no_deployment} + + id -> + case Deployments.get_deployment(id) do + nil -> {:error, :deployment_not_found} + deployment -> {:ok, deployment} + end + end + end +end diff --git a/lib/arcadia_cloud/provisioning/steps/link_deployment_resource.ex b/lib/arcadia_cloud/provisioning/steps/link_deployment_resource.ex new file mode 100644 index 0000000..a418dc7 --- /dev/null +++ b/lib/arcadia_cloud/provisioning/steps/link_deployment_resource.ex @@ -0,0 +1,80 @@ +defmodule ArcadiaCloud.Provisioning.Steps.LinkDeploymentResource do + @moduledoc """ + Attaches the freshly-registered cloud resource (set by RegisterDroplet + as `cloud_resource_id` in context) to the saga's deployment: stamps + `deployment_id` and `tenant_id` onto the `cloud_resources` row. + + This is what makes the resource show up under the deployment in + inventory and bill against the right tenant. + + Idempotent: re-running just re-writes the same two columns. + + Compensation: clears `deployment_id` and `tenant_id` back to nil. The + resource row itself (and the droplet) are undone by RegisterDroplet / + CreateDroplet compensation. + """ + + @behaviour ArcadiaCloud.Provisioning.Step + + import Ecto.Query + + alias ArcadiaCloud.Deployments + alias ArcadiaCloud.Cloud.CloudResource + alias ArcadiaCloud.Provisioning.SagaState + alias ArcadiaCloud.Repo + + @impl true + def name, do: "link_deployment_resource" + + @impl true + def execute(state) do + with {:ok, resource_id} <- fetch(state, :cloud_resource_id), + {:ok, deployment} <- fetch_deployment(state) do + {_, _} = + from(r in CloudResource, where: r.id == ^resource_id) + |> Repo.update_all( + set: [ + deployment_id: deployment.id, + tenant_id: deployment.tenant_id, + updated_at: DateTime.utc_now() |> DateTime.truncate(:second) + ] + ) + + {:ok, state} + end + end + + @impl true + def compensate(state) do + case SagaState.get_output(state, :cloud_resource_id) do + nil -> + :ok + + resource_id -> + from(r in CloudResource, where: r.id == ^resource_id) + |> Repo.update_all(set: [deployment_id: nil, tenant_id: nil]) + + :ok + end + end + + defp fetch(state, key) do + case SagaState.get_output(state, key) do + nil -> {:error, {:missing_context, key}} + value -> {:ok, value} + end + end + + defp fetch_deployment(state) do + case state.saga && state.saga.deployment_id do + nil -> + {:error, :saga_has_no_deployment} + + id -> + case Deployments.get_deployment(id) do + nil -> {:error, :deployment_not_found} + deployment -> {:ok, deployment} + end + end + end +end diff --git a/lib/arcadia_cloud/provisioning/steps/mark_deployment_provisioning.ex b/lib/arcadia_cloud/provisioning/steps/mark_deployment_provisioning.ex new file mode 100644 index 0000000..ef74793 --- /dev/null +++ b/lib/arcadia_cloud/provisioning/steps/mark_deployment_provisioning.ex @@ -0,0 +1,57 @@ +defmodule ArcadiaCloud.Provisioning.Steps.MarkDeploymentProvisioning do + @moduledoc """ + First step of the deployment-provisioning choreography. + + Forward: a no-op — `Provisioning.provision_deployment/2` already + created the deployment row in the `provisioning` state. Having this + step at index 0 gives the saga a compensation hook that owns the + deployment's failure state: if ANY later step fails, the runner walks + compensation back to here and we move the deployment to `cancelled`. + + Without this step a mid-saga failure would leave the deployment stuck + in `provisioning` forever. + """ + + @behaviour ArcadiaCloud.Provisioning.Step + + require Logger + + alias ArcadiaCloud.Deployments + + @impl true + def name, do: "mark_deployment_provisioning" + + @impl true + def execute(state), do: {:ok, state} + + @impl true + def compensate(state) do + case deployment(state) do + nil -> + :ok + + deployment -> + case Deployments.transition_state(deployment, "cancelled", + reason: "provision_failed", + actor: "saga:#{state.saga_id}" + ) do + {:ok, _} -> + :ok + + {:error, reason} -> + Logger.warning( + "[saga #{state.saga_id}] could not cancel deployment on rollback: #{inspect(reason)}" + ) + + :ok + end + end + end + + defp deployment(state) do + case state.saga && state.saga.deployment_id do + nil -> nil + id -> Deployments.get_deployment(id) + end + end +end diff --git a/lib/arcadia_cloud/provisioning/steps/point_deployment_dns.ex b/lib/arcadia_cloud/provisioning/steps/point_deployment_dns.ex new file mode 100644 index 0000000..1988fd6 --- /dev/null +++ b/lib/arcadia_cloud/provisioning/steps/point_deployment_dns.ex @@ -0,0 +1,161 @@ +defmodule ArcadiaCloud.Provisioning.Steps.PointDeploymentDns do + @moduledoc """ + Points a deployment's hostname at its droplet by upserting an A record. + + Reads the droplet's public IPv4 from context (`droplet_public_ip`, set + by WaitDropletActive) — the IP isn't known at saga-start, so this can't + be a generic UpsertDnsRecord with the data baked into inputs. + + Saga inputs: + dns_domain — the zone (e.g. "sky-ai.com"); when absent the + step is a no-op (deployment opted out of DNS) + dns_record_name — subdomain; defaults handled by the assembler + dns_record_ttl — optional, defaults 1800 + + Behaviour mirrors UpsertDnsRecord: create / no-op / update, with the + prior record stashed for compensation. + """ + + @behaviour ArcadiaCloud.Provisioning.Step + + alias ArcadiaCloud.DigitalOcean.Client + alias ArcadiaCloud.Provisioning.SagaState + + @default_ttl 1800 + @type_a "A" + + @impl true + def name, do: "point_deployment_dns" + + @impl true + def execute(state) do + domain = SagaState.get_input(state, :dns_domain) + rname = SagaState.get_input(state, :dns_record_name) + ip = SagaState.get_output(state, :droplet_public_ip) + ttl = SagaState.get_input(state, :dns_record_ttl) || @default_ttl + + cond do + is_nil(domain) or domain == "" -> + {:ok, SagaState.put_output(state, :dns_outcome, "skipped")} + + is_nil(rname) or rname == "" -> + {:error, :missing_dns_record_name} + + is_nil(ip) -> + {:error, :no_droplet_public_ip_in_context} + + true -> + upsert(state, domain, rname, ip, ttl) + end + end + + @impl true + def compensate(state) do + domain = SagaState.get_input(state, :dns_domain) + outcome = SagaState.get_output(state, :dns_outcome) + record_id = SagaState.get_output(state, :dns_record_id) + + case outcome do + "created" -> delete_record(domain, record_id) + "updated" -> restore_record(state, domain, record_id) + _ -> :ok + end + end + + # ---- execute helpers ------------------------------------------------------ + + defp upsert(state, domain, rname, ip, ttl) do + case find_record(domain, rname) do + {:ok, nil} -> + create(state, domain, rname, ip, ttl) + + {:ok, existing} -> + if normalize(existing["data"]) == normalize(ip) and existing["ttl"] == ttl do + {:ok, record_outcome(state, existing["id"], "noop", nil)} + else + update(state, domain, existing, rname, ip, ttl) + end + + {:error, reason} -> + {:error, reason} + end + end + + defp create(state, domain, rname, ip, ttl) do + case Client.create_domain_record(domain, %{type: @type_a, name: rname, data: ip, ttl: ttl}) do + {:ok, %{"id" => id}} -> {:ok, record_outcome(state, id, "created", nil)} + {:error, reason} -> {:error, reason} + end + end + + defp update(state, domain, existing, rname, ip, ttl) do + prior = %{ + "type" => existing["type"], + "name" => existing["name"], + "data" => existing["data"], + "ttl" => existing["ttl"] + } + + case Client.update_domain_record(domain, existing["id"], %{ + type: @type_a, + name: rname, + data: ip, + ttl: ttl + }) do + {:ok, %{"id" => id}} -> {:ok, record_outcome(state, id, "updated", prior)} + {:error, reason} -> {:error, reason} + end + end + + defp record_outcome(state, record_id, outcome, prior) do + state + |> SagaState.put_output(:dns_record_id, record_id) + |> SagaState.put_output(:dns_outcome, outcome) + |> SagaState.put_output(:dns_prior, prior) + end + + # ---- compensate helpers --------------------------------------------------- + + defp delete_record(domain, record_id) do + case Client.delete_domain_record(domain, record_id) do + {:ok, _} -> :ok + {:error, {:http, 404, _}} -> :ok + {:error, reason} -> {:error, reason} + end + end + + defp restore_record(state, domain, record_id) do + case SagaState.get_output(state, :dns_prior) do + nil -> + :ok + + prior -> + case Client.update_domain_record(domain, record_id, %{ + type: prior["type"], + name: prior["name"], + data: prior["data"], + ttl: prior["ttl"] + }) do + {:ok, _} -> :ok + {:error, {:http, 404, _}} -> :ok + {:error, reason} -> {:error, reason} + end + end + end + + # ---- shared --------------------------------------------------------------- + + defp find_record(domain, rname) do + case Client.list_domain_records(domain) do + {:ok, records} -> + {:ok, Enum.find(records, &(&1["type"] == @type_a and &1["name"] == rname))} + + {:error, reason} -> + {:error, reason} + end + end + + defp normalize(nil), do: "" + defp normalize(v) when is_binary(v), do: v |> String.trim() |> String.trim_trailing(".") + defp normalize(v), do: to_string(v) +end diff --git a/lib/arcadia_cloud_web/controllers/deployment_controller.ex b/lib/arcadia_cloud_web/controllers/deployment_controller.ex index 6d3dda7..3b4a7b1 100644 --- a/lib/arcadia_cloud_web/controllers/deployment_controller.ex +++ b/lib/arcadia_cloud_web/controllers/deployment_controller.ex @@ -8,7 +8,7 @@ defmodule ArcadiaCloudWeb.DeploymentController do use ArcadiaCloudWeb, :controller - alias ArcadiaCloud.{Catalog, Deployments, Subscriptions} + alias ArcadiaCloud.{Catalog, Deployments, Provisioning, Subscriptions} def index(conn, params) do identity = conn.assigns.current_identity @@ -30,7 +30,8 @@ defmodule ArcadiaCloudWeb.DeploymentController do json(conn, %{ deployment: shape(deployment), subscription: shape_subscription(sub), - events: Enum.map(events, &shape_event/1) + events: Enum.map(events, &shape_event/1), + provisioning: shape_provisioning(deployment.id) }) else {:halt, conn} -> conn @@ -52,15 +53,60 @@ defmodule ArcadiaCloudWeb.DeploymentController do |> Map.put("tenant_id", tenant_id) |> Map.put("actor", identity.email) - case Deployments.create_deployment(attrs) do - {:ok, deployment} -> - conn |> put_status(:created) |> json(%{deployment: shape(deployment)}) + if truthy(params["provision"]) do + create_and_provision(conn, params, attrs, identity) + else + case Deployments.create_deployment(attrs) do + {:ok, deployment} -> + conn |> put_status(:created) |> json(%{deployment: shape(deployment)}) - {:error, changeset} -> - conn |> put_status(:unprocessable_entity) |> json(%{error: errors(changeset)}) + {:error, changeset} -> + conn |> put_status(:unprocessable_entity) |> json(%{error: errors(changeset)}) + end end end + # Creates the deployment row in `provisioning` and kicks off the + # choreography saga. The deployment only reaches `active` once the + # saga's ActivateDeployment step runs; a saga failure rolls it to + # `cancelled`. Returns 202 — provisioning is asynchronous. + defp create_and_provision(conn, params, attrs, identity) do + size = params["size"] + image = params["image"] + + cond do + is_nil(size) or is_nil(image) -> + conn + |> put_status(:unprocessable_entity) + |> json(%{error: "size and image are required to provision"}) + + true -> + case Deployments.create_deployment(Map.put(attrs, "state", "provisioning")) do + {:ok, deployment} -> + {:ok, saga} = + Provisioning.provision_deployment(deployment, + size: size, + image: image, + region: params["region"] || deployment.region, + dns_domain: params["dns_domain"], + dns_record_name: params["dns_record_name"], + triggered_by: identity.email + ) + + conn + |> put_status(:accepted) + |> json(%{deployment: shape(deployment), saga_id: saga.id}) + + {:error, changeset} -> + conn |> put_status(:unprocessable_entity) |> json(%{error: errors(changeset)}) + end + end + end + + defp truthy(true), do: true + defp truthy("true"), do: true + defp truthy(_), do: false + def transition(conn, %{"id" => id, "to_state" => to_state} = params) do with {:ok, deployment} <- fetch_scoped(conn, id) do case Deployments.transition_state(deployment, to_state, @@ -166,6 +212,37 @@ defmodule ArcadiaCloudWeb.DeploymentController do } end + defp shape_provisioning(deployment_id) do + case Provisioning.list_sagas(deployment_id: deployment_id, limit: 1) do + [saga | _] -> + %{ + saga_id: saga.id, + kind: saga.kind, + status: saga.status, + current_step_idx: saga.current_step_idx, + started_at: saga.started_at, + completed_at: saga.completed_at, + error: saga.error, + steps: + saga.id + |> Provisioning.list_step_results() + |> Enum.map(fn r -> + %{ + step_idx: r.step_idx, + step_name: r.step_name, + status: r.status, + error: r.error, + started_at: r.started_at, + completed_at: r.completed_at + } + end) + } + + [] -> + nil + end + end + defp shape_event(e) do %{ from_state: e.from_state,