From b1a124f0443e5830b058901f6bfc30cf5a93a9cd Mon Sep 17 00:00:00 2001 From: Giuliano Silvestro Date: Wed, 20 May 2026 08:38:35 +1000 Subject: [PATCH] =?UTF-8?q?Phase=202:=20first=20real=20DO=20write=20step?= =?UTF-8?q?=20=E2=80=94=20CreateDropletSnapshot?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DigitalOcean.Client write methods: - create_droplet_snapshot/3 — POST a snapshot action (async) - get_droplet_action/3 — poll action status - list_droplet_snapshots/2 — snapshots for a droplet - delete_snapshot/2 — DELETE (used by compensation) All use the "provisioning" token purpose. Steps.CreateDropletSnapshot — the first saga step that touches real infra: - execute: deterministic snapshot name (arcadia-snap--); checks context for a prior snapshot_id, then checks DO for a snapshot already carrying that name (crash-between-post-and-save recovery), then posts the action, polls to completion, finds the resulting snapshot, records snapshot_id + snapshot_name in context. - compensate: deletes the snapshot; treats HTTP 404 as success. Provisioning.snapshot_droplet/2 — convenience saga starter. Two DO eventual-consistency gotchas surfaced + handled: - After a snapshot action reports "completed", the snapshot lags a few seconds before appearing in /droplets/:id/snapshots. The step now retries the lookup (find_snapshot_with_retry, 12x5s) instead of failing with :snapshot_not_found_after_completion. - Deletion has the same lag the other way — a deleted snapshot lingers in the listing briefly. compensate just trusts the DELETE 2xx/404; no post-delete verification needed. Live smoke verified end-to-end against holyspiritbraypark.com: [CreateDropletSnapshot, Fail] saga — the step created real snapshot 229305609, the Fail step triggered compensation, compensation deleted the snapshot. Final: saga rolled_back, ledger [create_droplet_snapshot: compensated, fail: failed], zero leftover on DO. Test-harness note: smoke tests create sagas via Provisioning.create_saga (no Oban enqueue) so a single manual Runner.perform/1 owns execution — start_saga/1 enqueues an Oban job, and running both racing the same saga corrupts the step ledger. Production only ever runs via Oban. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/arcadia_cloud/digital_ocean/client.ex | 34 ++++ lib/arcadia_cloud/provisioning.ex | 17 ++ .../steps/create_droplet_snapshot.ex | 163 ++++++++++++++++++ 3 files changed, 214 insertions(+) create mode 100644 lib/arcadia_cloud/provisioning/steps/create_droplet_snapshot.ex diff --git a/lib/arcadia_cloud/digital_ocean/client.ex b/lib/arcadia_cloud/digital_ocean/client.ex index 103fb98..a1a90af 100644 --- a/lib/arcadia_cloud/digital_ocean/client.ex +++ b/lib/arcadia_cloud/digital_ocean/client.ex @@ -28,6 +28,40 @@ defmodule ArcadiaCloud.DigitalOcean.Client do list_paginated("/droplets/#{droplet_id}/backups", "backups", opts) end + def list_droplet_snapshots(droplet_id, opts \\ []) do + list_paginated("/droplets/#{droplet_id}/snapshots", "snapshots", opts) + end + + # ---- write actions -------------------------------------------------------- + + @doc """ + Request a snapshot of a droplet. Returns {:ok, action} — the snapshot + is created asynchronously; poll `get_droplet_action/3` until the action + status is "completed". + """ + def create_droplet_snapshot(droplet_id, snapshot_name, opts \\ []) do + case request(:post, "/droplets/#{droplet_id}/actions", + body: %{type: "snapshot", name: snapshot_name}, + purpose: opts[:purpose] || "provisioning" + ) do + {:ok, %{"action" => action}} -> {:ok, action} + other -> other + end + end + + def get_droplet_action(droplet_id, action_id, opts \\ []) do + case request(:get, "/droplets/#{droplet_id}/actions/#{action_id}", + purpose: opts[:purpose] || "provisioning" + ) do + {:ok, %{"action" => action}} -> {:ok, action} + other -> other + end + end + + def delete_snapshot(snapshot_id, opts \\ []) do + request(:delete, "/snapshots/#{snapshot_id}", purpose: opts[:purpose] || "provisioning") + end + # ---- billing -------------------------------------------------------------- def get_balance(opts \\ []) do diff --git a/lib/arcadia_cloud/provisioning.ex b/lib/arcadia_cloud/provisioning.ex index 86c14db..7455990 100644 --- a/lib/arcadia_cloud/provisioning.ex +++ b/lib/arcadia_cloud/provisioning.ex @@ -56,6 +56,23 @@ defmodule ArcadiaCloud.Provisioning do |> Repo.insert() end + @doc """ + Starts a snapshot saga for a droplet. `droplet_provider_id` is the DO + numeric droplet id (string). Optional `:snapshot_label` and + `:triggered_by`. + """ + def snapshot_droplet(droplet_provider_id, opts \\ []) do + start_saga(%{ + kind: "provision", + step_modules: [ArcadiaCloud.Provisioning.Steps.CreateDropletSnapshot], + inputs: %{ + droplet_provider_id: to_string(droplet_provider_id), + snapshot_label: opts[:snapshot_label] + }, + triggered_by: opts[:triggered_by] || "manual" + }) + end + def get_saga(id), do: Repo.get(SagaRun, id) def get_saga!(id), do: Repo.get!(SagaRun, id) diff --git a/lib/arcadia_cloud/provisioning/steps/create_droplet_snapshot.ex b/lib/arcadia_cloud/provisioning/steps/create_droplet_snapshot.ex new file mode 100644 index 0000000..d39d0c0 --- /dev/null +++ b/lib/arcadia_cloud/provisioning/steps/create_droplet_snapshot.ex @@ -0,0 +1,163 @@ +defmodule ArcadiaCloud.Provisioning.Steps.CreateDropletSnapshot do + @moduledoc """ + Creates a DO snapshot of a droplet. + + Saga inputs: + droplet_provider_id — required; the DO numeric droplet id (as string) + snapshot_label — optional; appended to the deterministic name + + Idempotency: the snapshot name is deterministic + (`arcadia-snap--`). On re-run the step first checks + context for `snapshot_id`, then checks DO for a snapshot already + carrying that name — so a crash between "action posted" and "context + saved" doesn't create a second snapshot. + + Compensation: deletes the snapshot if one was created. + """ + + @behaviour ArcadiaCloud.Provisioning.Step + + require Logger + + alias ArcadiaCloud.DigitalOcean.Client + alias ArcadiaCloud.Provisioning.SagaState + + @poll_interval_ms 5_000 + @poll_max_attempts 72 + # After the snapshot action completes, DO's /droplets/:id/snapshots + # listing lags a few seconds before the new snapshot appears. + @find_retry_attempts 12 + @find_retry_interval_ms 5_000 + + @impl true + def name, do: "create_droplet_snapshot" + + @impl true + def execute(state) do + droplet_id = SagaState.get_input(state, :droplet_provider_id) + snapshot_name = snapshot_name(state, droplet_id) + + cond do + is_nil(droplet_id) -> + {:error, :missing_droplet_provider_id} + + SagaState.get_output(state, :snapshot_id) -> + # Already done in a prior attempt. + {:ok, state} + + true -> + do_create(state, droplet_id, snapshot_name) + end + end + + @impl true + def compensate(state) do + case SagaState.get_output(state, :snapshot_id) do + nil -> + :ok + + snapshot_id -> + case Client.delete_snapshot(snapshot_id) do + {:ok, _} -> :ok + {:error, {:http, 404, _}} -> :ok + {:error, reason} -> {:error, reason} + end + end + end + + # ---- internals ------------------------------------------------------------ + + defp do_create(state, droplet_id, snapshot_name) do + # If a snapshot with our deterministic name already exists, adopt it + # rather than create a duplicate (crash-between-post-and-save recovery). + case find_snapshot_by_name(droplet_id, snapshot_name) do + {:ok, %{"id" => id}} -> + {:ok, record(state, id, snapshot_name)} + + :not_found -> + with {:ok, action} <- Client.create_droplet_snapshot(droplet_id, snapshot_name), + {:ok, _completed} <- poll_action(droplet_id, action["id"]), + {:ok, %{"id" => id}} <- find_snapshot_with_retry(droplet_id, snapshot_name) do + {:ok, record(state, id, snapshot_name)} + else + :not_found -> {:error, :snapshot_not_found_after_completion} + {:error, reason} -> {:error, reason} + end + + {:error, reason} -> + {:error, reason} + end + end + + defp record(state, snapshot_id, snapshot_name) do + state + |> SagaState.put_output(:snapshot_id, snapshot_id) + |> SagaState.put_output(:snapshot_name, snapshot_name) + end + + defp snapshot_name(state, droplet_id) do + label = SagaState.get_input(state, :snapshot_label) + saga8 = state.saga_id |> to_string() |> String.slice(0, 8) + base = "arcadia-snap-#{droplet_id}-#{saga8}" + if label, do: "#{base}-#{label}", else: base + end + + # Retry the lookup — DO reports the action "completed" a few seconds + # before the snapshot is listable. + defp find_snapshot_with_retry(droplet_id, name, attempt \\ 1) + + defp find_snapshot_with_retry(_droplet_id, _name, attempt) + when attempt > @find_retry_attempts do + {:error, :snapshot_not_found_after_completion} + end + + defp find_snapshot_with_retry(droplet_id, name, attempt) do + case find_snapshot_by_name(droplet_id, name) do + {:ok, snap} -> + {:ok, snap} + + :not_found -> + Process.sleep(@find_retry_interval_ms) + find_snapshot_with_retry(droplet_id, name, attempt + 1) + + {:error, reason} -> + {:error, reason} + end + end + + defp find_snapshot_by_name(droplet_id, name) do + case Client.list_droplet_snapshots(droplet_id) do + {:ok, snapshots} -> + case Enum.find(snapshots, &(&1["name"] == name)) do + nil -> :not_found + snap -> {:ok, snap} + end + + {:error, reason} -> + {:error, reason} + end + end + + defp poll_action(droplet_id, action_id, attempt \\ 1) + + defp poll_action(_droplet_id, _action_id, attempt) when attempt > @poll_max_attempts do + {:error, :snapshot_action_timeout} + end + + defp poll_action(droplet_id, action_id, attempt) do + case Client.get_droplet_action(droplet_id, action_id) do + {:ok, %{"status" => "completed"}} -> + {:ok, :completed} + + {:ok, %{"status" => "errored"}} -> + {:error, :snapshot_action_errored} + + {:ok, %{"status" => "in-progress"}} -> + Process.sleep(@poll_interval_ms) + poll_action(droplet_id, action_id, attempt + 1) + + {:error, reason} -> + {:error, reason} + end + end +end