defmodule ArcadiaCloud.Provisioning.Steps.CreateDropletSnapshot do
  @moduledoc """
  Creates a DO snapshot of a droplet.

  Saga inputs:
    droplet_provider_id — required; the DO numeric droplet id (as string)
    snapshot_label      — optional; appended to the deterministic name

  Saga outputs (written on success):
    snapshot_id   — the `"id"` of the snapshot found under the deterministic name
    snapshot_name — the deterministic name itself

  Idempotency: the snapshot name is deterministic
  (`arcadia-snap-<droplet_id>-<saga8>`, where `<saga8>` is the first 8
  characters of the saga id, with `-<snapshot_label>` appended when a label
  is given). On re-run the step first checks the saga outputs for
  `snapshot_id`, then checks DO for a snapshot already carrying that name —
  so a crash between "action posted" and "output saved" doesn't create a
  second snapshot.

  Compensation: deletes the snapshot recorded in `snapshot_id`, if any. A 404
  from DO is treated as success (the snapshot is already gone).
  """

  @behaviour ArcadiaCloud.Provisioning.Step

  require Logger

  alias ArcadiaCloud.DigitalOcean.Client
  alias ArcadiaCloud.Provisioning.SagaState

  # Snapshot action polling: up to 72 polls, 5 s apart.
  @poll_interval_ms 5_000
  @poll_max_attempts 72

  # After the snapshot action completes, DO's /droplets/:id/snapshots
  # listing lags a few seconds before the new snapshot appears.
  @find_retry_attempts 12
  @find_retry_interval_ms 5_000

  @impl true
  def name, do: "create_droplet_snapshot"

  @doc """
  Runs the step.

  Returns `{:ok, state}` with `snapshot_id` / `snapshot_name` recorded as
  outputs, or `{:error, reason}`. Reasons produced by this module are
  `:missing_droplet_provider_id`, `:snapshot_action_errored`,
  `:snapshot_action_timeout`, `{:unexpected_action_status, status}` and
  `:snapshot_not_found_after_completion`; errors returned by the DO client
  are passed through unchanged.
  """
  @impl true
  def execute(state) do
    droplet_id = SagaState.get_input(state, :droplet_provider_id)

    cond do
      is_nil(droplet_id) ->
        {:error, :missing_droplet_provider_id}

      SagaState.get_output(state, :snapshot_id) ->
        # Already done in a prior attempt.
        {:ok, state}

      true ->
        # The name is only built once the droplet id is known to be present.
        do_create(state, droplet_id, snapshot_name(state, droplet_id))
    end
  end

  @doc """
  Undoes the step by deleting the recorded snapshot.

  Returns `:ok` when no `snapshot_id` output exists, when the delete
  succeeds, or when DO answers 404; otherwise `{:error, reason}`.
  """
  @impl true
  def compensate(state) do
    case SagaState.get_output(state, :snapshot_id) do
      nil ->
        :ok

      snapshot_id ->
        case Client.delete_snapshot(snapshot_id) do
          {:ok, _} -> :ok
          {:error, {:http, 404, _}} -> :ok
          {:error, reason} -> {:error, reason}
        end
    end
  end

  # ---- internals ------------------------------------------------------------

  # Adopts an existing snapshot carrying our deterministic name, or creates
  # one, waits for the action to finish and then looks the snapshot up.
  defp do_create(state, droplet_id, snapshot_name) do
    # If a snapshot with our deterministic name already exists, adopt it
    # rather than create a duplicate (crash-between-post-and-save recovery).
    case find_snapshot_by_name(droplet_id, snapshot_name) do
      {:ok, %{"id" => id}} ->
        {:ok, record(state, id, snapshot_name)}

      :not_found ->
        with {:ok, action} <- Client.create_droplet_snapshot(droplet_id, snapshot_name),
             {:ok, _completed} <- poll_action(droplet_id, action["id"]),
             {:ok, %{"id" => id}} <- find_snapshot_with_retry(droplet_id, snapshot_name) do
          {:ok, record(state, id, snapshot_name)}
        else
          # Every step reports failure as a tagged error tuple; the retry
          # helper never yields a bare :not_found, so no clause is needed for
          # it. Any other shape still raises, as before.
          {:error, _reason} = error -> error
        end

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Stores the snapshot id and name as saga outputs.
  defp record(state, snapshot_id, snapshot_name) do
    state
    |> SagaState.put_output(:snapshot_id, snapshot_id)
    |> SagaState.put_output(:snapshot_name, snapshot_name)
  end

  # Builds "arcadia-snap-<droplet_id>-<first 8 chars of saga id>", with
  # "-<label>" appended when a snapshot_label input is present.
  defp snapshot_name(state, droplet_id) do
    label = SagaState.get_input(state, :snapshot_label)
    saga8 = state.saga_id |> to_string() |> String.slice(0, 8)
    base = "arcadia-snap-#{droplet_id}-#{saga8}"
    if label, do: "#{base}-#{label}", else: base
  end

  # Retry the lookup — DO reports the action "completed" a few seconds
  # before the snapshot is listable. Performs at most @find_retry_attempts
  # lookups and gives up right after the last one instead of sleeping first.
  defp find_snapshot_with_retry(droplet_id, name, attempt \\ 1) do
    case find_snapshot_by_name(droplet_id, name) do
      {:ok, snap} ->
        {:ok, snap}

      :not_found when attempt >= @find_retry_attempts ->
        {:error, :snapshot_not_found_after_completion}

      :not_found ->
        Process.sleep(@find_retry_interval_ms)
        find_snapshot_with_retry(droplet_id, name, attempt + 1)

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Lists the droplet's snapshots and returns the first whose "name" equals
  # `name`: {:ok, snapshot} | :not_found | {:error, reason}.
  defp find_snapshot_by_name(droplet_id, name) do
    case Client.list_droplet_snapshots(droplet_id) do
      {:ok, snapshots} ->
        case Enum.find(snapshots, &(&1["name"] == name)) do
          nil -> :not_found
          snap -> {:ok, snap}
        end

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Polls the droplet action until it completes or errors. Performs at most
  # @poll_max_attempts polls and reports the timeout right after the last one
  # instead of sleeping first.
  defp poll_action(droplet_id, action_id, attempt \\ 1) do
    case Client.get_droplet_action(droplet_id, action_id) do
      {:ok, %{"status" => "completed"}} ->
        {:ok, :completed}

      {:ok, %{"status" => "errored"}} ->
        {:error, :snapshot_action_errored}

      {:ok, %{"status" => "in-progress"}} when attempt >= @poll_max_attempts ->
        {:error, :snapshot_action_timeout}

      {:ok, %{"status" => "in-progress"}} ->
        Process.sleep(@poll_interval_ms)
        poll_action(droplet_id, action_id, attempt + 1)

      {:ok, %{"status" => status}} ->
        # A status outside the three handled above is returned as an error
        # rather than raising CaseClauseError in the middle of a saga step.
        {:error, {:unexpected_action_status, status}}

      {:error, reason} ->
        {:error, reason}
    end
  end
end