mirror of https://github.com/coder/coder.git
parent d0d64bbdca
commit bc97eaa41b
@@ -857,7 +857,7 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd {
 			Flag:        "use-host-login",
 			Env:         "CODER_SCALETEST_USE_HOST_LOGIN",
 			Default:     "false",
-			Description: "Use the use logged in on the host machine, instead of creating users.",
+			Description: "Use the user logged in on the host machine, instead of creating users.",
 			Value:       clibase.BoolOf(&useHostUser),
 		},
 	}
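For context, this flag belongs to the `coder exp scaletest create-workspaces` command exercised elsewhere in this commit; a hypothetical invocation (not part of this diff) could enable it via flag or environment variable:

    coder exp scaletest create-workspaces --count 10 --use-host-login
    CODER_SCALETEST_USE_HOST_LOGIN=true coder exp scaletest create-workspaces --count 10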
@@ -0,0 +1,51 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: scaletest-sa
  namespace: coder-big
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: scaletest-role
  namespace: coder-big
rules:
  - apiGroups:
      - ""
    resources: ["*"]
    verbs:
      - get
      - list
      - watch
      - create
      - update
      - patch
      - delete
      - deletecollection
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: scaletest-rolebinding
  namespace: coder-big
subjects:
  - kind: ServiceAccount
    name: scaletest-sa
roleRef:
  kind: Role
  name: scaletest-role
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: coder-provisioner-podmonitor-rolebinding
  namespace: coder-big
subjects:
  - kind: ServiceAccount
    name: coder-provisioner
roleRef:
  kind: Role
  name: coder-podmonitor
---
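A minimal sketch of applying the manifest above, assuming it is saved as rbac.yaml (the file name is not visible in this view) and the coder-big namespace already exists:

    kubectl apply -f rbac.yaml
    kubectl -n coder-big get serviceaccount scaletest-sa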
@@ -0,0 +1,36 @@
# This image is used to run scaletest jobs and, although it is inside
# the template directory, it is built separately and pushed to
# gcr.io/coder-dev-1/scaletest-runner:latest.
#
# Future improvements will include versioning and including the version
# in the template push.

FROM codercom/enterprise-base:ubuntu

ARG DEBIAN_FRONTEND=noninteractive

USER root

# TODO(mafredri): Remove unneeded dependencies once we have a clear idea of what's needed.
RUN wget --quiet -O /tmp/terraform.zip https://releases.hashicorp.com/terraform/1.5.7/terraform_1.5.7_linux_amd64.zip \
	&& unzip /tmp/terraform.zip -d /usr/local/bin \
	&& rm /tmp/terraform.zip \
	&& terraform --version

RUN wget --quiet -O /tmp/envsubst "https://github.com/a8m/envsubst/releases/download/v1.2.0/envsubst-$(uname -s)-$(uname -m)" \
	&& chmod +x /tmp/envsubst \
	&& mv /tmp/envsubst /usr/local/bin

RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
	&& curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \
	&& apt-get update \
	&& apt-get install --yes \
	google-cloud-cli \
	jq \
	kubectl \
	zstd \
	&& gcloud --version \
	&& kubectl version --client \
	&& rm -rf /var/lib/apt/lists/*

USER coder
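A plausible build-and-push sequence matching the comment at the top of this Dockerfile (the actual invocation is not part of this commit):

    docker build -t gcr.io/coder-dev-1/scaletest-runner:latest .
    docker push gcr.io/coder-dev-1/scaletest-runner:latest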
@@ -0,0 +1,9 @@
---
name: Scaletest Runner
description: Run a scaletest.
tags: [local]
---

# Scaletest Runner

Run a scaletest.
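Assuming the usual Coder template workflow (not shown in this diff), the template would be published from this directory with something like:

    coder templates push scaletest-runner --directory .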
@@ -0,0 +1,531 @@
terraform {
  required_providers {
    coder = {
      source  = "coder/coder"
      version = "~> 0.11"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.22"
    }
  }
}

resource "time_static" "start_time" {
  # We can't set `count = data.coder_workspace.me.start_count` here because then
  # we can't use this value in `locals`. The permission check is recreated on
  # start, which will update the timestamp.
  triggers = {
    count : length(null_resource.permission_check)
  }
}

resource "null_resource" "permission_check" {
  count = data.coder_workspace.me.start_count

  # Limit which users can create a workspace in this template.
  # The "default" user and workspace are present because they are needed
  # for the plan and, consequently, for updating the template.
  lifecycle {
    precondition {
      condition     = can(regex("^(default/default|scaletest/runner)$", "${data.coder_workspace.me.owner}/${data.coder_workspace.me.name}"))
      error_message = "User and workspace name is not allowed, expected 'scaletest/runner'."
    }
  }
}

locals {
  workspace_pod_name     = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
  workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
  service_account_name   = "scaletest-sa"
  cpu                    = 2
  memory                 = 2
  home_disk_size         = 10
  scaletest_run_id       = "scaletest-${time_static.start_time.rfc3339}"
  scaletest_run_dir      = "/home/coder/${local.scaletest_run_id}"
}

data "coder_provisioner" "me" {
|
||||
}
|
||||
|
||||
data "coder_workspace" "me" {
|
||||
}
|
||||
|
||||
data "coder_parameter" "verbose" {
|
||||
order = 1
|
||||
type = "bool"
|
||||
name = "Verbose"
|
||||
default = false
|
||||
description = "Show debug output."
|
||||
mutable = true
|
||||
ephemeral = true
|
||||
}
|
||||
|
||||
data "coder_parameter" "dry_run" {
|
||||
order = 2
|
||||
type = "bool"
|
||||
name = "Dry-run"
|
||||
default = true
|
||||
description = "Perform a dry-run to see what would happen."
|
||||
mutable = true
|
||||
ephemeral = true
|
||||
}
|
||||
|
||||
data "coder_parameter" "create_concurrency" {
|
||||
order = 10
|
||||
type = "number"
|
||||
name = "Create concurrency"
|
||||
default = 10
|
||||
description = "The number of workspaces to create concurrently."
|
||||
mutable = true
|
||||
|
||||
# Setting zero = unlimited, but perhaps not a good idea,
|
||||
# we can raise this limit instead.
|
||||
validation {
|
||||
min = 1
|
||||
max = 100
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "job_concurrency" {
|
||||
order = 11
|
||||
type = "number"
|
||||
name = "Job concurrency"
|
||||
default = 10
|
||||
description = "The number of concurrent jobs (e.g. when producing workspace traffic)."
|
||||
mutable = true
|
||||
|
||||
# Setting zero = unlimited, but perhaps not a good idea,
|
||||
# we can raise this limit instead.
|
||||
validation {
|
||||
min = 1
|
||||
max = 100
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "cleanup_concurrency" {
|
||||
order = 12
|
||||
type = "number"
|
||||
name = "Cleanup concurrency"
|
||||
default = 10
|
||||
description = "The number of concurrent cleanup jobs."
|
||||
mutable = true
|
||||
|
||||
# Setting zero = unlimited, but perhaps not a good idea,
|
||||
# we can raise this limit instead.
|
||||
validation {
|
||||
min = 1
|
||||
max = 100
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "cleanup_strategy" {
|
||||
order = 13
|
||||
name = "Cleanup strategy"
|
||||
default = "always"
|
||||
description = "The strategy used to cleanup workspaces after the scaletest is complete."
|
||||
mutable = true
|
||||
ephemeral = true
|
||||
option {
|
||||
name = "Always"
|
||||
value = "always"
|
||||
description = "Automatically cleanup workspaces after the scaletest ends."
|
||||
}
|
||||
option {
|
||||
name = "On stop"
|
||||
value = "on_stop"
|
||||
description = "Cleanup workspaces when the workspace is stopped."
|
||||
}
|
||||
option {
|
||||
name = "On success"
|
||||
value = "on_success"
|
||||
description = "Automatically cleanup workspaces after the scaletest is complete if no error occurs."
|
||||
}
|
||||
option {
|
||||
name = "On error"
|
||||
value = "on_error"
|
||||
description = "Automatically cleanup workspaces after the scaletest is complete if an error occurs."
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
data "coder_parameter" "workspace_template" {
|
||||
order = 20
|
||||
name = "workspace_template"
|
||||
display_name = "Workspace Template"
|
||||
description = "The template used for workspace creation."
|
||||
default = "kubernetes-minimal"
|
||||
icon = "/emojis/1f4dc.png" # Scroll.
|
||||
mutable = true
|
||||
option {
|
||||
name = "Minimal"
|
||||
value = "kubernetes-minimal" # Feather.
|
||||
icon = "/emojis/1fab6.png"
|
||||
description = "Sized to fit approx. 32 per t2d-standard-8 instance."
|
||||
}
|
||||
option {
|
||||
name = "Small"
|
||||
value = "kubernetes-small"
|
||||
icon = "/emojis/1f42d.png" # Mouse.
|
||||
description = "Provisions a small-sized workspace with no persistent storage."
|
||||
}
|
||||
option {
|
||||
name = "Medium"
|
||||
value = "kubernetes-medium"
|
||||
icon = "/emojis/1f436.png" # Dog.
|
||||
description = "Provisions a medium-sized workspace with no persistent storage."
|
||||
}
|
||||
option {
|
||||
name = "Large"
|
||||
value = "kubernetes-large"
|
||||
icon = "/emojis/1f434.png" # Horse.
|
||||
description = "Provisions a large-sized workspace with no persistent storage."
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "num_workspaces" {
|
||||
order = 21
|
||||
type = "number"
|
||||
name = "Number of workspaces to create"
|
||||
default = 100
|
||||
description = "The scaletest suite will create this number of workspaces."
|
||||
mutable = true
|
||||
|
||||
validation {
|
||||
min = 0
|
||||
max = 1000
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "namespace" {
|
||||
order = 999
|
||||
type = "string"
|
||||
name = "Namespace"
|
||||
default = "coder-big"
|
||||
description = "The Kubernetes namespace to create the scaletest runner resources in."
|
||||
}
|
||||
|
||||
data "archive_file" "scripts_zip" {
|
||||
type = "zip"
|
||||
output_path = "${path.module}/scripts.zip"
|
||||
source_dir = "${path.module}/scripts"
|
||||
}
|
||||
|
||||
resource "coder_agent" "main" {
|
||||
arch = data.coder_provisioner.me.arch
|
||||
dir = local.scaletest_run_dir
|
||||
os = "linux"
|
||||
env = {
|
||||
VERBOSE : data.coder_parameter.verbose.value ? "1" : "0",
|
||||
DRY_RUN : data.coder_parameter.dry_run.value ? "1" : "0",
|
||||
CODER_CONFIG_DIR : "/home/coder/.config/coderv2",
|
||||
CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token,
|
||||
CODER_URL : data.coder_workspace.me.access_url,
|
||||
|
||||
# Global scaletest envs that may affect each `coder exp scaletest` invocation.
|
||||
CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112",
|
||||
CODER_SCALETEST_PROMETHEUS_WAIT : "60s",
|
||||
CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
|
||||
CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
|
||||
|
||||
# Local envs passed as arguments to `coder exp scaletest` invocations.
|
||||
SCALETEST_RUN_ID : local.scaletest_run_id,
|
||||
SCALETEST_RUN_DIR : local.scaletest_run_dir,
|
||||
SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value,
|
||||
SCALETEST_SKIP_CLEANUP : "1",
|
||||
SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
|
||||
SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
|
||||
SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
|
||||
|
||||
SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path),
|
||||
SCRIPTS_DIR : "/tmp/scripts",
|
||||
}
|
||||
display_apps {
|
||||
vscode = false
|
||||
ssh_helper = false
|
||||
}
|
||||
startup_script_timeout = 3600
|
||||
shutdown_script_timeout = 1800
|
||||
startup_script_behavior = "blocking"
|
||||
startup_script = file("startup.sh")
|
||||
shutdown_script = file("shutdown.sh")
|
||||
|
||||
# Scaletest metadata.
|
||||
metadata {
|
||||
display_name = "Scaletest status"
|
||||
key = "00_scaletest_status"
|
||||
script = file("metadata_status.sh")
|
||||
interval = 1
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "Scaletest phase"
|
||||
key = "01_scaletest_phase"
|
||||
script = file("metadata_phase.sh")
|
||||
interval = 1
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "Scaletest phase (previous)"
|
||||
key = "02_scaletest_previous_phase"
|
||||
script = file("metadata_previous_phase.sh")
|
||||
interval = 1
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
# Misc workspace metadata.
|
||||
metadata {
|
||||
display_name = "CPU Usage"
|
||||
key = "80_cpu_usage"
|
||||
script = "coder stat cpu"
|
||||
interval = 10
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "RAM Usage"
|
||||
key = "81_ram_usage"
|
||||
script = "coder stat mem"
|
||||
interval = 10
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "Home Disk"
|
||||
key = "82_home_disk"
|
||||
script = "coder stat disk --path $${HOME}"
|
||||
interval = 60
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "CPU Usage (Host)"
|
||||
key = "83_cpu_usage_host"
|
||||
script = "coder stat cpu --host"
|
||||
interval = 10
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "Memory Usage (Host)"
|
||||
key = "84_mem_usage_host"
|
||||
script = "coder stat mem --host"
|
||||
interval = 10
|
||||
timeout = 1
|
||||
}
|
||||
|
||||
metadata {
|
||||
display_name = "Load Average (Host)"
|
||||
key = "85_load_host"
|
||||
# Get load avg scaled by number of cores.
|
||||
script = <<-EOS
|
||||
echo "`cat /proc/loadavg | awk '{ print $1 }'` `nproc`" | awk '{ printf "%0.2f", $1/$2 }'
|
||||
EOS
|
||||
interval = 60
|
||||
timeout = 1
|
||||
}
|
||||
}
|
||||
|
||||
resource "coder_app" "grafana" {
|
||||
agent_id = coder_agent.main.id
|
||||
slug = "00-grafana"
|
||||
display_name = "Grafana"
|
||||
url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
|
||||
icon = "https://grafana.com/static/assets/img/fav32.png"
|
||||
external = true
|
||||
}
|
||||
|
||||
resource "coder_app" "prometheus" {
|
||||
agent_id = coder_agent.main.id
|
||||
slug = "01-prometheus"
|
||||
display_name = "Prometheus"
|
||||
// https://stats.dev.c8s.io:9443/classic/graph?g0.range_input=2h&g0.end_input=2023-09-08%2015%3A58&g0.stacked=0&g0.expr=rate(pg_stat_database_xact_commit%7Bcluster%3D%22big%22%2Cdatname%3D%22big-coder%22%7D%5B1m%5D)&g0.tab=0
|
||||
url = "https://stats.dev.c8s.io:9443"
|
||||
icon = "https://prometheus.io/assets/favicons/favicon-32x32.png"
|
||||
external = true
|
||||
}
|
||||
|
||||
resource "coder_app" "manual_cleanup" {
|
||||
agent_id = coder_agent.main.id
|
||||
slug = "02-manual-cleanup"
|
||||
display_name = "Manual cleanup"
|
||||
icon = "/emojis/1f9f9.png"
|
||||
command = "/tmp/scripts/cleanup.sh manual"
|
||||
}
|
||||
|
||||
resource "kubernetes_persistent_volume_claim" "home" {
|
||||
depends_on = [null_resource.permission_check]
|
||||
metadata {
|
||||
name = "${local.workspace_pod_name}-home"
|
||||
namespace = data.coder_parameter.namespace.value
|
||||
labels = {
|
||||
"app.kubernetes.io/name" = "coder-pvc"
|
||||
"app.kubernetes.io/instance" = "coder-pvc-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
"app.kubernetes.io/part-of" = "coder"
|
||||
// Coder specific labels.
|
||||
"com.coder.resource" = "true"
|
||||
"com.coder.workspace.id" = data.coder_workspace.me.id
|
||||
"com.coder.workspace.name" = data.coder_workspace.me.name
|
||||
"com.coder.user.id" = data.coder_workspace.me.owner_id
|
||||
"com.coder.user.username" = data.coder_workspace.me.owner
|
||||
}
|
||||
annotations = {
|
||||
"com.coder.user.email" = data.coder_workspace.me.owner_email
|
||||
}
|
||||
}
|
||||
wait_until_bound = false
|
||||
spec {
|
||||
access_modes = ["ReadWriteOnce"]
|
||||
resources {
|
||||
requests = {
|
||||
storage = "${local.home_disk_size}Gi"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_pod" "main" {
|
||||
depends_on = [null_resource.permission_check]
|
||||
count = data.coder_workspace.me.start_count
|
||||
metadata {
|
||||
name = local.workspace_pod_name
|
||||
namespace = data.coder_parameter.namespace.value
|
||||
labels = {
|
||||
"app.kubernetes.io/name" = "coder-workspace"
|
||||
"app.kubernetes.io/instance" = local.workspace_pod_instance
|
||||
"app.kubernetes.io/part-of" = "coder"
|
||||
// Coder specific labels.
|
||||
"com.coder.resource" = "true"
|
||||
"com.coder.workspace.id" = data.coder_workspace.me.id
|
||||
"com.coder.workspace.name" = data.coder_workspace.me.name
|
||||
"com.coder.user.id" = data.coder_workspace.me.owner_id
|
||||
"com.coder.user.username" = data.coder_workspace.me.owner
|
||||
}
|
||||
annotations = {
|
||||
"com.coder.user.email" = data.coder_workspace.me.owner_email
|
||||
}
|
||||
}
|
||||
# Set the pod delete timeout to termination_grace_period_seconds + 1m.
|
||||
timeouts {
|
||||
delete = "32m"
|
||||
}
|
||||
spec {
|
||||
security_context {
|
||||
run_as_user = "1000"
|
||||
fs_group = "1000"
|
||||
}
|
||||
|
||||
# Allow this pod to perform scale tests.
|
||||
service_account_name = local.service_account_name
|
||||
|
||||
# Allow the coder agent to perform graceful shutdown and cleanup of
|
||||
# scaletest resources, 30 minutes (cleanup timeout) + 1 minute.
|
||||
termination_grace_period_seconds = 1860
|
||||
|
||||
container {
|
||||
name = "dev"
|
||||
image = "gcr.io/coder-dev-1/scaletest-runner:latest"
|
||||
image_pull_policy = "Always"
|
||||
command = ["sh", "-c", coder_agent.main.init_script]
|
||||
security_context {
|
||||
run_as_user = "1000"
|
||||
}
|
||||
env {
|
||||
name = "CODER_AGENT_TOKEN"
|
||||
value = coder_agent.main.token
|
||||
}
|
||||
env {
|
||||
name = "CODER_AGENT_LOG_DIR"
|
||||
value = "${local.scaletest_run_dir}/logs"
|
||||
}
|
||||
resources {
|
||||
# Set requests and limits values such that we can do performant
|
||||
# execution of `coder scaletest` commands.
|
||||
requests = {
|
||||
"cpu" = "250m"
|
||||
"memory" = "512Mi"
|
||||
}
|
||||
limits = {
|
||||
"cpu" = "${local.cpu}"
|
||||
"memory" = "${local.memory}Gi"
|
||||
}
|
||||
}
|
||||
volume_mount {
|
||||
mount_path = "/home/coder"
|
||||
name = "home"
|
||||
read_only = false
|
||||
}
|
||||
port {
|
||||
container_port = 21112
|
||||
name = "prometheus-http"
|
||||
protocol = "TCP"
|
||||
}
|
||||
}
|
||||
|
||||
volume {
|
||||
name = "home"
|
||||
persistent_volume_claim {
|
||||
claim_name = kubernetes_persistent_volume_claim.home.metadata.0.name
|
||||
read_only = false
|
||||
}
|
||||
}
|
||||
|
||||
affinity {
|
||||
pod_anti_affinity {
|
||||
// This affinity attempts to spread out all workspace pods evenly across
|
||||
// nodes.
|
||||
preferred_during_scheduling_ignored_during_execution {
|
||||
weight = 1
|
||||
pod_affinity_term {
|
||||
topology_key = "kubernetes.io/hostname"
|
||||
label_selector {
|
||||
match_expressions {
|
||||
key = "app.kubernetes.io/name"
|
||||
operator = "In"
|
||||
values = ["coder-workspace"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
node_affinity {
|
||||
required_during_scheduling_ignored_during_execution {
|
||||
node_selector_term {
|
||||
match_expressions {
|
||||
key = "cloud.google.com/gke-nodepool"
|
||||
operator = "In"
|
||||
values = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_manifest" "pod_monitor" {
|
||||
count = data.coder_workspace.me.start_count
|
||||
manifest = {
|
||||
apiVersion = "monitoring.coreos.com/v1"
|
||||
kind = "PodMonitor"
|
||||
metadata = {
|
||||
namespace = data.coder_parameter.namespace.value
|
||||
name = "podmonitor-${local.workspace_pod_name}"
|
||||
}
|
||||
spec = {
|
||||
selector = {
|
||||
matchLabels = {
|
||||
"app.kubernetes.io/instance" : local.workspace_pod_instance
|
||||
}
|
||||
}
|
||||
podMetricsEndpoints = [
|
||||
{
|
||||
port = "prometheus-http"
|
||||
interval = "15s"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
|
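The PodMonitor above requires the Prometheus Operator CRDs to already exist in the cluster (this template assumes but does not install them); a quick way to verify the monitor was created after workspace start:

    kubectl -n coder-big get podmonitors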
@@ -0,0 +1,6 @@
#!/bin/bash

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

get_phase
@@ -0,0 +1,6 @@
#!/bin/bash

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh" 2>/dev/null || return

get_previous_phase
@@ -0,0 +1,6 @@
#!/bin/bash

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh" 2>/dev/null || return

get_status
@@ -0,0 +1,34 @@
#!/bin/bash
set -euo pipefail

[[ $VERBOSE == 1 ]] && set -x

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

event=${1:-}

if [[ -z $event ]]; then
	event=manual
fi

if [[ $event = manual ]]; then
	echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) '
	read -r -n 1
	if [[ $REPLY != [yY] ]]; then
		echo $'\nAborting...'
		exit 1
	fi
fi

start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
	--cleanup-job-timeout 15m \
	--cleanup-timeout 30m |
	tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase

if [[ $event = manual ]]; then
	echo 'Press any key to continue...'
	read -s -r -n 1
fi
@@ -0,0 +1,94 @@
#!/bin/bash
set -euo pipefail

# Only source this script once, this env comes from sourcing
# scripts/lib.sh from coder/coder below.
if [[ ${SCRIPTS_LIB_IS_SOURCED:-0} == 1 ]]; then
	return 0
fi

# Source scripts/lib.sh from coder/coder for common functions.
# shellcheck source=scripts/lib.sh
. "${HOME}/coder/scripts/lib.sh"

# Make shellcheck happy.
DRY_RUN=${DRY_RUN:-0}

# Environment variables shared between scripts.
SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
# shellcheck disable=SC2034
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"

coder() {
	maybedryrun "${DRY_RUN}" command coder "${@}"
}

show_json() {
	maybedryrun "${DRY_RUN}" jq 'del(.. | .logs?)' "${1}"
}

set_status() {
	dry_run=
	if [[ ${DRY_RUN} == 1 ]]; then
		dry_run=" (dry-run)"
	fi
	echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status"
}
lock_status() {
	chmod 0440 "${SCALETEST_STATE_DIR}/status"
}
get_status() {
	# Order of importance (reverse of creation).
	if [[ -f "${SCALETEST_STATE_DIR}/status" ]]; then
		tail -n1 "${SCALETEST_STATE_DIR}/status" | cut -d' ' -f2-
	else
		echo "Not started"
	fi
}

phase_num=0
start_phase() {
	# This may be incremented from another script, so we read it every time.
	if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
		phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")"
	fi
	phase_num=$((phase_num + 1))
	log "Start phase ${phase_num}: ${*}"
	echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}"
}
end_phase() {
	phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)"
	if [[ -z ${phase} ]]; then
		log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}"
		exit 1
	fi
	log "End phase ${phase_num}: ${phase}"
	echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}"
}
get_phase() {
	if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
		phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")"
		phase="$(echo "${phase_raw}" | cut -d' ' -f3-)"
		if [[ ${phase_raw} == *"END:"* ]]; then
			phase+=" [done]"
		fi
		echo "${phase}"
	else
		echo "None"
	fi
}
get_previous_phase() {
	if [[ -f "${SCALETEST_PHASE_FILE}" ]] && [[ $(grep -c START: "${SCALETEST_PHASE_FILE}") -gt 1 ]]; then
		grep START: "${SCALETEST_PHASE_FILE}" | tail -n2 | head -n1 | cut -d' ' -f3-
	else
		echo "None"
	fi
}

wait_baseline() {
	s=${1:-2}
	start_phase "Waiting ${s}m to establish baseline"
	maybedryrun "$DRY_RUN" sleep $((s * 60))
	end_phase
}
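For illustration, the phase file written by start_phase/end_phase accumulates lines like these (timestamps come from `date -Ins`; the values here are made up):

    2023-09-11T10:00:00,000000000+00:00 START:1: Creating workspaces
    2023-09-11T10:12:34,000000000+00:00 END:1: Creating workspaces
    2023-09-11T10:12:35,000000000+00:00 START:2: Waiting 5m to establish baseline

Given that file, get_phase prints "Waiting 5m to establish baseline" (no "[done]" suffix, since the last line is a START), and get_previous_phase prints "Creating workspaces".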
@@ -0,0 +1,57 @@
#!/bin/bash
set -euo pipefail

[[ $VERBOSE == 1 ]] && set -x

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

mkdir -p "${SCALETEST_STATE_DIR}"
mkdir -p "${SCALETEST_RESULTS_DIR}"

log "Preparing scaletest workspace environment..."
set_status Preparing

log "Compressing previous run logs (if applicable)..."
mkdir -p "${HOME}/archive"
for dir in "${HOME}/scaletest-"*; do
	if [[ ${dir} = "${SCALETEST_RUN_DIR}" ]]; then
		continue
	fi
	if [[ -d ${dir} ]]; then
		name="$(basename "${dir}")"
		(
			cd "$(dirname "${dir}")"
			ZSTD_CLEVEL=12 maybedryrun "$DRY_RUN" tar --zstd -cf "${HOME}/archive/${name}.tar.zst" "${name}"
		)
		maybedryrun "$DRY_RUN" rm -rf "${dir}"
	fi
done

log "Cloning coder/coder repo..."

if [[ ! -d "${HOME}/coder" ]]; then
	git clone https://github.com/coder/coder.git "${HOME}/coder"
fi
(cd "${HOME}/coder" && git pull)

log "Creating coder CLI token (needed for cleanup during shutdown)..."

mkdir -p "${CODER_CONFIG_DIR}"
echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url"

set +x # Avoid logging the token.
# Persist configuration for shutdown script too since the
# owner token is invalidated immediately on workspace stop.
export CODER_SESSION_TOKEN=$CODER_USER_TOKEN
coder tokens delete scaletest_runner >/dev/null 2>&1 || true
# TODO(mafredri): Set TTL? This could interfere with delayed stop though.
token=$(coder tokens create --name scaletest_runner)
unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).

log "Cleaning up from previous runs (if applicable)..."
"${SCRIPTS_DIR}/cleanup.sh" "prepare"

log "Preparation complete!"
@@ -0,0 +1,59 @@
#!/bin/bash
set -euo pipefail

[[ $VERBOSE == 1 ]] && set -x

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

log "Running scaletest..."
set_status Running

start_phase "Creating workspaces"
coder exp scaletest create-workspaces \
	--count "${SCALETEST_NUM_WORKSPACES}" \
	--template "${SCALETEST_TEMPLATE}" \
	--concurrency "${SCALETEST_CREATE_CONCURRENCY}" \
	--job-timeout 15m \
	--no-cleanup \
	--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
end_phase

wait_baseline 5

start_phase "SSH traffic"
coder exp scaletest workspace-traffic \
	--ssh \
	--bytes-per-tick 10240 \
	--tick-interval 1s \
	--timeout 5m \
	--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
end_phase

wait_baseline 5

start_phase "ReconnectingPTY traffic"
coder exp scaletest workspace-traffic \
	--bytes-per-tick 10240 \
	--tick-interval 1s \
	--timeout 5m \
	--output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
end_phase

wait_baseline 5

start_phase "Dashboard traffic"
coder exp scaletest dashboard \
	--count "${SCALETEST_NUM_WORKSPACES}" \
	--job-timeout 5m \
	--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
end_phase

wait_baseline 5

log "Scaletest complete!"
set_status Complete
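For orientation, after a successful run the per-run directory should contain roughly the following (a sketch inferred from the variables in lib.sh and main.tf, not an authoritative listing):

    /home/coder/scaletest-<timestamp>/
        logs/                          # CODER_AGENT_LOG_DIR
        state/phase                    # START:/END: lines from start_phase/end_phase
        state/status                   # appended by set_status
        results/create-workspaces.json
        results/traffic-ssh.json
        results/traffic-reconnectingpty.json
        results/traffic-dashboard.json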
@@ -0,0 +1,14 @@
#!/bin/bash
set -e

[[ $VERBOSE == 1 ]] && set -x

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

cleanup() {
	coder tokens remove scaletest_runner >/dev/null 2>&1 || true
}
trap cleanup EXIT

"${SCRIPTS_DIR}/cleanup.sh" shutdown
@@ -0,0 +1,52 @@
#!/bin/bash
set -euo pipefail

[[ $VERBOSE == 1 ]] && set -x

# Unzip scripts and add to path.
# shellcheck disable=SC2153
echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..."
base64 -d <<<"${SCRIPTS_ZIP}" >/tmp/scripts.zip
rm -rf "${SCRIPTS_DIR}" || true
mkdir -p "${SCRIPTS_DIR}"
unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}"
rm /tmp/scripts.zip

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

# Show failure in the UI if script exits with error.
failed_status=Failed
on_exit() {
	trap - ERR EXIT

	case "${SCALETEST_CLEANUP_STRATEGY}" in
	on_stop)
		# Handled by shutdown script.
		;;
	on_success)
		if [[ $(get_status) != "${failed_status}" ]]; then
			"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
		fi
		;;
	on_error)
		if [[ $(get_status) = "${failed_status}" ]]; then
			"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
		fi
		;;
	*)
		"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
		;;
	esac
}
trap on_exit EXIT

on_err() {
	log "Scaletest failed!"
	set_status "${failed_status}"
	lock_status # Ensure we never rewrite the status after a failure.
}
trap on_err ERR

"${SCRIPTS_DIR}/prepare.sh"
"${SCRIPTS_DIR}/run.sh"