feat(scaletest): add grafana annotations and slack reporting (#9852)

Fixes #9575 Fixes #9576
2023-09-27 14:44:11 +03:00 · 2023-09-27 14:44:11 +03:00 · d8515f02af
parent 4e442040f7
commit d8515f02af
8 changed files with 495 additions and 78 deletions
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@ -35,14 +35,18 @@ resource "null_resource" "permission_check" {
 }

 locals {
-  workspace_pod_name     = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
-  workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
-  service_account_name   = "scaletest-sa"
-  cpu                    = 2
-  memory                 = 2
-  home_disk_size         = 10
-  scaletest_run_id       = "scaletest-${time_static.start_time.rfc3339}"
-  scaletest_run_dir      = "/home/coder/${local.scaletest_run_id}"
+  workspace_pod_name                             = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
+  workspace_pod_instance                         = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
+  workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
+  service_account_name                           = "scaletest-sa"
+  cpu                                            = 16
+  memory                                         = 64
+  home_disk_size                                 = 10
+  scaletest_run_id                               = "scaletest-${time_static.start_time.rfc3339}"
+  scaletest_run_dir                              = "/home/coder/${local.scaletest_run_id}"
+  grafana_url                                    = "https://stats.dev.c8s.io"
+  grafana_dashboard_uid                          = "qLVSTR-Vz"
+  grafana_dashboard_name                         = "coderv2-loadtest-dashboard"
 }

 data "coder_provisioner" "me" {
@ -91,15 +95,14 @@ data "coder_parameter" "job_concurrency" {
  order       = 11
  type        = "number"
  name        = "Job concurrency"
-  default     = 10
+  default     = 0
  description = "The number of concurrent jobs (e.g. when producing workspace traffic)."
  mutable     = true

  # Setting zero = unlimited, but perhaps not a good idea,
  # we can raise this limit instead.
  validation {
-    min = 1
-    max = 100
+    min = 0
  }
 }

@ -197,6 +200,121 @@ data "coder_parameter" "num_workspaces" {
  }
 }

+
+# Which load scenarios to run, as a JSON-encoded list of scenario names.
+# Passed to the agent as SCALETEST_PARAM_LOAD_SCENARIOS; the names must match
+# the `case` arms in scripts/run.sh. All three scenarios are enabled by default.
+data "coder_parameter" "load_scenarios" {
+  order       = 22
+  name        = "Load Scenarios"
+  type        = "list(string)"
+  description = "The load scenarios to run."
+  mutable     = true
+  ephemeral   = true
+  default = jsonencode([
+    "SSH Traffic",
+    "Web Terminal Traffic",
+    "Dashboard Traffic",
+  ])
+}
+
+# Duration of the "SSH Traffic" scenario in minutes (1 minute to 24 hours).
+# scripts/run.sh appends the "m" unit when passing it as --timeout.
+data "coder_parameter" "load_scenario_ssh_traffic_duration" {
+  order       = 23
+  name        = "SSH Traffic Duration"
+  type        = "number"
+  description = "The duration of the SSH traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+# Payload size per tick for the "SSH Traffic" scenario; forwarded to
+# `coder exp scaletest workspace-traffic --bytes-per-tick` by scripts/run.sh.
+data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
+  order       = 24
+  name        = "SSH Bytes Per Tick"
+  type        = "number"
+  description = "The number of bytes to send per tick in the SSH traffic load scenario."
+  mutable     = true
+  default     = 1024
+  validation {
+    min = 1
+  }
+}
+
+# Tick interval for the "SSH Traffic" scenario in milliseconds.
+# scripts/run.sh appends the "ms" unit when passing it as --tick-interval.
+data "coder_parameter" "load_scenario_ssh_tick_interval" {
+  order       = 25
+  name        = "SSH Tick Interval"
+  type        = "number"
+  description = "The number of milliseconds between each tick in the SSH traffic load scenario."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+  }
+}
+
+# Duration of the "Web Terminal Traffic" scenario in minutes (1 minute to
+# 24 hours). scripts/run.sh appends the "m" unit when passing it as --timeout.
+data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
+  order       = 26
+  name        = "Web Terminal Traffic Duration"
+  type        = "number"
+  description = "The duration of the web terminal traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+# Payload size per tick for the "Web Terminal Traffic" scenario; forwarded to
+# `coder exp scaletest workspace-traffic --bytes-per-tick` by scripts/run.sh.
+data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
+  order       = 27
+  name        = "Web Terminal Bytes Per Tick"
+  type        = "number"
+  description = "The number of bytes to send per tick in the web terminal traffic load scenario."
+  mutable     = true
+  default     = 1024
+  validation {
+    min = 1
+  }
+}
+
+# Tick interval for the "Web Terminal Traffic" scenario in milliseconds.
+# scripts/run.sh appends the "ms" unit when passing it as --tick-interval.
+data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
+  order       = 28
+  name        = "Web Terminal Tick Interval"
+  type        = "number"
+  description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+  }
+}
+
+# Duration of the "Dashboard Traffic" scenario in minutes (1 minute to
+# 24 hours). scripts/run.sh appends the "m" unit when passing it as --timeout.
+data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
+  order       = 29
+  name        = "Dashboard Traffic Duration"
+  type        = "number"
+  description = "The duration of the dashboard traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+# Idle time in minutes used by scripts/run.sh for `wait_baseline`, which runs
+# after workspace creation and after every load scenario. Zero disables the wait.
+data "coder_parameter" "load_scenario_baseline_duration" {
+  # Placed after the other load scenario parameters (22-29). The previous value
+  # (26) collided with "Web Terminal Traffic Duration".
+  order       = 30
+  name        = "Baseline Wait Duration"
+  type        = "number"
+  description = "The duration to wait before starting a load scenario in minutes."
+  mutable     = true
+  default     = 5
+  validation {
+    min = 0
+    max = 60
+  }
+}
+
 data "coder_parameter" "namespace" {
  order       = 999
  type        = "string"
@ -221,6 +339,8 @@ resource "coder_agent" "main" {
    CODER_CONFIG_DIR : "/home/coder/.config/coderv2",
    CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token,
    CODER_URL : data.coder_workspace.me.access_url,
+    CODER_USER : data.coder_workspace.me.owner,
+    CODER_WORKSPACE : data.coder_workspace.me.name,

    # Global scaletest envs that may affect each `coder exp scaletest` invocation.
    CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112",
@ -228,14 +348,29 @@ resource "coder_agent" "main" {
    CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
    CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",

+    # Expose as params as well, for reporting (TODO(mafredri): refactor, only have one).
+    SCALETEST_PARAM_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
+    SCALETEST_PARAM_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
+
    # Local envs passed as arguments to `coder exp scaletest` invocations.
    SCALETEST_RUN_ID : local.scaletest_run_id,
    SCALETEST_RUN_DIR : local.scaletest_run_dir,
-    SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value,
-    SCALETEST_SKIP_CLEANUP : "1",
-    SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
-    SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
-    SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
+
+    SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
+    SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
+    SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
+    SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
+    SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
+
+    GRAFANA_URL : local.grafana_url,

    SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path),
    SCRIPTS_DIR : "/tmp/scripts",
@ -244,12 +379,13 @@ resource "coder_agent" "main" {
    vscode     = false
    ssh_helper = false
  }
-  startup_script_timeout  = 3600
-  shutdown_script_timeout = 1800
+  startup_script_timeout  = 86400
+  shutdown_script_timeout = 7200
  startup_script_behavior = "blocking"
  startup_script          = file("startup.sh")
  shutdown_script         = file("shutdown.sh")

+  # IDEA(mafredri): It would be pretty cool to define metadata to expect JSON output, each field/item could become a separate metadata item.
  # Scaletest metadata.
  metadata {
    display_name = "Scaletest status"
@ -332,7 +468,7 @@ resource "coder_app" "grafana" {
  agent_id     = coder_agent.main.id
  slug         = "00-grafana"
  display_name = "Grafana"
-  url          = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
+  url          = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
  icon         = "https://grafana.com/static/assets/img/fav32.png"
  external     = true
 }
@ -409,7 +545,7 @@ resource "kubernetes_pod" "main" {
  }
  # Set the pod delete timeout to termination_grace_period_seconds + 1m.
  timeouts {
-    delete = "32m"
+    # (grace period in seconds + 120) / 60 is a number of MINUTES, so the unit
+    # suffix must be "m". With "s" the delete would time out after ~2 minutes,
+    # long before the pod's termination grace period has elapsed.
+    delete = "${(local.workspace_pod_termination_grace_period_seconds + 120) / 60}m"
  }
  spec {
    security_context {
@ -421,8 +557,9 @@ resource "kubernetes_pod" "main" {
    service_account_name = local.service_account_name

    # Allow the coder agent to perform graceful shutdown and cleanup of
-    # scaletest resources, 30 minutes (cleanup timeout) + 1 minute.
-    termination_grace_period_seconds = 1860
+    # scaletest resources. We add an extra minute to ensure work
+    # completion is prioritized over timeout.
+    termination_grace_period_seconds = local.workspace_pod_termination_grace_period_seconds + 60

    container {
      name              = "dev"
@ -440,6 +577,24 @@ resource "kubernetes_pod" "main" {
        name  = "CODER_AGENT_LOG_DIR"
        value = "${local.scaletest_run_dir}/logs"
      }
+      env {
+        name = "GRAFANA_API_TOKEN"
+        value_from {
+          secret_key_ref {
+            name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name
+            key  = "token"
+          }
+        }
+      }
+      env {
+        name = "SLACK_WEBHOOK_URL"
+        value_from {
+          secret_key_ref {
+            name = data.kubernetes_secret.slack_scaletest_notifications_webhook_url.metadata[0].name
+            key  = "url"
+          }
+        }
+      }
      resources {
        # Set requests and limits values such that we can do performant
        # execution of `coder scaletest` commands.
@ -496,7 +651,7 @@ resource "kubernetes_pod" "main" {
            match_expressions {
              key      = "cloud.google.com/gke-nodepool"
              operator = "In"
-              values   = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces.
+              values   = ["big-workspacetraffic"] # Avoid placing on the same nodes as scaletest workspaces.
            }
          }
        }
@ -505,6 +660,20 @@ resource "kubernetes_pod" "main" {
  }
 }

+# Existing secret in the workspace namespace; its "token" key is injected into
+# the runner container as GRAFANA_API_TOKEN.
+data "kubernetes_secret" "grafana_editor_api_token" {
+  metadata {
+    name      = "grafana-editor-api-token"
+    namespace = data.coder_parameter.namespace.value
+  }
+}
+
+# Existing secret in the workspace namespace; its "url" key is injected into
+# the runner container as SLACK_WEBHOOK_URL.
+data "kubernetes_secret" "slack_scaletest_notifications_webhook_url" {
+  metadata {
+    name      = "slack-scaletest-notifications-webhook-url"
+    namespace = data.coder_parameter.namespace.value
+  }
+}
+
 resource "kubernetes_manifest" "pod_monitor" {
  count = data.coder_workspace.me.start_count
  manifest = {
--- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh
+++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
@ -24,7 +24,7 @@ fi
 start_phase "Cleanup (${event})"
 coder exp scaletest cleanup \
 	--cleanup-job-timeout 15m \
-	--cleanup-timeout 30m |
+	--cleanup-timeout 2h |
 	tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
 end_phase

--- a/scaletest/templates/scaletest-runner/scripts/lib.sh
+++ b/scaletest/templates/scaletest-runner/scripts/lib.sh
@ -33,7 +33,13 @@ set_status() {
 	if [[ ${DRY_RUN} == 1 ]]; then
 		dry_run=" (dry-ryn)"
 	fi
+	prev_status=$(get_status)
+	if [[ ${prev_status} != *"Not started"* ]]; then
+		annotate_grafana_end "status" "Status: ${prev_status}"
+	fi
 	echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status"
+
+	annotate_grafana "status" "Status: ${*}"
 }
 lock_status() {
 	chmod 0440 "${SCALETEST_STATE_DIR}/status"
@ -51,25 +57,29 @@ phase_num=0
 start_phase() {
 	# This may be incremented from another script, so we read it every time.
 	if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
-		phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")"
+		phase_num=$(grep -c START: "${SCALETEST_PHASE_FILE}")
 	fi
 	phase_num=$((phase_num + 1))
 	log "Start phase ${phase_num}: ${*}"
 	echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}"
+
+	GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana "phase" "Phase ${phase_num}: ${*}"
 }
 end_phase() {
-	phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)"
+	phase=$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)
 	if [[ -z ${phase} ]]; then
 		log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}"
 		exit 1
 	fi
 	log "End phase ${phase_num}: ${phase}"
 	echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}"
+
+	GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana_end "phase" "Phase ${phase_num}: ${phase}"
 }
 get_phase() {
 	if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
-		phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")"
-		phase="$(echo "${phase_raw}" | cut -d' ' -f3-)"
+		phase_raw=$(tail -n1 "${SCALETEST_PHASE_FILE}")
+		phase=$(echo "${phase_raw}" | cut -d' ' -f3-)
 		if [[ ${phase_raw} == *"END:"* ]]; then
 			phase+=" [done]"
 		fi
@ -86,9 +96,117 @@ get_previous_phase() {
 	fi
 }

+annotate_grafana() {
+	local tags=${1} text=${2} start=${3:-$(($(date +%s) * 1000))}
+	local json resp id
+
+	if [[ -z $tags ]]; then
+		tags="scaletest,runner"
+	else
+		tags="scaletest,runner,${tags}"
+	fi
+	if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
+		tags="${tags},${GRAFANA_EXTRA_TAGS}"
+	fi
+
+	log "Annotating Grafana (start=${start}): ${text} [${tags}]"
+
+	json="$(
+		jq \
+			--argjson time "${start}" \
+			--arg text "${text}" \
+			--arg tags "${tags}" \
+			'{time: $time, tags: $tags | split(","), text: $text}' <<<'{}'
+	)"
+	if [[ ${DRY_RUN} == 1 ]]; then
+		log "Would have annotated Grafana, data=${json}"
+		return 0
+	fi
+	if ! resp="$(
+		curl -sSL \
+			--insecure \
+			-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
+			-H "Content-Type: application/json" \
+			-d "${json}" \
+			"${GRAFANA_URL}/api/annotations"
+	)"; then
+		# Don't abort scaletest just because we couldn't annotate Grafana.
+		log "Failed to annotate Grafana: ${resp}"
+		return 0
+	fi
+
+	if [[ $(jq -r '.message' <<<"${resp}") != "Annotation added" ]]; then
+		log "Failed to annotate Grafana: ${resp}"
+		return 0
+	fi
+
+	log "Grafana annotation added!"
+
+	if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then
+		mkdir -p "${SCALETEST_STATE_DIR}"
+	fi
+	id="$(jq -r '.id' <<<"${resp}")"
+	echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
+}
+annotate_grafana_end() {
+	local tags=${1} text=${2} start=${3:-} end=${4:-$(($(date +%s) * 1000))}
+	local id json resp
+
+	if [[ -z $tags ]]; then
+		tags="scaletest,runner"
+	else
+		tags="scaletest,runner,${tags}"
+	fi
+	if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
+		tags="${tags},${GRAFANA_EXTRA_TAGS}"
+	fi
+
+	if [[ ${DRY_RUN} == 1 ]]; then
+		log "Would have updated Grafana annotation (end=${end}): ${text} [${tags}]"
+		return 0
+	fi
+
+	if ! id=$(grep ":${tags}:${text}:${start}" "${SCALETEST_STATE_DIR}/grafana-annotations" | sort -n | tail -n1 | cut -d: -f1); then
+		log "NOTICE: Could not find Grafana annotation to end: '${tags}:${text}:${start}', skipping..."
+		return 0
+	fi
+
+	log "Annotating Grafana (end=${end}): ${text} [${tags}]"
+
+	json="$(
+		jq \
+			--argjson timeEnd "${end}" \
+			'{timeEnd: $timeEnd}' <<<'{}'
+	)"
+	if [[ ${DRY_RUN} == 1 ]]; then
+		log "Would have patched Grafana annotation: id=${id}, data=${json}"
+		return 0
+	fi
+	if ! resp="$(
+		curl -sSL \
+			--insecure \
+			-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
+			-H "Content-Type: application/json" \
+			-X PATCH \
+			-d "${json}" \
+			"${GRAFANA_URL}/api/annotations/${id}"
+	)"; then
+		# Don't abort scaletest just because we couldn't annotate Grafana.
+		log "Failed to annotate Grafana end: ${resp}"
+		return 0
+	fi
+
+	if [[ $(jq -r '.message' <<<"${resp}") != "Annotation patched" ]]; then
+		log "Failed to annotate Grafana end: ${resp}"
+		return 0
+	fi
+
+	log "Grafana annotation patched!"
+}
+
 wait_baseline() {
 	s=${1:-2}
-	start_phase "Waiting ${s}m to establish baseline"
+	PHASE_TYPE="phase-wait" start_phase "Waiting ${s}m to establish baseline"
 	maybedryrun "$DRY_RUN" sleep $((s * 60))
-	end_phase
+	PHASE_TYPE="phase-wait" end_phase
 }
--- a/scaletest/templates/scaletest-runner/scripts/prepare.sh
+++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh
@ -28,13 +28,6 @@ for dir in "${HOME}/scaletest-"*; do
 	fi
 done

-log "Cloning coder/coder repo..."
-
-if [[ ! -d "${HOME}/coder" ]]; then
-	git clone https://github.com/coder/coder.git "${HOME}/coder"
-fi
-(cd "${HOME}/coder" && git pull)
-
 log "Creating coder CLI token (needed for cleanup during shutdown)..."

 mkdir -p "${CODER_CONFIG_DIR}"
--- a/scaletest/templates/scaletest-runner/scripts/report.sh
+++ b/scaletest/templates/scaletest-runner/scripts/report.sh
@ -0,0 +1,104 @@
+#!/bin/bash
+set -euo pipefail
+
+[[ $VERBOSE == 1 ]] && set -x
+
+status=$1
+shift
+
+case "${status}" in
+started) ;;
+completed) ;;
+failed) ;;
+*)
+	echo "Unknown status: ${status}" >&2
+	exit 1
+	;;
+esac
+
+# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
+. "${SCRIPTS_DIR}/lib.sh"
+
+# NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`.
+# https://github.com/coder/coder/issues/9877
+CODER_URL="${CODER_URL%/}"
+buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")"
+server_version="$(jq -r '.version' <<<"${buildinfo}")"
+server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"
+
+# Since `coder show` doesn't support JSON output, we list the workspaces instead.
+workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
+owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
+workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
+initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
+
+bullet='•'
+app_urls_raw="$(jq -r '.latest_build.resources[].agents[]?.apps | map(select(.external == true)) | .[] | .display_name, .url' <<<"${workspace_json}")"
+app_urls=()
+while read -r app_name; do
+	read -r app_url
+	bold=
+	if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then
+		# Update Grafana URL with end stamp and make bold.
+		app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
+		bold='*'
+	fi
+	app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
+done <<<"${app_urls_raw}"
+
+params=()
+header=
+
+case "${status}" in
+started)
+	created_at="$(jq -r '.latest_build.created_at' <<<"${workspace_json}")"
+	params=("${bullet} Options:")
+	while read -r param; do
+		params+=("    ${bullet} ${param}")
+	done <<<"$(jq -r '.latest_build.resources[].agents[]?.environment_variables | to_entries | map(select(.key | startswith("SCALETEST_PARAM_"))) | .[] | "`\(.key)`: `\(.value)`"' <<<"${workspace_json}")"
+
+	header="New scaletest started at \`${created_at}\` by \`${initiator_name}\` on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
+	;;
+completed)
+	completed_at=$(date -Iseconds)
+	header="Scaletest completed at \`${completed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
+	;;
+failed)
+	failed_at=$(date -Iseconds)
+	header="Scaletest failed at \`${failed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
+	;;
+*)
+	echo "Unknown status: ${status}" >&2
+	exit 1
+	;;
+esac
+
+text_arr=(
+	"${header}"
+	""
+	"${bullet} Workspace (runner): ${CODER_URL}/@${owner_name}/${workspace_name}"
+	"${bullet} Run ID: ${SCALETEST_RUN_ID}"
+	"${app_urls[@]}"
+	"${params[@]}"
+)
+
+text=
+for field in "${text_arr[@]}"; do
+	text+="${field}"$'\n'
+done
+
+json=$(
+	jq -n --arg text "${text}" '{
+		blocks: [
+			{
+				"type": "section",
+				"text": {
+					"type": "mrkdwn",
+					"text": $text
+				}
+			}
+		]
+	}'
+)
+
+maybedryrun "${DRY_RUN}" curl -X POST -H 'Content-type: application/json' --data "${json}" "${SLACK_WEBHOOK_URL}"
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@ -6,54 +6,61 @@ set -euo pipefail
 # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
 . "${SCRIPTS_DIR}/lib.sh"

+mapfile -t scaletest_load_scenarios < <(jq -r '. | join ("\n")' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}")
+export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}")
+
 log "Running scaletest..."
 set_status Running

 start_phase "Creating workspaces"
 coder exp scaletest create-workspaces \
-	--count "${SCALETEST_NUM_WORKSPACES}" \
-	--template "${SCALETEST_TEMPLATE}" \
-	--concurrency "${SCALETEST_CREATE_CONCURRENCY}" \
-	--job-timeout 15m \
+	--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
+	--template "${SCALETEST_PARAM_TEMPLATE}" \
+	--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
+	--job-timeout 2h \
 	--no-cleanup \
 	--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
 show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
 end_phase

-wait_baseline 5
+wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"

-start_phase "SSH traffic"
-coder exp scaletest workspace-traffic \
-	--ssh \
-	--bytes-per-tick 10240 \
-	--tick-interval 1s \
-	--timeout 5m \
-	--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
-show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
-end_phase
+for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
+	start_phase "Load scenario: ${scenario}"
+	case "${scenario}" in
+	"SSH Traffic")
+		coder exp scaletest workspace-traffic \
+			--ssh \
+			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
+			--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \
+			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
+			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
+			--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
+		show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
+		;;
+	"Web Terminal Traffic")
+		coder exp scaletest workspace-traffic \
+			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
+			--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
+			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
+			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
+			--output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
+		show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
+		;;
+	"Dashboard Traffic")
+		coder exp scaletest dashboard \
+			--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
+			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
+			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
+			--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
+			>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
+		show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
+		;;
+	esac
+	end_phase

-wait_baseline 5
-
-start_phase "ReconnectingPTY traffic"
-coder exp scaletest workspace-traffic \
-	--bytes-per-tick 10240 \
-	--tick-interval 1s \
-	--timeout 5m \
-	--output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
-show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
-end_phase
-
-wait_baseline 5
-
-start_phase "Dashboard traffic"
-coder exp scaletest dashboard \
-	--count "${SCALETEST_NUM_WORKSPACES}" \
-	--job-timeout 5m \
-	--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
-show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
-end_phase
-
-wait_baseline 5
+	wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
+done

 log "Scaletest complete!"
 set_status Complete
--- a/scaletest/templates/scaletest-runner/shutdown.sh
+++ b/scaletest/templates/scaletest-runner/shutdown.sh
@ -11,4 +11,8 @@ cleanup() {
 }
 trap cleanup EXIT

+annotate_grafana "workspace" "Agent stopping..."
+
 "${SCRIPTS_DIR}/cleanup.sh" shutdown
+
+annotate_grafana_end "workspace" "Agent running"
--- a/scaletest/templates/scaletest-runner/startup.sh
+++ b/scaletest/templates/scaletest-runner/startup.sh
@ -12,41 +12,63 @@ mkdir -p "${SCRIPTS_DIR}"
 unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}"
 rm /tmp/scripts.zip

+echo "Cloning coder/coder repo..."
+if [[ ! -d "${HOME}/coder" ]]; then
+	git clone https://github.com/coder/coder.git "${HOME}/coder"
+fi
+(cd "${HOME}/coder" && git pull)
+
 # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
 . "${SCRIPTS_DIR}/lib.sh"

+annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
+
 # Show failure in the UI if script exits with error.
 failed_status=Failed
 on_exit() {
 	trap - ERR EXIT
 
-	case "${SCALETEST_CLEANUP_STRATEGY}" in
+	case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
 	on_stop)
 		# Handled by shutdown script.
 		;;
 	on_success)
-		if [[ $(get_status) != "${failed_status}" ]]; then
-			"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
+		# Prefix match: on_err records the status as
+		# "${failed_status} (exit=N)", so an exact comparison would
+		# never detect a failure.
+		# NOTE(review): assumes get_status returns the status text as
+		# passed to set_status -- confirm.
+		if [[ $(get_status) != "${failed_status}"* ]]; then
+			"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
 		fi
 		;;
 	on_error)
-		if [[ $(get_status) = "${failed_status}" ]]; then
-			"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
+		# Prefix match, see on_success above.
+		if [[ $(get_status) = "${failed_status}"* ]]; then
+			"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
 		fi
 		;;
 	*)
-		"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
+		"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
 		;;
 	esac
+
+	annotate_grafana_end "" "Start scaletest"
 }
 trap on_exit EXIT

 on_err() {
+	code=${?}
+	trap - ERR
+	set +e
+
 	log "Scaletest failed!"
-	set_status "${failed_status}"
+	GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
+	"${SCRIPTS_DIR}/report.sh" failed
 	lock_status # Ensure we never rewrite the status after a failure.
 }
 trap on_err ERR

+# Pass session token since `prepare.sh` has not yet run.
+CODER_SESSION_TOKEN=$CODER_USER_TOKEN "${SCRIPTS_DIR}/report.sh" started
+annotate_grafana "" "Start scaletest"
+
 "${SCRIPTS_DIR}/prepare.sh"
+
 "${SCRIPTS_DIR}/run.sh"
+
+"${SCRIPTS_DIR}/report.sh" completed