feat(scaletest/templates): add support for concurrent scenarios (#11753)

Authored by Mathias Fredriksson on 2024-01-30 14:54:54 +02:00, committed by GitHub
parent 4b27c77969
commit 83eea2d323
6 changed files with 460 additions and 80 deletions

View File

@@ -12,11 +12,12 @@ terraform {
}
resource "time_static" "start_time" {
# We con't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`. The permission check is recreated on
# start, which will update the timestamp.
# We don't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`, but we want to trigger recreation when
# the scaletest is restarted.
triggers = {
count : length(null_resource.permission_check)
count : data.coder_workspace.me.start_count
token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
}
}
@@ -39,8 +40,6 @@ locals {
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
home_disk_size = 10
scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
@@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" {
}
}
data "coder_parameter" "cleanup_prepare" {
order = 14
type = "bool"
name = "Cleanup before scaletest"
default = true
description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
mutable = true
ephemeral = true
}
data "coder_parameter" "workspace_template" {
order = 20
@@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
}
}
data "coder_parameter" "skip_create_workspaces" {
order = 22
type = "bool"
name = "DEBUG: Skip creating workspaces"
default = false
description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
mutable = true
}
data "coder_parameter" "load_scenarios" {
order = 22
order = 23
name = "Load Scenarios"
type = "list(string)"
description = "The load scenarios to run."
@@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" {
default = jsonencode([
"SSH Traffic",
"Web Terminal Traffic",
"App Traffic",
"Dashboard Traffic",
])
}
data "coder_parameter" "load_scenario_run_concurrently" {
order = 24
name = "Run Load Scenarios Concurrently"
type = "bool"
default = false
description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
mutable = true
}
data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
order = 25
name = "Load Scenario Concurrency Stagger Delay"
type = "number"
default = 3
description = "The number of minutes to wait between starting each load scenario when run concurrently."
mutable = true
}
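
To make the interplay of these two parameters concrete, here is a small illustrative shell sketch (the scenario names and the 3-minute default are example values): with concurrency enabled, the runner launches each scenario and then sleeps for the stagger delay, so the n-th scenario starts roughly n times the delay after the first.

    # Illustrative only: expected start offsets with a 3-minute stagger.
    stagger_mins=3
    scenarios=("SSH Traffic" "Web Terminal Traffic" "Dashboard Traffic")
    for i in "${!scenarios[@]}"; do
        echo "scenario '${scenarios[$i]}' starts at ~t+$((i * stagger_mins))m"
    done
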
data "coder_parameter" "load_scenario_ssh_traffic_duration" {
order = 23
order = 30
name = "SSH Traffic Duration"
type = "number"
description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
}
data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
order = 24
order = 31
name = "SSH Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
}
data "coder_parameter" "load_scenario_ssh_tick_interval" {
order = 25
order = 32
name = "SSH Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
}
}
data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
order = 33
name = "SSH Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for SSH traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
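
For a worked example of how a traffic percentage becomes a workspace target range, mirroring the jq expression used later in run.sh in this diff (the workspace count and percentage below are made-up values):

    # Sketch: floor(percentage / 100 * num_workspaces) workspaces are
    # targeted, expressed as a half-open range start:end.
    num_workspaces=10
    percentage=30
    target_start=0
    target_count=$(jq -n \
        --argjson percentage "${percentage}" \
        --argjson num_workspaces "${num_workspaces}" \
        '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    echo "--target-workspaces ${target_start}:${target_end}" # prints 0:3
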
data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
order = 26
order = 40
name = "Web Terminal Traffic Duration"
type = "number"
description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
}
data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
order = 27
order = 41
name = "Web Terminal Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
}
data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
order = 28
order = 42
name = "Web Terminal Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
}
}
data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
order = 43
name = "Web Terminal Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for web terminal traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
data "coder_parameter" "load_scenario_app_traffic_duration" {
order = 50
name = "App Traffic Duration"
type = "number"
description = "The duration of the app traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}
data "coder_parameter" "load_scenario_app_bytes_per_tick" {
order = 51
name = "App Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the app traffic load scenario."
mutable = true
default = 1024
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_app_tick_interval" {
order = 52
name = "App Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the app traffic load scenario."
mutable = true
default = 100
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_app_traffic_percentage" {
order = 53
name = "App Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for app traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
data "coder_parameter" "load_scenario_app_traffic_mode" {
order = 54
name = "App Traffic Mode"
default = "wsec"
description = "The mode of the app traffic load scenario."
mutable = true
option {
name = "WebSocket Echo"
value = "wsec"
description = "Send traffic to the workspace via the app websocket and read it back."
}
option {
name = "WebSocket Read (Random)"
value = "wsra"
description = "Read traffic from the workspace via the app websocket."
}
option {
name = "WebSocket Write (Discard)"
value = "wsdi"
description = "Send traffic to the workspace via the app websocket."
}
}
data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
order = 29
order = 60
name = "Dashboard Traffic Duration"
type = "number"
description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
}
}
data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
order = 61
name = "Dashboard Traffic Percentage"
type = "number"
description = "The percentage of users that should be targeted for dashboard traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
data "coder_parameter" "load_scenario_baseline_duration" {
order = 26
order = 100
name = "Baseline Wait Duration"
type = "number"
description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
}
data "coder_parameter" "greedy_agent" {
order = 30
order = 200
type = "bool"
name = "Greedy Agent"
default = false
@@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" {
}
data "coder_parameter" "greedy_agent_template" {
order = 31
order = 201
name = "Greedy Agent Template"
display_name = "Greedy Agent Template"
description = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +581,7 @@ resource "coder_agent" "main" {
SCALETEST_RUN_ID : local.scaletest_run_id,
SCALETEST_RUN_DIR : local.scaletest_run_dir,
SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
SCALETEST_PROMETHEUS_START_PORT : "21112",
# Comment is a scaletest param, but we want to surface it separately from
# the rest, so we use a different name.
@@ -440,16 +590,28 @@ resource "coder_agent" "main" {
SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
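
The boolean parameters above are serialized as "1"/"0" strings so the runner scripts can test them directly; a minimal sketch of the consuming side (the variable name matches the env map above, the echo messages are illustrative):

    # Sketch: reading a boolean parameter in the runner scripts.
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
        echo "load scenarios will be launched in parallel"
    else
        echo "load scenarios will run sequentially"
    fi
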
@@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" {
}
}
resources {
# Set requests and limits values such that we can do performant
# execution of `coder scaletest` commands.
requests = {
"cpu" = "250m"
"memory" = "512Mi"
}
limits = {
"cpu" = "${local.cpu}"
"memory" = "${local.memory}Gi"
}
}
volume_mount {
mount_path = "/home/coder"
name = "home"
read_only = false
}
port {
container_port = 21112
name = "prometheus-http"
protocol = "TCP"
dynamic "port" {
for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
iterator = it
content {
container_port = 21112 + it.key
name = "prom-http${it.key}"
protocol = "TCP"
}
}
}
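
Because for_each iterates the decoded scenario list, it.key is the zero-based index, so each concurrently running scenario gets its own container port and metrics endpoint starting at 21112. A rough sketch of the resulting mapping, assuming the default scenario list:

    # Illustrative only: scenario index -> container port / port name.
    start_port=21112
    scenarios=("SSH Traffic" "Web Terminal Traffic" "App Traffic" "Dashboard Traffic")
    for i in "${!scenarios[@]}"; do
        echo "${scenarios[$i]} -> port $((start_port + i)) (prom-http${i})"
    done
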
@@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" {
}
}
podMetricsEndpoints = [
{
port = "prometheus-http"
# NOTE(mafredri): We could add more information here by including the
# scenario name in the port name (although it's limited to 15 chars so
# it needs to be short). That said, someone looking at the stats can
# assume that there's a 1-to-1 mapping between scenario# and port.
for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
port = "prom-http${i}"
interval = "15s"
}
]
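
Kubernetes caps port names at 15 characters, which is why the note above keeps the prom-http<index> scheme terse; a quick, purely illustrative length check:

    # Even a two-digit scenario index keeps the name well under the
    # 15-character limit for Kubernetes port names.
    name="prom-http12"
    echo "${name}: ${#name} characters" # 11
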

View File

@@ -12,29 +12,51 @@ if [[ -z $event ]]; then
event=manual
fi
if [[ $event = manual ]]; then
do_cleanup() {
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 2h \
--cleanup-timeout 5h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase
}
do_scaledown() {
start_phase "Scale down provisioners (${event})"
maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
end_phase
}
case "${event}" in
manual)
echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) '
read -r -n 1
if [[ $REPLY != [yY] ]]; then
echo $'\nAborting...'
exit 1
fi
fi
echo
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 2h \
--cleanup-timeout 5h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase
do_cleanup
do_scaledown
if [[ $event != prepare ]]; then
start_phase "Scaling down provisioners..."
maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
fi
if [[ $event = manual ]]; then
echo 'Press any key to continue...'
read -s -r -n 1
fi
;;
prepare)
do_cleanup
;;
on_stop) ;; # Do nothing, handled by "shutdown".
always | on_success | on_error | shutdown)
do_cleanup
do_scaledown
;;
shutdown_scale_down_only)
do_scaledown
;;
*)
echo "Unknown event: ${event}" >&2
exit 1
;;
esac
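
The refactor splits the script into do_cleanup and do_scaledown helpers and dispatches on the event name, so each caller gets exactly the behaviour it needs; hedged usage examples (the relative path is illustrative):

    # Illustrative invocations of the refactored cleanup script:
    ./cleanup.sh prepare                  # cleanup only, no provisioner scale-down
    ./cleanup.sh shutdown                 # cleanup, then scale provisioners down
    ./cleanup.sh shutdown_scale_down_only # scale provisioners down, skip cleanup
    ./cleanup.sh manual                   # prompts for confirmation first
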

View File

@@ -47,8 +47,10 @@ unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
log "Cleaning up from previous runs (if applicable)..."
"${SCRIPTS_DIR}/cleanup.sh" "prepare"
if [[ ${SCALETEST_PARAM_CLEANUP_PREPARE} == 1 ]]; then
log "Cleaning up from previous runs (if applicable)..."
"${SCRIPTS_DIR}/cleanup.sh" prepare
fi
log "Preparation complete!"

View File

@@ -13,15 +13,21 @@ log "Running scaletest..."
set_status Running
start_phase "Creating workspaces"
coder exp scaletest create-workspaces \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--timeout 5h \
--job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then
# Note that we allow up to 5 failures to bring up the workspace, since
# we're creating a lot of workspaces at once and some of them may fail
# due to network issues or other transient errors.
coder exp scaletest create-workspaces \
--retry 5 \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--timeout 5h \
--job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
fi
end_phase
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
@@ -86,20 +92,60 @@ else
fi
annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic"
return ${status}
return "${status}"
}
fi
run_scenario_cmd() {
local scenario=${1}
shift
local command=("$@")
set +e
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
annotate_grafana scenario "Load scenario: ${scenario}"
fi
"${command[@]}"
status=${?}
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
export GRAFANA_ADD_TAGS=
if [[ ${status} != 0 ]]; then
GRAFANA_ADD_TAGS=error
fi
annotate_grafana_end scenario "Load scenario: ${scenario}"
fi
exit "${status}"
}
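
Note that run_scenario_cmd ends with exit, so it is meant to be launched in a background subshell (as every scenario arm below does); the exit then only terminates that subshell and the parent collects the status with wait. A minimal sketch (the sleep stands in for a real scaletest command):

    # Sketch: backgrounding run_scenario_cmd and collecting its status.
    pids=()
    run_scenario_cmd "SSH Traffic" sleep 1 &
    pids+=($!)
    wait "${pids[-1]}"
    echo "scenario exited with status $?"
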
declare -a pids=()
declare -A pid_to_scenario=()
declare -A failed=()
target_start=0
target_end=-1
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
fi
for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
start_phase "Load scenario: ${scenario}"
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
start_phase "Load scenario: ${scenario}"
fi
set +e
status=0
case "${scenario}" in
"SSH Traffic")
greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" &
coder exp scaletest workspace-traffic \
greedy_agent_traffic_pid=$!
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--ssh \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
@@ -107,55 +153,160 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \
"${non_greedy_agent_traffic_args[@]}"
status=$?
wait
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--target-workspaces "${target_start}:${target_end}" \
"${non_greedy_agent_traffic_args[@]}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
wait "${greedy_agent_traffic_pid}"
status2=$?
if [[ ${status} == 0 ]]; then
status=${status2}
fi
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
;;
"Web Terminal Traffic")
greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" &
coder exp scaletest workspace-traffic \
greedy_agent_traffic_pid=$!
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \
"${non_greedy_agent_traffic_args[@]}"
status=$?
wait
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--target-workspaces "${target_start}:${target_end}" \
"${non_greedy_agent_traffic_args[@]}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
wait "${greedy_agent_traffic_pid}"
status2=$?
if [[ ${status} == 0 ]]; then
status=${status2}
fi
;;
"App Traffic")
greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" &
greedy_agent_traffic_pid=$!
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \
--target-workspaces "${target_start}:${target_end}" \
"${non_greedy_agent_traffic_args[@]}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
wait "${greedy_agent_traffic_pid}"
status2=$?
if [[ ${status} == 0 ]]; then
status=${status2}
fi
show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
;;
"Dashboard Traffic")
coder exp scaletest dashboard \
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
# TODO: Remove this once the dashboard traffic command is fixed
# (i.e. once images are no longer dumped into PWD).
mkdir -p dashboard
pushd dashboard
run_scenario_cmd "${scenario}" coder exp scaletest dashboard \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--target-users "${target_start}:${target_end}" \
>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" &
pids+=($!)
popd
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
# Debug scenarios, for testing the runner.
"debug:greedy_agent_traffic")
greedy_agent_traffic 10 "${scenario}"
status=$?
greedy_agent_traffic 10 "${scenario}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
"debug:success")
maybedryrun "$DRY_RUN" sleep 10
status=0
{
maybedryrun "$DRY_RUN" sleep 10
true
} &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
"debug:error")
maybedryrun "$DRY_RUN" sleep 10
status=1
{
maybedryrun "$DRY_RUN" sleep 10
false
} &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
*)
@@ -163,9 +314,22 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
;;
esac
set -e
# Allow targeting to be distributed evenly across workspaces when each
# scenario is run concurrently and all percentages add up to 100.
target_start=${target_end}
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
# Stagger the start of each scenario to avoid a burst of load and detect
# problematic scenarios.
sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60))
continue
fi
if ((status > 0)); then
log "Load scenario failed: ${scenario} (exit=${status})"
failed+=(["${scenario}"]="$status")
failed+=(["${scenario}"]="${status}")
PHASE_ADD_TAGS=error end_phase
else
end_phase
@@ -173,6 +337,25 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
done
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
wait "${pids[@]}"
# Waiting on all pids blocks until every scenario has exited, but we still
# need to check their individual exit codes.
for pid in "${pids[@]}"; do
wait "${pid}"
status=${?}
scenario=${pid_to_scenario[${pid}]}
if ((status > 0)); then
log "Load scenario failed: ${scenario} (exit=${status})"
failed+=(["${scenario}"]="${status}")
fi
done
if ((${#failed[@]} > 0)); then
PHASE_ADD_TAGS=error end_phase
else
end_phase
fi
fi
if ((${#failed[@]} > 0)); then
log "Load scenarios failed: ${!failed[*]}"

View File

@@ -14,7 +14,11 @@ trap cleanup EXIT
annotate_grafana "workspace" "Agent stopping..."
"${SCRIPTS_DIR}/cleanup.sh" shutdown
shutdown_event=shutdown_scale_down_only
if [[ ${SCALETEST_PARAM_CLEANUP_STRATEGY} == on_stop ]]; then
shutdown_event=shutdown
fi
"${SCRIPTS_DIR}/cleanup.sh" "${shutdown_event}"
annotate_grafana_end "workspace" "Agent running"

View File

@@ -8,6 +8,11 @@ if [[ ${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE} == "${SCALETEST_PARAM_TEMPLATE}"
exit 1
fi
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]] && [[ ${SCALETEST_PARAM_GREEDY_AGENT} == 1 ]]; then
echo "ERROR: Load scenario concurrency and greedy agent test cannot be enabled at the same time." >&2
exit 1
fi
# Unzip scripts and add to path.
# shellcheck disable=SC2153
echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..."