feat(scaletest/templates): add support for concurrent scenarios (#11753)

Authored by Mathias Fredriksson on 2024-01-30 14:54:54 +02:00, committed by GitHub
parent 4b27c77969
commit 83eea2d323
6 changed files with 460 additions and 80 deletions

View File

@@ -12,11 +12,12 @@ terraform {
}
resource "time_static" "start_time" {
# We con't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`. The permission check is recreated on
# start, which will update the timestamp.
# We don't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`, but we want to trigger recreation when
# the scaletest is restarted.
triggers = {
count : length(null_resource.permission_check)
count : data.coder_workspace.me.start_count
token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
}
}
@@ -39,8 +40,6 @@ locals {
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
home_disk_size = 10
scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
@@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" {
}
}
data "coder_parameter" "cleanup_prepare" {
order = 14
type = "bool"
name = "Cleanup before scaletest"
default = true
description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
mutable = true
ephemeral = true
}
data "coder_parameter" "workspace_template" {
order = 20
@@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
}
}
data "coder_parameter" "skip_create_workspaces" {
order = 22
type = "bool"
name = "DEBUG: Skip creating workspaces"
default = false
description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
mutable = true
}
data "coder_parameter" "load_scenarios" {
order = 22
order = 23
name = "Load Scenarios"
type = "list(string)"
description = "The load scenarios to run."
@@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" {
default = jsonencode([
"SSH Traffic",
"Web Terminal Traffic",
"App Traffic",
"Dashboard Traffic",
])
}
data "coder_parameter" "load_scenario_run_concurrently" {
order = 24
name = "Run Load Scenarios Concurrently"
type = "bool"
default = false
description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
mutable = true
}
data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
order = 25
name = "Load Scenario Concurrency Stagger Delay"
type = "number"
default = 3
description = "The number of minutes to wait between starting each load scenario when run concurrently."
mutable = true
}
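
To make the interplay of these two parameters concrete, here is a small illustrative shell sketch (the scenario names and the 3-minute default are example values): with concurrency enabled, the runner launches each scenario and then sleeps for the stagger delay, so the n-th scenario starts roughly n times the delay after the first.

    # Illustrative only: expected start offsets with a 3-minute stagger.
    stagger_mins=3
    scenarios=("SSH Traffic" "Web Terminal Traffic" "Dashboard Traffic")
    for i in "${!scenarios[@]}"; do
        echo "scenario '${scenarios[$i]}' starts at ~t+$((i * stagger_mins))m"
    done
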
data "coder_parameter" "load_scenario_ssh_traffic_duration" {
order = 23
order = 30
name = "SSH Traffic Duration"
type = "number"
description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
}
data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
order = 24
order = 31
name = "SSH Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
}
data "coder_parameter" "load_scenario_ssh_tick_interval" {
order = 25
order = 32
name = "SSH Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
}
}
data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
order = 33
name = "SSH Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for SSH traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
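
For a worked example of how a traffic percentage becomes a workspace target range, mirroring the jq expression used later in run.sh in this diff (the workspace count and percentage below are made-up values):

    # Sketch: floor(percentage / 100 * num_workspaces) workspaces are
    # targeted, expressed as a half-open range start:end.
    num_workspaces=10
    percentage=30
    target_start=0
    target_count=$(jq -n \
        --argjson percentage "${percentage}" \
        --argjson num_workspaces "${num_workspaces}" \
        '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    echo "--target-workspaces ${target_start}:${target_end}" # prints 0:3
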
data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
order = 26
order = 40
name = "Web Terminal Traffic Duration"
type = "number"
description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
}
data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
order = 27
order = 41
name = "Web Terminal Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
}
data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
order = 28
order = 42
name = "Web Terminal Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
}
}
data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
order = 43
name = "Web Terminal Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for web terminal traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
data "coder_parameter" "load_scenario_app_traffic_duration" {
order = 50
name = "App Traffic Duration"
type = "number"
description = "The duration of the app traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}
data "coder_parameter" "load_scenario_app_bytes_per_tick" {
order = 51
name = "App Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the app traffic load scenario."
mutable = true
default = 1024
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_app_tick_interval" {
order = 52
name = "App Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the app traffic load scenario."
mutable = true
default = 100
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_app_traffic_percentage" {
order = 53
name = "App Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for app traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
data "coder_parameter" "load_scenario_app_traffic_mode" {
order = 54
name = "App Traffic Mode"
default = "wsec"
description = "The mode of the app traffic load scenario."
mutable = true
option {
name = "WebSocket Echo"
value = "wsec"
description = "Send traffic to the workspace via the app websocket and read it back."
}
option {
name = "WebSocket Read (Random)"
value = "wsra"
description = "Read traffic from the workspace via the app websocket."
}
option {
name = "WebSocket Write (Discard)"
value = "wsdi"
description = "Send traffic to the workspace via the app websocket."
}
}
data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
order = 29
order = 60
name = "Dashboard Traffic Duration"
type = "number"
description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
}
}
data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
order = 61
name = "Dashboard Traffic Percentage"
type = "number"
description = "The percentage of users that should be targeted for dashboard traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}
data "coder_parameter" "load_scenario_baseline_duration" {
order = 26
order = 100
name = "Baseline Wait Duration"
type = "number"
description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
}
data "coder_parameter" "greedy_agent" {
order = 30
order = 200
type = "bool"
name = "Greedy Agent"
default = false
@@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" {
}
data "coder_parameter" "greedy_agent_template" {
order = 31
order = 201
name = "Greedy Agent Template"
display_name = "Greedy Agent Template"
description = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +581,7 @@ resource "coder_agent" "main" {
SCALETEST_RUN_ID : local.scaletest_run_id,
SCALETEST_RUN_DIR : local.scaletest_run_dir,
SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
SCALETEST_PROMETHEUS_START_PORT : "21112",
# Comment is a scaletest param, but we want to surface it separately from
# the rest, so we use a different name.
@@ -440,16 +590,28 @@ resource "coder_agent" "main" {
SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
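
The boolean parameters above are serialized as "1"/"0" strings so the runner scripts can test them directly; a minimal sketch of the consuming side (the variable name matches the env map above, the echo messages are illustrative):

    # Sketch: reading a boolean parameter in the runner scripts.
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
        echo "load scenarios will be launched in parallel"
    else
        echo "load scenarios will run sequentially"
    fi
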
@@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" {
}
}
resources {
# Set requests and limits values such that we can do performant
# execution of `coder scaletest` commands.
requests = {
"cpu" = "250m"
"memory" = "512Mi"
}
limits = {
"cpu" = "${local.cpu}"
"memory" = "${local.memory}Gi"
}
}
volume_mount {
mount_path = "/home/coder"
name = "home"
read_only = false
}
port {
container_port = 21112
name = "prometheus-http"
protocol = "TCP"
dynamic "port" {
for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
iterator = it
content {
container_port = 21112 + it.key
name = "prom-http${it.key}"
protocol = "TCP"
}
}
}
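
Because for_each iterates the decoded scenario list, it.key is the zero-based index, so each concurrently running scenario gets its own container port and metrics endpoint starting at 21112. A rough sketch of the resulting mapping, assuming the default scenario list:

    # Illustrative only: scenario index -> container port / port name.
    start_port=21112
    scenarios=("SSH Traffic" "Web Terminal Traffic" "App Traffic" "Dashboard Traffic")
    for i in "${!scenarios[@]}"; do
        echo "${scenarios[$i]} -> port $((start_port + i)) (prom-http${i})"
    done
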
@@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" {
}
}
podMetricsEndpoints = [
{
port = "prometheus-http"
# NOTE(mafredri): We could add more information here by including the
# scenario name in the port name (although it's limited to 15 chars so
# it needs to be short). That said, someone looking at the stats can
# assume that there's a 1-to-1 mapping between scenario# and port.
for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
port = "prom-http${i}"
interval = "15s"
}
]
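
Kubernetes caps port names at 15 characters, which is why the note above keeps the prom-http<index> scheme terse; a quick, purely illustrative length check:

    # Even a two-digit scenario index keeps the name well under the
    # 15-character limit for Kubernetes port names.
    name="prom-http12"
    echo "${name}: ${#name} characters" # 11
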

View File

@@ -12,29 +12,51 @@ if [[ -z $event ]]; then
event=manual
fi
if [[ $event = manual ]]; then
do_cleanup() {
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 2h \
--cleanup-timeout 5h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase
}
do_scaledown() {
start_phase "Scale down provisioners (${event})"
maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
end_phase
}
case "${event}" in
manual)
echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) '
read -r -n 1
if [[ $REPLY != [yY] ]]; then
echo $'\nAborting...'
exit 1
fi
fi
echo
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 2h \
--cleanup-timeout 5h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase
do_cleanup
do_scaledown
if [[ $event != prepare ]]; then
start_phase "Scaling down provisioners..."
maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
fi
if [[ $event = manual ]]; then
echo 'Press any key to continue...'
read -s -r -n 1
fi
;;
prepare)
do_cleanup
;;
on_stop) ;; # Do nothing, handled by "shutdown".
always | on_success | on_error | shutdown)
do_cleanup
do_scaledown
;;
shutdown_scale_down_only)
do_scaledown
;;
*)
echo "Unknown event: ${event}" >&2
exit 1
;;
esac
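
The refactor splits the script into do_cleanup and do_scaledown helpers and dispatches on the event name, so each caller gets exactly the behaviour it needs; hedged usage examples (the relative path is illustrative):

    # Illustrative invocations of the refactored cleanup script:
    ./cleanup.sh prepare                  # cleanup only, no provisioner scale-down
    ./cleanup.sh shutdown                 # cleanup, then scale provisioners down
    ./cleanup.sh shutdown_scale_down_only # scale provisioners down, skip cleanup
    ./cleanup.sh manual                   # prompts for confirmation first
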

View File

@@ -47,8 +47,10 @@ unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
log "Cleaning up from previous runs (if applicable)..."
"${SCRIPTS_DIR}/cleanup.sh" "prepare"
if [[ ${SCALETEST_PARAM_CLEANUP_PREPARE} == 1 ]]; then
log "Cleaning up from previous runs (if applicable)..."
"${SCRIPTS_DIR}/cleanup.sh" prepare
fi
log "Preparation complete!"

View File

@@ -13,15 +13,21 @@ log "Running scaletest..."
set_status Running
start_phase "Creating workspaces"
coder exp scaletest create-workspaces \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--timeout 5h \
--job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then
# Note that we allow up to 5 failures to bring up the workspace, since
# we're creating a lot of workspaces at once and some of them may fail
# due to network issues or other transient errors.
coder exp scaletest create-workspaces \
--retry 5 \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--timeout 5h \
--job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
fi
end_phase
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
@@ -86,20 +92,60 @@ else
fi
annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic"
return ${status}
return "${status}"
}
fi
run_scenario_cmd() {
local scenario=${1}
shift
local command=("$@")
set +e
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
annotate_grafana scenario "Load scenario: ${scenario}"
fi
"${command[@]}"
status=${?}
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
export GRAFANA_ADD_TAGS=
if [[ ${status} != 0 ]]; then
GRAFANA_ADD_TAGS=error
fi
annotate_grafana_end scenario "Load scenario: ${scenario}"
fi
exit "${status}"
}
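
Note that run_scenario_cmd ends with exit, so it is meant to be launched in a background subshell (as every scenario arm below does); the exit then only terminates that subshell and the parent collects the status with wait. A minimal sketch (the sleep stands in for a real scaletest command):

    # Sketch: backgrounding run_scenario_cmd and collecting its status.
    pids=()
    run_scenario_cmd "SSH Traffic" sleep 1 &
    pids+=($!)
    wait "${pids[-1]}"
    echo "scenario exited with status $?"
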
declare -a pids=()
declare -A pid_to_scenario=()
declare -A failed=()
target_start=0
target_end=-1
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
fi
for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
start_phase "Load scenario: ${scenario}"
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
start_phase "Load scenario: ${scenario}"
fi
set +e
status=0
case "${scenario}" in
"SSH Traffic")
greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" &
coder exp scaletest workspace-traffic \
greedy_agent_traffic_pid=$!
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--ssh \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
@@ -107,55 +153,160 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \
"${non_greedy_agent_traffic_args[@]}"
status=$?
wait
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--target-workspaces "${target_start}:${target_end}" \
"${non_greedy_agent_traffic_args[@]}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
wait "${greedy_agent_traffic_pid}"
status2=$?
if [[ ${status} == 0 ]]; then
status=${status2}
fi
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
;;
"Web Terminal Traffic")
greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" &
coder exp scaletest workspace-traffic \
greedy_agent_traffic_pid=$!
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \
"${non_greedy_agent_traffic_args[@]}"
status=$?
wait
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--target-workspaces "${target_start}:${target_end}" \
"${non_greedy_agent_traffic_args[@]}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
wait "${greedy_agent_traffic_pid}"
status2=$?
if [[ ${status} == 0 ]]; then
status=${status2}
fi
;;
"App Traffic")
greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" &
greedy_agent_traffic_pid=$!
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \
--target-workspaces "${target_start}:${target_end}" \
"${non_greedy_agent_traffic_args[@]}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
wait "${greedy_agent_traffic_pid}"
status2=$?
if [[ ${status} == 0 ]]; then
status=${status2}
fi
show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
;;
"Dashboard Traffic")
coder exp scaletest dashboard \
target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
target_end=$((target_start + target_count))
if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
target_start=0
target_end=${target_count}
fi
# TODO: Remove this once the dashboard traffic command is fixed
# (i.e. once images are no longer dumped into PWD).
mkdir -p dashboard
pushd dashboard
run_scenario_cmd "${scenario}" coder exp scaletest dashboard \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
--target-users "${target_start}:${target_end}" \
>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" &
pids+=($!)
popd
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
# Debug scenarios, for testing the runner.
"debug:greedy_agent_traffic")
greedy_agent_traffic 10 "${scenario}"
status=$?
greedy_agent_traffic 10 "${scenario}" &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
"debug:success")
maybedryrun "$DRY_RUN" sleep 10
status=0
{
maybedryrun "$DRY_RUN" sleep 10
true
} &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
"debug:error")
maybedryrun "$DRY_RUN" sleep 10
status=1
{
maybedryrun "$DRY_RUN" sleep 10
false
} &
pids+=($!)
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
wait "${pids[-1]}"
status=$?
else
SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
fi
;;
*)
@@ -163,9 +314,22 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
;;
esac
set -e
# Allow targeting to be distributed evenly across workspaces when each
# scenario is run concurrently and all percentages add up to 100.
target_start=${target_end}
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
# Stagger the start of each scenario to avoid a burst of load and detect
# problematic scenarios.
sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60))
continue
fi
if ((status > 0)); then
log "Load scenario failed: ${scenario} (exit=${status})"
failed+=(["${scenario}"]="$status")
failed+=(["${scenario}"]="${status}")
PHASE_ADD_TAGS=error end_phase
else
end_phase
@@ -173,6 +337,25 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
done
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
wait "${pids[@]}"
# Waiting on all pids blocks until every scenario has exited, but we still
# need to check their individual exit codes.
for pid in "${pids[@]}"; do
wait "${pid}"
status=${?}
scenario=${pid_to_scenario[${pid}]}
if ((status > 0)); then
log "Load scenario failed: ${scenario} (exit=${status})"
failed+=(["${scenario}"]="${status}")
fi
done
if ((${#failed[@]} > 0)); then
PHASE_ADD_TAGS=error end_phase
else
end_phase
fi
fi
if ((${#failed[@]} > 0)); then
log "Load scenarios failed: ${!failed[*]}"

View File

@@ -14,7 +14,11 @@ trap cleanup EXIT
annotate_grafana "workspace" "Agent stopping..."
"${SCRIPTS_DIR}/cleanup.sh" shutdown
shutdown_event=shutdown_scale_down_only
if [[ ${SCALETEST_PARAM_CLEANUP_STRATEGY} == on_stop ]]; then
shutdown_event=shutdown
fi
"${SCRIPTS_DIR}/cleanup.sh" "${shutdown_event}"
annotate_grafana_end "workspace" "Agent running"

View File

@@ -8,6 +8,11 @@ if [[ ${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE} == "${SCALETEST_PARAM_TEMPLATE}"
exit 1
fi
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]] && [[ ${SCALETEST_PARAM_GREEDY_AGENT} == 1 ]]; then
echo "ERROR: Load scenario concurrency and greedy agent test cannot be enabled at the same time." >&2
exit 1
fi
# Unzip scripts and add to path.
# shellcheck disable=SC2153
echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..."