mirror of https://github.com/coder/coder.git
feat(scaletest/templates): add support for concurrent scenarios (#11753)
This commit is contained in:
parent 4b27c77969
commit 83eea2d323
@@ -12,11 +12,12 @@ terraform {
}

resource "time_static" "start_time" {
  # We con't set `count = data.coder_workspace.me.start_count` here because then
  # we can't use this value in `locals`. The permission check is recreated on
  # start, which will update the timestamp.
  # We don't set `count = data.coder_workspace.me.start_count` here because then
  # we can't use this value in `locals`, but we want to trigger recreation when
  # the scaletest is restarted.
  triggers = {
    count : length(null_resource.permission_check)
    count : data.coder_workspace.me.start_count
    token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
  }
}
@@ -39,8 +40,6 @@ locals {
  workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
  workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
  service_account_name = "scaletest-sa"
  cpu = 16
  memory = 64
  home_disk_size = 10
  scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
  scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
@@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" {
  }
}

data "coder_parameter" "cleanup_prepare" {
  order = 14
  type = "bool"
  name = "Cleanup before scaletest"
  default = true
  description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
  mutable = true
  ephemeral = true
}

data "coder_parameter" "workspace_template" {
  order = 20
@@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
  }
}

data "coder_parameter" "skip_create_workspaces" {
  order = 22
  type = "bool"
  name = "DEBUG: Skip creating workspaces"
  default = false
  description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
  mutable = true
}

data "coder_parameter" "load_scenarios" {
  order = 22
  order = 23
  name = "Load Scenarios"
  type = "list(string)"
  description = "The load scenarios to run."
@@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" {
  default = jsonencode([
    "SSH Traffic",
    "Web Terminal Traffic",
    "App Traffic",
    "Dashboard Traffic",
  ])
}

data "coder_parameter" "load_scenario_run_concurrently" {
  order = 24
  name = "Run Load Scenarios Concurrently"
  type = "bool"
  default = false
  description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
  mutable = true
}

data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
  order = 25
  name = "Load Scenario Concurrency Stagger Delay"
  type = "number"
  default = 3
  description = "The number of minutes to wait between starting each load scenario when run concurrently."
  mutable = true
}

data "coder_parameter" "load_scenario_ssh_traffic_duration" {
  order = 23
  order = 30
  name = "SSH Traffic Duration"
  type = "number"
  description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
}

data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
  order = 24
  order = 31
  name = "SSH Bytes Per Tick"
  type = "number"
  description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
}

data "coder_parameter" "load_scenario_ssh_tick_interval" {
  order = 25
  order = 32
  name = "SSH Tick Interval"
  type = "number"
  description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
  }
}

data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
  order = 33
  name = "SSH Traffic Percentage"
  type = "number"
  description = "The percentage of workspaces that should be targeted for SSH traffic."
  mutable = true
  default = 100
  validation {
    min = 1
    max = 100
  }
}

data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
  order = 26
  order = 40
  name = "Web Terminal Traffic Duration"
  type = "number"
  description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
}

data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
  order = 27
  order = 41
  name = "Web Terminal Bytes Per Tick"
  type = "number"
  description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
}

data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
  order = 28
  order = 42
  name = "Web Terminal Tick Interval"
  type = "number"
  description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
  }
}

data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
  order = 43
  name = "Web Terminal Traffic Percentage"
  type = "number"
  description = "The percentage of workspaces that should be targeted for web terminal traffic."
  mutable = true
  default = 100
  validation {
    min = 1
    max = 100
  }
}

data "coder_parameter" "load_scenario_app_traffic_duration" {
  order = 50
  name = "App Traffic Duration"
  type = "number"
  description = "The duration of the app traffic load scenario in minutes."
  mutable = true
  default = 30
  validation {
    min = 1
    max = 1440 // 24 hours.
  }
}

data "coder_parameter" "load_scenario_app_bytes_per_tick" {
  order = 51
  name = "App Bytes Per Tick"
  type = "number"
  description = "The number of bytes to send per tick in the app traffic load scenario."
  mutable = true
  default = 1024
  validation {
    min = 1
  }
}

data "coder_parameter" "load_scenario_app_tick_interval" {
  order = 52
  name = "App Tick Interval"
  type = "number"
  description = "The number of milliseconds between each tick in the app traffic load scenario."
  mutable = true
  default = 100
  validation {
    min = 1
  }
}

data "coder_parameter" "load_scenario_app_traffic_percentage" {
  order = 53
  name = "App Traffic Percentage"
  type = "number"
  description = "The percentage of workspaces that should be targeted for app traffic."
  mutable = true
  default = 100
  validation {
    min = 1
    max = 100
  }
}

data "coder_parameter" "load_scenario_app_traffic_mode" {
  order = 54
  name = "App Traffic Mode"
  default = "wsec"
  description = "The mode of the app traffic load scenario."
  mutable = true
  option {
    name = "WebSocket Echo"
    value = "wsec"
    description = "Send traffic to the workspace via the app websocket and read it back."
  }
  option {
    name = "WebSocket Read (Random)"
    value = "wsra"
    description = "Read traffic from the workspace via the app websocket."
  }
  option {
    name = "WebSocket Write (Discard)"
    value = "wsdi"
    description = "Send traffic to the workspace via the app websocket."
  }
}

data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
  order = 29
  order = 60
  name = "Dashboard Traffic Duration"
  type = "number"
  description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
  }
}

data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
  order = 61
  name = "Dashboard Traffic Percentage"
  type = "number"
  description = "The percentage of users that should be targeted for dashboard traffic."
  mutable = true
  default = 100
  validation {
    min = 1
    max = 100
  }
}

data "coder_parameter" "load_scenario_baseline_duration" {
  order = 26
  order = 100
  name = "Baseline Wait Duration"
  type = "number"
  description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
}

data "coder_parameter" "greedy_agent" {
  order = 30
  order = 200
  type = "bool"
  name = "Greedy Agent"
  default = false
@@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" {
}

data "coder_parameter" "greedy_agent_template" {
  order = 31
  order = 201
  name = "Greedy Agent Template"
  display_name = "Greedy Agent Template"
  description = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +581,7 @@ resource "coder_agent" "main" {
    SCALETEST_RUN_ID : local.scaletest_run_id,
    SCALETEST_RUN_DIR : local.scaletest_run_dir,
    SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
    SCALETEST_PROMETHEUS_START_PORT : "21112",

    # Comment is a scaletest param, but we want to surface it separately from
    # the rest, so we use a different name.
@@ -440,16 +590,28 @@ resource "coder_agent" "main" {
    SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
    SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
    SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
    SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
    SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
    SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
    SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
    SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
    SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
    SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
    SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
    SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
    SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
    SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
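
Aside, not part of the commit: each parameter above is handed to the runner scripts as an environment variable, with booleans encoded as "1"/"0" strings. A minimal sketch of how a script branches on one of them (variable name as used in this template):

  # "1" when "Run Load Scenarios Concurrently" is enabled, "0" otherwise.
  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
    echo "load scenarios will run concurrently"
  else
    echo "load scenarios will run sequentially"
  fi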
@@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" {
      }
    }
    resources {
      # Set requests and limits values such that we can do performant
      # execution of `coder scaletest` commands.
      requests = {
        "cpu" = "250m"
        "memory" = "512Mi"
      }
      limits = {
        "cpu" = "${local.cpu}"
        "memory" = "${local.memory}Gi"
      }
    }
    volume_mount {
      mount_path = "/home/coder"
      name = "home"
      read_only = false
    }
    port {
      container_port = 21112
      name = "prometheus-http"
      protocol = "TCP"
    dynamic "port" {
      for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
      iterator = it
      content {
        container_port = 21112 + it.key
        name = "prom-http${it.key}"
        protocol = "TCP"
      }
    }
    }
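
Aside, not part of the commit: the dynamic "port" block above opens one Prometheus port per load scenario when scenarios run concurrently, numbering upward from the start port. A small sketch of the resulting mapping, assuming the default scenario list (illustrative only):

  scenarios=("SSH Traffic" "Web Terminal Traffic" "App Traffic" "Dashboard Traffic")
  start_port=21112 # SCALETEST_PROMETHEUS_START_PORT
  for i in "${!scenarios[@]}"; do
    # Mirrors container_port = 21112 + it.key and name = "prom-http${it.key}".
    echo "prom-http${i} -> port $((start_port + i)) (${scenarios[i]})"
  done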
@@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" {
      }
    }
    podMetricsEndpoints = [
      {
        port = "prometheus-http"
      # NOTE(mafredri): We could add more information here by including the
      # scenario name in the port name (although it's limited to 15 chars so
      # it needs to be short). That said, someone looking at the stats can
      # assume that there's a 1-to-1 mapping between scenario# and port.
      for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
        port = "prom-http${i}"
        interval = "15s"
      }
    ]
@@ -12,29 +12,51 @@ if [[ -z $event ]]; then
  event=manual
fi

if [[ $event = manual ]]; then
do_cleanup() {
  start_phase "Cleanup (${event})"
  coder exp scaletest cleanup \
    --cleanup-job-timeout 2h \
    --cleanup-timeout 5h |
    tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
  end_phase
}

do_scaledown() {
  start_phase "Scale down provisioners (${event})"
  maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
  maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
  end_phase
}

case "${event}" in
manual)
  echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) '
  read -r -n 1
  if [[ $REPLY != [yY] ]]; then
    echo $'\nAborting...'
    exit 1
  fi
  fi
  echo

  start_phase "Cleanup (${event})"
  coder exp scaletest cleanup \
    --cleanup-job-timeout 2h \
    --cleanup-timeout 5h |
    tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
  end_phase
  do_cleanup
  do_scaledown

  if [[ $event != prepare ]]; then
    start_phase "Scaling down provisioners..."
    maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
    maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
  fi

  if [[ $event = manual ]]; then
    echo 'Press any key to continue...'
    read -s -r -n 1
  fi
  ;;
prepare)
  do_cleanup
  ;;
on_stop) ;; # Do nothing, handled by "shutdown".
always | on_success | on_error | shutdown)
  do_cleanup
  do_scaledown
  ;;
shutdown_scale_down_only)
  do_scaledown
  ;;
*)
  echo "Unknown event: ${event}" >&2
  exit 1
  ;;
esac
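
Aside, not part of the commit: the other runner scripts call this cleanup script with one of the events handled above. A few illustrative invocations:

  # From prepare.sh, when "Cleanup before scaletest" is enabled:
  "${SCRIPTS_DIR}/cleanup.sh" prepare
  # From shutdown.sh, depending on the chosen cleanup strategy:
  "${SCRIPTS_DIR}/cleanup.sh" shutdown
  "${SCRIPTS_DIR}/cleanup.sh" shutdown_scale_down_only
  # Run by hand (the default event); prompts before cleaning up and scaling down:
  "${SCRIPTS_DIR}/cleanup.sh" manual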
@@ -47,8 +47,10 @@ unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).

log "Cleaning up from previous runs (if applicable)..."
"${SCRIPTS_DIR}/cleanup.sh" "prepare"
if [[ ${SCALETEST_PARAM_CLEANUP_PREPARE} == 1 ]]; then
  log "Cleaning up from previous runs (if applicable)..."
  "${SCRIPTS_DIR}/cleanup.sh" prepare
fi

log "Preparation complete!"
@@ -13,15 +13,21 @@ log "Running scaletest..."
set_status Running

start_phase "Creating workspaces"
coder exp scaletest create-workspaces \
  --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
  --template "${SCALETEST_PARAM_TEMPLATE}" \
  --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
  --timeout 5h \
  --job-timeout 5h \
  --no-cleanup \
  --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then
  # Note that we allow up to 5 failures to bring up the workspace, since
  # we're creating a lot of workspaces at once and some of them may fail
  # due to network issues or other transient errors.
  coder exp scaletest create-workspaces \
    --retry 5 \
    --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
    --template "${SCALETEST_PARAM_TEMPLATE}" \
    --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
    --timeout 5h \
    --job-timeout 5h \
    --no-cleanup \
    --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
  show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
fi
end_phase

wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
@@ -86,20 +92,60 @@ else
  fi
  annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic"

  return ${status}
  return "${status}"
}
fi

run_scenario_cmd() {
  local scenario=${1}
  shift
  local command=("$@")

  set +e
  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
    annotate_grafana scenario "Load scenario: ${scenario}"
  fi
  "${command[@]}"
  status=${?}
  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
    export GRAFANA_ADD_TAGS=
    if [[ ${status} != 0 ]]; then
      GRAFANA_ADD_TAGS=error
    fi
    annotate_grafana_end scenario "Load scenario: ${scenario}"
  fi
  exit "${status}"
}

declare -a pids=()
declare -A pid_to_scenario=()
declare -A failed=()
target_start=0
target_end=-1

if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
  start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
fi
for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
  start_phase "Load scenario: ${scenario}"
  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
    start_phase "Load scenario: ${scenario}"
  fi

  set +e
  status=0
  case "${scenario}" in
  "SSH Traffic")
    greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" &
    coder exp scaletest workspace-traffic \
    greedy_agent_traffic_pid=$!

    target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
      log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
      target_start=0
      target_end=${target_count}
    fi
    run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
      --template "${SCALETEST_PARAM_TEMPLATE}" \
      --ssh \
      --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
@@ -107,55 +153,160 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
      --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
      --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
      --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \
      "${non_greedy_agent_traffic_args[@]}"
    status=$?
    wait
      --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
      --target-workspaces "${target_start}:${target_end}" \
      "${non_greedy_agent_traffic_args[@]}" &
    pids+=($!)
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
      show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    wait "${greedy_agent_traffic_pid}"
    status2=$?
    if [[ ${status} == 0 ]]; then
      status=${status2}
    fi
    show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
    ;;
  "Web Terminal Traffic")
    greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" &
    coder exp scaletest workspace-traffic \
    greedy_agent_traffic_pid=$!

    target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
      log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
      target_start=0
      target_end=${target_count}
    fi
    run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
      --template "${SCALETEST_PARAM_TEMPLATE}" \
      --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
      --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
      --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
      --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
      --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \
      "${non_greedy_agent_traffic_args[@]}"
    status=$?
    wait
      --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
      --target-workspaces "${target_start}:${target_end}" \
      "${non_greedy_agent_traffic_args[@]}" &
    pids+=($!)
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
      show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    wait "${greedy_agent_traffic_pid}"
    status2=$?
    if [[ ${status} == 0 ]]; then
      status=${status2}
    fi
    ;;
  "App Traffic")
    greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" &
    greedy_agent_traffic_pid=$!

    target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
      log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
      target_start=0
      target_end=${target_count}
    fi
    run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
      --template "${SCALETEST_PARAM_TEMPLATE}" \
      --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \
      --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \
      --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \
      --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \
      --output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \
      --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
      --app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \
      --target-workspaces "${target_start}:${target_end}" \
      "${non_greedy_agent_traffic_args[@]}" &
    pids+=($!)
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
      show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json"
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    wait "${greedy_agent_traffic_pid}"
    status2=$?
    if [[ ${status} == 0 ]]; then
      status=${status2}
    fi
    show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
    ;;
  "Dashboard Traffic")
    coder exp scaletest dashboard \
    target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
      log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
      target_start=0
      target_end=${target_count}
    fi
    # TODO: Remove this once the dashboard traffic command is fixed,
    # (i.e. once images are no longer dumped into PWD).
    mkdir -p dashboard
    pushd dashboard
    run_scenario_cmd "${scenario}" coder exp scaletest dashboard \
      --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
      --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
      --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
      >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
    status=$?
    show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
      --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
      --target-users "${target_start}:${target_end}" \
      >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" &
    pids+=($!)
    popd
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
      show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    ;;

  # Debug scenarios, for testing the runner.
  "debug:greedy_agent_traffic")
    greedy_agent_traffic 10 "${scenario}"
    status=$?
    greedy_agent_traffic 10 "${scenario}" &
    pids+=($!)
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    ;;
  "debug:success")
    maybedryrun "$DRY_RUN" sleep 10
    status=0
    {
      maybedryrun "$DRY_RUN" sleep 10
      true
    } &
    pids+=($!)
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    ;;
  "debug:error")
    maybedryrun "$DRY_RUN" sleep 10
    status=1
    {
      maybedryrun "$DRY_RUN" sleep 10
      false
    } &
    pids+=($!)
    if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
      wait "${pids[-1]}"
      status=$?
    else
      SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
    fi
    ;;

  *)
@@ -163,9 +314,22 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
    ;;
  esac
  set -e

  # Allow targeting to be distributed evenly across workspaces when each
  # scenario is run concurrently and all percentages add up to 100.
  target_start=${target_end}

  if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
    pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
    # Stagger the start of each scenario to avoid a burst of load and deted
    # problematic scenarios.
    sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60))
    continue
  fi

  if ((status > 0)); then
    log "Load scenario failed: ${scenario} (exit=${status})"
    failed+=(["${scenario}"]="$status")
    failed+=(["${scenario}"]="${status}")
    PHASE_ADD_TAGS=error end_phase
  else
    end_phase
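
Aside, not part of the commit: a worked example of the targeting arithmetic above, assuming 100 workspaces with one scenario at 60% and another at 40% (numbers are illustrative). The same jq expression produces consecutive, non-overlapping --target-workspaces ranges:

  num_workspaces=100
  target_start=0
  for percentage in 60 40; do
    target_count=$(jq -n --argjson percentage "${percentage}" --argjson num_workspaces "${num_workspaces}" '$percentage / 100 * $num_workspaces | floor')
    target_end=$((target_start + target_count))
    echo "--target-workspaces ${target_start}:${target_end}"
    target_start=${target_end} # the next scenario starts where this one ended
  done
  # Prints "--target-workspaces 0:60" then "--target-workspaces 60:100", i.e.
  # the fleet is split evenly when the percentages add up to 100.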
@@ -173,6 +337,25 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do

  wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
done
if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
  wait "${pids[@]}"
  # Wait on all pids will wait until all have exited, but we need to
  # check their individual exit codes.
  for pid in "${pids[@]}"; do
    wait "${pid}"
    status=${?}
    scenario=${pid_to_scenario[${pid}]}
    if ((status > 0)); then
      log "Load scenario failed: ${scenario} (exit=${status})"
      failed+=(["${scenario}"]="${status}")
    fi
  done
  if ((${#failed[@]} > 0)); then
    PHASE_ADD_TAGS=error end_phase
  else
    end_phase
  fi
fi

if ((${#failed[@]} > 0)); then
  log "Load scenarios failed: ${!failed[*]}"
@@ -14,7 +14,11 @@ trap cleanup EXIT

annotate_grafana "workspace" "Agent stopping..."

"${SCRIPTS_DIR}/cleanup.sh" shutdown
shutdown_event=shutdown_scale_down_only
if [[ ${SCALETEST_PARAM_CLEANUP_STRATEGY} == on_stop ]]; then
  shutdown_event=shutdown
fi
"${SCRIPTS_DIR}/cleanup.sh" "${shutdown_event}"

annotate_grafana_end "workspace" "Agent running"
@@ -8,6 +8,11 @@ if [[ ${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE} == "${SCALETEST_PARAM_TEMPLATE}"
  exit 1
fi

if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]] && [[ ${SCALETEST_PARAM_GREEDY_AGENT} == 1 ]]; then
  echo "ERROR: Load scenario concurrency and greedy agent test cannot be enabled at the same time." >&2
  exit 1
fi

# Unzip scripts and add to path.
# shellcheck disable=SC2153
echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..."