feat(scaletest): add grafana annotations and slack reporting (#9852)

Fixes #9575
Fixes #9576
Mathias Fredriksson 2023-09-27 14:44:11 +03:00 committed by GitHub
parent 4e442040f7
commit d8515f02af
8 changed files with 495 additions and 78 deletions
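In short, the runner scripts below wrap each step in a Grafana annotation (POST one when a phase starts, PATCH its timeEnd when it ends) and post a Slack message when the run starts, completes, or fails. A minimal sketch of the Grafana round trip, assuming GRAFANA_URL and GRAFANA_API_TOKEN are exported as the template arranges below:

start_ms=$(($(date +%s) * 1000))
id="$(curl -sSL \
  -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
  -H "Content-Type: application/json" \
  -d "{\"time\": ${start_ms}, \"tags\": [\"scaletest\", \"runner\"], \"text\": \"Phase 1: example\"}" \
  "${GRAFANA_URL}/api/annotations" | jq -r '.id')"
# ... the phase runs ...
curl -sSL -X PATCH \
  -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
  -H "Content-Type: application/json" \
  -d "{\"timeEnd\": $(($(date +%s) * 1000))}" \
  "${GRAFANA_URL}/api/annotations/${id}"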

View File

@ -35,14 +35,18 @@ resource "null_resource" "permission_check" {
}
locals {
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
service_account_name = "scaletest-sa"
cpu = 2
memory = 2
home_disk_size = 10
scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
home_disk_size = 10
scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
grafana_url = "https://stats.dev.c8s.io"
grafana_dashboard_uid = "qLVSTR-Vz"
grafana_dashboard_name = "coderv2-loadtest-dashboard"
}
data "coder_provisioner" "me" {
@ -91,15 +95,14 @@ data "coder_parameter" "job_concurrency" {
order = 11
type = "number"
name = "Job concurrency"
default = 10
default = 0
description = "The number of concurrent jobs (e.g. when producing workspace traffic)."
mutable = true
# Zero means unlimited, which is perhaps not a good idea;
# we can raise this limit instead.
validation {
min = 1
max = 100
min = 0
}
}
@ -197,6 +200,121 @@ data "coder_parameter" "num_workspaces" {
}
}
data "coder_parameter" "load_scenarios" {
order = 22
name = "Load Scenarios"
type = "list(string)"
description = "The load scenarios to run."
mutable = true
ephemeral = true
default = jsonencode([
"SSH Traffic",
"Web Terminal Traffic",
"Dashboard Traffic",
])
}
data "coder_parameter" "load_scenario_ssh_traffic_duration" {
order = 23
name = "SSH Traffic Duration"
type = "number"
description = "The duration of the SSH traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}
data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
order = 24
name = "SSH Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the SSH traffic load scenario."
mutable = true
default = 1024
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_ssh_tick_interval" {
order = 25
name = "SSH Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the SSH traffic load scenario."
mutable = true
default = 100
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
order = 26
name = "Web Terminal Traffic Duration"
type = "number"
description = "The duration of the web terminal traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}
data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
order = 27
name = "Web Terminal Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the web terminal traffic load scenario."
mutable = true
default = 1024
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
order = 28
name = "Web Terminal Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
mutable = true
default = 100
validation {
min = 1
}
}
data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
order = 29
name = "Dashboard Traffic Duration"
type = "number"
description = "The duration of the dashboard traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}
data "coder_parameter" "load_scenario_baseline_duration" {
order = 30
name = "Baseline Wait Duration"
type = "number"
description = "The duration to wait before starting a load scenario in minutes."
mutable = true
default = 5
validation {
min = 0
max = 60
}
}
data "coder_parameter" "namespace" {
order = 999
type = "string"
@ -221,6 +339,8 @@ resource "coder_agent" "main" {
CODER_CONFIG_DIR : "/home/coder/.config/coderv2",
CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token,
CODER_URL : data.coder_workspace.me.access_url,
CODER_USER : data.coder_workspace.me.owner,
CODER_WORKSPACE : data.coder_workspace.me.name,
# Global scaletest envs that may affect each `coder exp scaletest` invocation.
CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112",
@ -228,14 +348,29 @@ resource "coder_agent" "main" {
CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
# Expose as params as well, for reporting (TODO(mafredri): refactor so we only have one).
SCALETEST_PARAM_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
SCALETEST_PARAM_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
# Local envs passed as arguments to `coder exp scaletest` invocations.
SCALETEST_RUN_ID : local.scaletest_run_id,
SCALETEST_RUN_DIR : local.scaletest_run_dir,
SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value,
SCALETEST_SKIP_CLEANUP : "1",
SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
GRAFANA_URL : local.grafana_url,
SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path),
SCRIPTS_DIR : "/tmp/scripts",
@ -244,12 +379,13 @@ resource "coder_agent" "main" {
vscode = false
ssh_helper = false
}
startup_script_timeout = 3600
shutdown_script_timeout = 1800
startup_script_timeout = 86400
shutdown_script_timeout = 7200
startup_script_behavior = "blocking"
startup_script = file("startup.sh")
shutdown_script = file("shutdown.sh")
# IDEA(mafredri): It would be pretty cool to define metadata to expect JSON output; each field/item could become a separate metadata item.
# Scaletest metadata.
metadata {
display_name = "Scaletest status"
@ -332,7 +468,7 @@ resource "coder_app" "grafana" {
agent_id = coder_agent.main.id
slug = "00-grafana"
display_name = "Grafana"
url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
url = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
icon = "https://grafana.com/static/assets/img/fav32.png"
external = true
}
@ -409,7 +545,7 @@ resource "kubernetes_pod" "main" {
}
# Set the pod delete timeout to termination_grace_period_seconds + 1m.
timeouts {
delete = "32m"
delete = "${(local.workspace_pod_termination_grace_period_seconds + 120) / 60}m" # e.g. (7200 + 120) / 60 = 122 minutes.
}
spec {
security_context {
@ -421,8 +557,9 @@ resource "kubernetes_pod" "main" {
service_account_name = local.service_account_name
# Allow the coder agent to perform graceful shutdown and cleanup of
# scaletest resources, 30 minutes (cleanup timeout) + 1 minute.
termination_grace_period_seconds = 1860
# scaletest resources. We add an extra minute to ensure work
# completion is prioritized over the timeout.
termination_grace_period_seconds = local.workspace_pod_termination_grace_period_seconds + 60
container {
name = "dev"
@ -440,6 +577,24 @@ resource "kubernetes_pod" "main" {
name = "CODER_AGENT_LOG_DIR"
value = "${local.scaletest_run_dir}/logs"
}
env {
name = "GRAFANA_API_TOKEN"
value_from {
secret_key_ref {
name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name
key = "token"
}
}
}
env {
name = "SLACK_WEBHOOK_URL"
value_from {
secret_key_ref {
name = data.kubernetes_secret.slack_scaletest_notifications_webhook_url.metadata[0].name
key = "url"
}
}
}
resources {
# Set requests and limits values such that we can do performant
# execution of `coder scaletest` commands.
@ -496,7 +651,7 @@ resource "kubernetes_pod" "main" {
match_expressions {
key = "cloud.google.com/gke-nodepool"
operator = "In"
values = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces.
values = ["big-workspacetraffic"] # Avoid placing on the same nodes as scaletest workspaces.
}
}
}
@ -505,6 +660,20 @@ resource "kubernetes_pod" "main" {
}
}
data "kubernetes_secret" "grafana_editor_api_token" {
metadata {
name = "grafana-editor-api-token"
namespace = data.coder_parameter.namespace.value
}
}
data "kubernetes_secret" "slack_scaletest_notifications_webhook_url" {
metadata {
name = "slack-scaletest-notifications-webhook-url"
namespace = data.coder_parameter.namespace.value
}
}
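Note: both secrets are only read here; they are assumed to already exist in the runner namespace, created along the lines of (placeholder values):
kubectl -n "<namespace>" create secret generic grafana-editor-api-token --from-literal=token="<grafana-editor-api-token>"
kubectl -n "<namespace>" create secret generic slack-scaletest-notifications-webhook-url --from-literal=url="<slack-webhook-url>"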
resource "kubernetes_manifest" "pod_monitor" {
count = data.coder_workspace.me.start_count
manifest = {

View File

@ -24,7 +24,7 @@ fi
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 15m \
--cleanup-timeout 30m |
--cleanup-timeout 2h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase

View File

@ -33,7 +33,13 @@ set_status() {
if [[ ${DRY_RUN} == 1 ]]; then
dry_run=" (dry-run)"
fi
prev_status=$(get_status)
if [[ ${prev_status} != *"Not started"* ]]; then
annotate_grafana_end "status" "Status: ${prev_status}"
fi
echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status"
annotate_grafana "status" "Status: ${*}"
}
lock_status() {
chmod 0440 "${SCALETEST_STATE_DIR}/status"
@ -51,25 +57,29 @@ phase_num=0
start_phase() {
# This may be incremented from another script, so we read it every time.
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")"
phase_num=$(grep -c START: "${SCALETEST_PHASE_FILE}")
fi
phase_num=$((phase_num + 1))
log "Start phase ${phase_num}: ${*}"
echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}"
GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana "phase" "Phase ${phase_num}: ${*}"
}
end_phase() {
phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)"
phase=$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)
if [[ -z ${phase} ]]; then
log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}"
exit 1
fi
log "End phase ${phase_num}: ${phase}"
echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}"
GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana_end "phase" "Phase ${phase_num}: ${phase}"
}
get_phase() {
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")"
phase="$(echo "${phase_raw}" | cut -d' ' -f3-)"
phase_raw=$(tail -n1 "${SCALETEST_PHASE_FILE}")
phase=$(echo "${phase_raw}" | cut -d' ' -f3-)
if [[ ${phase_raw} == *"END:"* ]]; then
phase+=" [done]"
fi
@ -86,9 +96,117 @@ get_previous_phase() {
fi
}
annotate_grafana() {
local tags=${1} text=${2} start=${3:-$(($(date +%s) * 1000))}
local json resp id
if [[ -z $tags ]]; then
tags="scaletest,runner"
else
tags="scaletest,runner,${tags}"
fi
if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
tags="${tags},${GRAFANA_EXTRA_TAGS}"
fi
log "Annotating Grafana (start=${start}): ${text} [${tags}]"
json="$(
jq \
--argjson time "${start}" \
--arg text "${text}" \
--arg tags "${tags}" \
'{time: $time, tags: $tags | split(","), text: $text}' <<<'{}'
)"
if [[ ${DRY_RUN} == 1 ]]; then
log "Would have annotated Grafana, data=${json}"
return 0
fi
if ! resp="$(
curl -sSL \
--insecure \
-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
-H "Content-Type: application/json" \
-d "${json}" \
"${GRAFANA_URL}/api/annotations"
)"; then
# Don't abort scaletest just because we couldn't annotate Grafana.
log "Failed to annotate Grafana: ${resp}"
return 0
fi
if [[ $(jq -r '.message' <<<"${resp}") != "Annotation added" ]]; then
log "Failed to annotate Grafana: ${resp}"
return 0
fi
log "Grafana annotation added!"
if [[ ! -d "${SCALETEST_STATE_DIR}" ]]; then
mkdir -p "${SCALETEST_STATE_DIR}"
fi
id="$(jq -r '.id' <<<"${resp}")"
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
}
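# Each line appended to "${SCALETEST_STATE_DIR}/grafana-annotations" above has the form
# "id:tags:text:start", for example (made-up values):
#   42:scaletest,runner,phase,phase-default:Phase 1: Creating workspaces:1695800000000
# annotate_grafana_end below greps for ":tags:text:start" to recover the id it needs for the PATCH.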
annotate_grafana_end() {
local tags=${1} text=${2} start=${3:-} end=${4:-$(($(date +%s) * 1000))}
local id json resp
if [[ -z $tags ]]; then
tags="scaletest,runner"
else
tags="scaletest,runner,${tags}"
fi
if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
tags="${tags},${GRAFANA_EXTRA_TAGS}"
fi
if [[ ${DRY_RUN} == 1 ]]; then
log "Would have updated Grafana annotation (end=${end}): ${text} [${tags}]"
return 0
fi
if ! id=$(grep ":${tags}:${text}:${start}" "${SCALETEST_STATE_DIR}/grafana-annotations" | sort -n | tail -n1 | cut -d: -f1); then
log "NOTICE: Could not find Grafana annotation to end: '${tags}:${text}:${start}', skipping..."
return 0
fi
log "Annotating Grafana (end=${end}): ${text} [${tags}]"
json="$(
jq \
--argjson timeEnd "${end}" \
'{timeEnd: $timeEnd}' <<<'{}'
)"
if [[ ${DRY_RUN} == 1 ]]; then
log "Would have patched Grafana annotation: id=${id}, data=${json}"
return 0
fi
if ! resp="$(
curl -sSL \
--insecure \
-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
-H "Content-Type: application/json" \
-X PATCH \
-d "${json}" \
"${GRAFANA_URL}/api/annotations/${id}"
)"; then
# Don't abort scaletest just because we couldn't annotate Grafana.
log "Failed to annotate Grafana end: ${resp}"
return 0
fi
if [[ $(jq -r '.message' <<<"${resp}") != "Annotation patched" ]]; then
log "Failed to annotate Grafana end: ${resp}"
return 0
fi
log "Grafana annotation patched!"
}
wait_baseline() {
s=${1:-2}
start_phase "Waiting ${s}m to establish baseline"
PHASE_TYPE="phase-wait" start_phase "Waiting ${s}m to establish baseline"
maybedryrun "$DRY_RUN" sleep $((s * 60))
end_phase
PHASE_TYPE="phase-wait" end_phase
}
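For reference, run.sh further down consumes these helpers roughly as follows (simplified excerpt, not new functionality):

start_phase "Load scenario: SSH Traffic"  # POSTs a "phase" annotation with the start time.
# coder exp scaletest workspace-traffic --ssh ...
end_phase                                 # PATCHes the same annotation with timeEnd.
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"  # Its phase is tagged phase-wait so it can be filtered out in Grafana.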

View File

@ -28,13 +28,6 @@ for dir in "${HOME}/scaletest-"*; do
fi
done
log "Cloning coder/coder repo..."
if [[ ! -d "${HOME}/coder" ]]; then
git clone https://github.com/coder/coder.git "${HOME}/coder"
fi
(cd "${HOME}/coder" && git pull)
log "Creating coder CLI token (needed for cleanup during shutdown)..."
mkdir -p "${CODER_CONFIG_DIR}"

View File

@ -0,0 +1,104 @@
#!/bin/bash
set -euo pipefail
[[ ${VERBOSE:-0} == 1 ]] && set -x
status=$1
shift
case "${status}" in
started) ;;
completed) ;;
failed) ;;
*)
echo "Unknown status: ${status}" >&2
exit 1
;;
esac
# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"
# NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`.
# https://github.com/coder/coder/issues/9877
CODER_URL="${CODER_URL%/}"
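# For example, CODER_URL="https://coder.example.com/" would otherwise yield
# "https://coder.example.com//api/v2/buildinfo", which is served as HTML rather than JSON.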
buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")"
server_version="$(jq -r '.version' <<<"${buildinfo}")"
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"
# Since `coder show` doesn't support JSON output, we list the workspaces instead.
workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
bullet='•'
app_urls_raw="$(jq -r '.latest_build.resources[].agents[]?.apps | map(select(.external == true)) | .[] | .display_name, .url' <<<"${workspace_json}")"
app_urls=()
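# The jq output above alternates display name and URL on successive lines, so consume them in pairs.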
while read -r app_name; do
read -r app_url
bold=
if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then
# Update Grafana URL with end stamp and make bold.
app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
bold='*'
fi
app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
done <<<"${app_urls_raw}"
params=()
header=
case "${status}" in
started)
created_at="$(jq -r '.latest_build.created_at' <<<"${workspace_json}")"
params=("${bullet} Options:")
while read -r param; do
params+=(" ${bullet} ${param}")
done <<<"$(jq -r '.latest_build.resources[].agents[]?.environment_variables | to_entries | map(select(.key | startswith("SCALETEST_PARAM_"))) | .[] | "`\(.key)`: `\(.value)`"' <<<"${workspace_json}")"
header="New scaletest started at \`${created_at}\` by \`${initiator_name}\` on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
;;
completed)
completed_at=$(date -Iseconds)
header="Scaletest completed at \`${completed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
;;
failed)
failed_at=$(date -Iseconds)
header="Scaletest failed at \`${failed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
;;
*)
echo "Unknown status: ${status}" >&2
exit 1
;;
esac
text_arr=(
"${header}"
""
"${bullet} Workspace (runner): ${CODER_URL}/@${owner_name}/${workspace_name}"
"${bullet} Run ID: ${SCALETEST_RUN_ID}"
"${app_urls[@]}"
"${params[@]}"
)
text=
for field in "${text_arr[@]}"; do
text+="${field}"$'\n'
done
json=$(
jq -n --arg text "${text}" '{
blocks: [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": $text
}
}
]
}'
)
maybedryrun "${DRY_RUN}" curl -X POST -H 'Content-type: application/json' --data "${json}" "${SLACK_WEBHOOK_URL}"

View File

@ -6,54 +6,61 @@ set -euo pipefail
# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"
mapfile -t scaletest_load_scenarios < <(jq -r '. | join ("\n")' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}")
export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}")
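# SCALETEST_PARAM_LOAD_SCENARIOS arrives as a JSON list, e.g. ["SSH Traffic", "Web Terminal Traffic", "Dashboard Traffic"];
# the jq/mapfile pipeline above turns it into a bash array so the case statement below can match scenarios by name.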
log "Running scaletest..."
set_status Running
start_phase "Creating workspaces"
coder exp scaletest create-workspaces \
--count "${SCALETEST_NUM_WORKSPACES}" \
--template "${SCALETEST_TEMPLATE}" \
--concurrency "${SCALETEST_CREATE_CONCURRENCY}" \
--job-timeout 15m \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--job-timeout 2h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
end_phase
wait_baseline 5
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
start_phase "SSH traffic"
coder exp scaletest workspace-traffic \
--ssh \
--bytes-per-tick 10240 \
--tick-interval 1s \
--timeout 5m \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
end_phase
for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
start_phase "Load scenario: ${scenario}"
case "${scenario}" in
"SSH Traffic")
coder exp scaletest workspace-traffic \
--ssh \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
;;
"Web Terminal Traffic")
coder exp scaletest workspace-traffic \
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
;;
"Dashboard Traffic")
coder exp scaletest dashboard \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
;;
esac
end_phase
wait_baseline 5
start_phase "ReconnectingPTY traffic"
coder exp scaletest workspace-traffic \
--bytes-per-tick 10240 \
--tick-interval 1s \
--timeout 5m \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
end_phase
wait_baseline 5
start_phase "Dashboard traffic"
coder exp scaletest dashboard \
--count "${SCALETEST_NUM_WORKSPACES}" \
--job-timeout 5m \
--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
end_phase
wait_baseline 5
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
done
log "Scaletest complete!"
set_status Complete

View File

@ -11,4 +11,8 @@ cleanup() {
}
trap cleanup EXIT
annotate_grafana "workspace" "Agent stopping..."
"${SCRIPTS_DIR}/cleanup.sh" shutdown
annotate_grafana_end "workspace" "Agent running"

View File

@ -12,41 +12,63 @@ mkdir -p "${SCRIPTS_DIR}"
unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}"
rm /tmp/scripts.zip
echo "Cloning coder/coder repo..."
if [[ ! -d "${HOME}/coder" ]]; then
git clone https://github.com/coder/coder.git "${HOME}/coder"
fi
(cd "${HOME}/coder" && git pull)
# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
# Show failure in the UI if script exits with error.
failed_status=Failed
on_exit() {
trap - ERR EXIT
case "${SCALETEST_CLEANUP_STRATEGY}" in
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
on_stop)
# Handled by shutdown script.
;;
on_success)
if [[ $(get_status) != "${failed_status}" ]]; then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
on_error)
if [[ $(get_status) = "${failed_status}" ]]; then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
*)
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
;;
esac
annotate_grafana_end "" "Start scaletest"
}
trap on_exit EXIT
on_err() {
code=${?}
trap - ERR
set +e
log "Scaletest failed!"
set_status "${failed_status}"
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
"${SCRIPTS_DIR}/report.sh" failed
lock_status # Ensure we never rewrite the status after a failure.
}
trap on_err ERR
# Pass session token since `prepare.sh` has not yet run.
CODER_SESSION_TOKEN=$CODER_USER_TOKEN "${SCRIPTS_DIR}/report.sh" started
annotate_grafana "" "Start scaletest"
"${SCRIPTS_DIR}/prepare.sh"
"${SCRIPTS_DIR}/run.sh"
"${SCRIPTS_DIR}/report.sh" completed