mirror of https://github.com/coder/coder.git
feat(scaletest): add grafana annotations and slack reporting (#9852)
Fixes #9575 Fixes #9576
This commit is contained in:
parent
4e442040f7
commit
d8515f02af
|
@ -35,14 +35,18 @@ resource "null_resource" "permission_check" {
|
|||
}
|
||||
|
||||
locals {
|
||||
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
service_account_name = "scaletest-sa"
|
||||
cpu = 2
|
||||
memory = 2
|
||||
home_disk_size = 10
|
||||
scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}"
|
||||
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
|
||||
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
|
||||
service_account_name = "scaletest-sa"
|
||||
cpu = 16
|
||||
memory = 64
|
||||
home_disk_size = 10
|
||||
scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}"
|
||||
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
|
||||
grafana_url = "https://stats.dev.c8s.io"
|
||||
grafana_dashboard_uid = "qLVSTR-Vz"
|
||||
grafana_dashboard_name = "coderv2-loadtest-dashboard"
|
||||
}
|
||||
|
||||
data "coder_provisioner" "me" {
|
||||
|
@ -91,15 +95,14 @@ data "coder_parameter" "job_concurrency" {
|
|||
order = 11
|
||||
type = "number"
|
||||
name = "Job concurrency"
|
||||
default = 10
|
||||
default = 0
|
||||
description = "The number of concurrent jobs (e.g. when producing workspace traffic)."
|
||||
mutable = true
|
||||
|
||||
# Setting zero = unlimited, but perhaps not a good idea,
|
||||
# we can raise this limit instead.
|
||||
validation {
|
||||
min = 1
|
||||
max = 100
|
||||
min = 0
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -197,6 +200,121 @@ data "coder_parameter" "num_workspaces" {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
data "coder_parameter" "load_scenarios" {
|
||||
order = 22
|
||||
name = "Load Scenarios"
|
||||
type = "list(string)"
|
||||
description = "The load scenarios to run."
|
||||
mutable = true
|
||||
ephemeral = true
|
||||
default = jsonencode([
|
||||
"SSH Traffic",
|
||||
"Web Terminal Traffic",
|
||||
"Dashboard Traffic",
|
||||
])
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_ssh_traffic_duration" {
|
||||
order = 23
|
||||
name = "SSH Traffic Duration"
|
||||
type = "number"
|
||||
description = "The duration of the SSH traffic load scenario in minutes."
|
||||
mutable = true
|
||||
default = 30
|
||||
validation {
|
||||
min = 1
|
||||
max = 1440 // 24 hours.
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
|
||||
order = 24
|
||||
name = "SSH Bytes Per Tick"
|
||||
type = "number"
|
||||
description = "The number of bytes to send per tick in the SSH traffic load scenario."
|
||||
mutable = true
|
||||
default = 1024
|
||||
validation {
|
||||
min = 1
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_ssh_tick_interval" {
|
||||
order = 25
|
||||
name = "SSH Tick Interval"
|
||||
type = "number"
|
||||
description = "The number of milliseconds between each tick in the SSH traffic load scenario."
|
||||
mutable = true
|
||||
default = 100
|
||||
validation {
|
||||
min = 1
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
|
||||
order = 26
|
||||
name = "Web Terminal Traffic Duration"
|
||||
type = "number"
|
||||
description = "The duration of the web terminal traffic load scenario in minutes."
|
||||
mutable = true
|
||||
default = 30
|
||||
validation {
|
||||
min = 1
|
||||
max = 1440 // 24 hours.
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
|
||||
order = 27
|
||||
name = "Web Terminal Bytes Per Tick"
|
||||
type = "number"
|
||||
description = "The number of bytes to send per tick in the web terminal traffic load scenario."
|
||||
mutable = true
|
||||
default = 1024
|
||||
validation {
|
||||
min = 1
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
|
||||
order = 28
|
||||
name = "Web Terminal Tick Interval"
|
||||
type = "number"
|
||||
description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
|
||||
mutable = true
|
||||
default = 100
|
||||
validation {
|
||||
min = 1
|
||||
}
|
||||
}
|
||||
|
||||
data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
|
||||
order = 29
|
||||
name = "Dashboard Traffic Duration"
|
||||
type = "number"
|
||||
description = "The duration of the dashboard traffic load scenario in minutes."
|
||||
mutable = true
|
||||
default = 30
|
||||
validation {
|
||||
min = 1
|
||||
max = 1440 // 24 hours.
|
||||
}
|
||||
}
|
||||
|
||||
# Number of minutes to idle around load scenarios so that a baseline is
# visible between them. Exposed to the scripts as
# SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION.
data "coder_parameter" "load_scenario_baseline_duration" {
  # Placed after the load scenario parameters (orders 22-29). The previous
  # value of 26 collided with "Web Terminal Traffic Duration".
  # NOTE(review): confirm 30 is not used by a parameter outside this hunk.
  order       = 30
  name        = "Baseline Wait Duration"
  type        = "number"
  description = "The duration to wait before starting a load scenario in minutes."
  mutable     = true
  default     = 5
  validation {
    min = 0
    max = 60
  }
}
|
||||
|
||||
data "coder_parameter" "namespace" {
|
||||
order = 999
|
||||
type = "string"
|
||||
|
@ -221,6 +339,8 @@ resource "coder_agent" "main" {
|
|||
CODER_CONFIG_DIR : "/home/coder/.config/coderv2",
|
||||
CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token,
|
||||
CODER_URL : data.coder_workspace.me.access_url,
|
||||
CODER_USER : data.coder_workspace.me.owner,
|
||||
CODER_WORKSPACE : data.coder_workspace.me.name,
|
||||
|
||||
# Global scaletest envs that may affect each `coder exp scaletest` invocation.
|
||||
CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112",
|
||||
|
@ -228,14 +348,29 @@ resource "coder_agent" "main" {
|
|||
CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
|
||||
CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
|
||||
|
||||
# Expose as params as well, for reporting (TODO(mafredri): refactor, only have one).
|
||||
SCALETEST_PARAM_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}",
|
||||
SCALETEST_PARAM_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}",
|
||||
|
||||
# Local envs passed as arguments to `coder exp scaletest` invocations.
|
||||
SCALETEST_RUN_ID : local.scaletest_run_id,
|
||||
SCALETEST_RUN_DIR : local.scaletest_run_dir,
|
||||
SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value,
|
||||
SCALETEST_SKIP_CLEANUP : "1",
|
||||
SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
|
||||
SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
|
||||
SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
|
||||
|
||||
SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
|
||||
SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
|
||||
SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
|
||||
SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
|
||||
SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
|
||||
SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
|
||||
|
||||
GRAFANA_URL : local.grafana_url,
|
||||
|
||||
SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path),
|
||||
SCRIPTS_DIR : "/tmp/scripts",
|
||||
|
@ -244,12 +379,13 @@ resource "coder_agent" "main" {
|
|||
vscode = false
|
||||
ssh_helper = false
|
||||
}
|
||||
startup_script_timeout = 3600
|
||||
shutdown_script_timeout = 1800
|
||||
startup_script_timeout = 86400
|
||||
shutdown_script_timeout = 7200
|
||||
startup_script_behavior = "blocking"
|
||||
startup_script = file("startup.sh")
|
||||
shutdown_script = file("shutdown.sh")
|
||||
|
||||
# IDEA(mafredri): It would be pretty cool to define metadata to expect JSON output, each field/item could become a separate metadata item.
|
||||
# Scaletest metadata.
|
||||
metadata {
|
||||
display_name = "Scaletest status"
|
||||
|
@ -332,7 +468,7 @@ resource "coder_app" "grafana" {
|
|||
agent_id = coder_agent.main.id
|
||||
slug = "00-grafana"
|
||||
display_name = "Grafana"
|
||||
url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
|
||||
url = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
|
||||
icon = "https://grafana.com/static/assets/img/fav32.png"
|
||||
external = true
|
||||
}
|
||||
|
@ -409,7 +545,7 @@ resource "kubernetes_pod" "main" {
|
|||
}
|
||||
# Set the pod delete timeout to termination_grace_period_seconds + 1m.
|
||||
timeouts {
|
||||
delete = "32m"
|
||||
# The expression converts seconds to minutes, so the duration suffix must be
# "m" (e.g. (7200 + 120) / 60 = 122 -> "122m", not "122s").
delete = "${(local.workspace_pod_termination_grace_period_seconds + 120) / 60}m"
|
||||
}
|
||||
spec {
|
||||
security_context {
|
||||
|
@ -421,8 +557,9 @@ resource "kubernetes_pod" "main" {
|
|||
service_account_name = local.service_account_name
|
||||
|
||||
# Allow the coder agent to perform graceful shutdown and cleanup of
|
||||
# scaletest resources, 30 minutes (cleanup timeout) + 1 minute.
|
||||
termination_grace_period_seconds = 1860
|
||||
# scaletest resources. We add an extra minute to ensure work
|
||||
# completion is prioritized over timeout.
|
||||
termination_grace_period_seconds = local.workspace_pod_termination_grace_period_seconds + 60
|
||||
|
||||
container {
|
||||
name = "dev"
|
||||
|
@ -440,6 +577,24 @@ resource "kubernetes_pod" "main" {
|
|||
name = "CODER_AGENT_LOG_DIR"
|
||||
value = "${local.scaletest_run_dir}/logs"
|
||||
}
|
||||
env {
|
||||
name = "GRAFANA_API_TOKEN"
|
||||
value_from {
|
||||
secret_key_ref {
|
||||
name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name
|
||||
key = "token"
|
||||
}
|
||||
}
|
||||
}
|
||||
env {
|
||||
name = "SLACK_WEBHOOK_URL"
|
||||
value_from {
|
||||
secret_key_ref {
|
||||
name = data.kubernetes_secret.slack_scaletest_notifications_webhook_url.metadata[0].name
|
||||
key = "url"
|
||||
}
|
||||
}
|
||||
}
|
||||
resources {
|
||||
# Set requests and limits values such that we can do performant
|
||||
# execution of `coder scaletest` commands.
|
||||
|
@ -496,7 +651,7 @@ resource "kubernetes_pod" "main" {
|
|||
match_expressions {
|
||||
key = "cloud.google.com/gke-nodepool"
|
||||
operator = "In"
|
||||
values = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces.
|
||||
values = ["big-workspacetraffic"] # Avoid placing on the same nodes as scaletest workspaces.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -505,6 +660,20 @@ resource "kubernetes_pod" "main" {
|
|||
}
|
||||
}
|
||||
|
||||
data "kubernetes_secret" "grafana_editor_api_token" {
|
||||
metadata {
|
||||
name = "grafana-editor-api-token"
|
||||
namespace = data.coder_parameter.namespace.value
|
||||
}
|
||||
}
|
||||
|
||||
data "kubernetes_secret" "slack_scaletest_notifications_webhook_url" {
|
||||
metadata {
|
||||
name = "slack-scaletest-notifications-webhook-url"
|
||||
namespace = data.coder_parameter.namespace.value
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_manifest" "pod_monitor" {
|
||||
count = data.coder_workspace.me.start_count
|
||||
manifest = {
|
||||
|
|
|
@ -24,7 +24,7 @@ fi
|
|||
start_phase "Cleanup (${event})"
|
||||
coder exp scaletest cleanup \
|
||||
--cleanup-job-timeout 15m \
|
||||
--cleanup-timeout 30m |
|
||||
--cleanup-timeout 2h |
|
||||
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
|
||||
end_phase
|
||||
|
||||
|
|
|
@ -33,7 +33,13 @@ set_status() {
|
|||
if [[ ${DRY_RUN} == 1 ]]; then
|
||||
dry_run=" (dry-run)"
|
||||
fi
|
||||
prev_status=$(get_status)
|
||||
if [[ ${prev_status} != *"Not started"* ]]; then
|
||||
annotate_grafana_end "status" "Status: ${prev_status}"
|
||||
fi
|
||||
echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status"
|
||||
|
||||
annotate_grafana "status" "Status: ${*}"
|
||||
}
|
||||
lock_status() {
|
||||
chmod 0440 "${SCALETEST_STATE_DIR}/status"
|
||||
|
@ -51,25 +57,29 @@ phase_num=0
|
|||
start_phase() {
|
||||
# This may be incremented from another script, so we read it every time.
|
||||
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
|
||||
phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")"
|
||||
phase_num=$(grep -c START: "${SCALETEST_PHASE_FILE}")
|
||||
fi
|
||||
phase_num=$((phase_num + 1))
|
||||
log "Start phase ${phase_num}: ${*}"
|
||||
echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}"
|
||||
|
||||
GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana "phase" "Phase ${phase_num}: ${*}"
|
||||
}
|
||||
end_phase() {
|
||||
phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)"
|
||||
phase=$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)
|
||||
if [[ -z ${phase} ]]; then
|
||||
log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}"
|
||||
exit 1
|
||||
fi
|
||||
log "End phase ${phase_num}: ${phase}"
|
||||
echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}"
|
||||
|
||||
GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana_end "phase" "Phase ${phase_num}: ${phase}"
|
||||
}
|
||||
get_phase() {
|
||||
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
|
||||
phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")"
|
||||
phase="$(echo "${phase_raw}" | cut -d' ' -f3-)"
|
||||
phase_raw=$(tail -n1 "${SCALETEST_PHASE_FILE}")
|
||||
phase=$(echo "${phase_raw}" | cut -d' ' -f3-)
|
||||
if [[ ${phase_raw} == *"END:"* ]]; then
|
||||
phase+=" [done]"
|
||||
fi
|
||||
|
@ -86,9 +96,117 @@ get_previous_phase() {
|
|||
fi
|
||||
}
|
||||
|
||||
# annotate_grafana creates a Grafana annotation and records it in
# "${SCALETEST_STATE_DIR}/grafana-annotations" as "id:tags:text:start" so
# that annotate_grafana_end can find it again.
#
# Arguments:
#   $1  Extra tags, comma-separated (may be empty). "scaletest,runner" is
#       always prepended and GRAFANA_EXTRA_TAGS, when set, is appended.
#   $2  Annotation text.
#   $3  Start time in milliseconds since the Unix epoch (default: now).
#
# Environment:
#   GRAFANA_URL, GRAFANA_API_TOKEN  Endpoint and bearer token for the API.
#   GRAFANA_EXTRA_TAGS              Optional additional tags.
#   DRY_RUN                         When 1, only log the payload.
#   SCALETEST_STATE_DIR             Directory holding the annotation record.
#
# Returns 0 even when the request fails; annotation problems are logged and
# must not abort the scaletest.
annotate_grafana() {
	local tags=${1} text=${2} start=${3:-$(($(date +%s) * 1000))}
	local json resp id

	if [[ -z $tags ]]; then
		tags="scaletest,runner"
	else
		tags="scaletest,runner,${tags}"
	fi
	if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
		tags="${tags},${GRAFANA_EXTRA_TAGS}"
	fi

	log "Annotating Grafana (start=${start}): ${text} [${tags}]"

	# Let jq do the JSON encoding so that quotes in the text are escaped.
	json="$(
		jq \
			--argjson time "${start}" \
			--arg text "${text}" \
			--arg tags "${tags}" \
			'{time: $time, tags: $tags | split(","), text: $text}' <<<'{}'
	)"
	if [[ ${DRY_RUN} == 1 ]]; then
		log "Would have annotated Grafana, data=${json}"
		return 0
	fi

	# NOTE(review): --insecure disables TLS verification while a bearer token
	# is being sent; confirm this is intentional for the target Grafana.
	if ! resp="$(
		curl -sSL \
			--insecure \
			-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
			-H "Content-Type: application/json" \
			-d "${json}" \
			"${GRAFANA_URL}/api/annotations"
	)"; then
		# Don't abort scaletest just because we couldn't annotate Grafana.
		log "Failed to annotate Grafana: ${resp}"
		return 0
	fi

	if [[ $(jq -r '.message' <<<"${resp}") != "Annotation added" ]]; then
		log "Failed to annotate Grafana: ${resp}"
		return 0
	fi

	log "Grafana annotation added!"

	# mkdir -p is a no-op for an existing directory. The previous `-f` test
	# checked for a regular file and therefore never detected the directory.
	mkdir -p "${SCALETEST_STATE_DIR}"
	id="$(jq -r '.id' <<<"${resp}")"
	echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
}
|
||||
# annotate_grafana_end sets the end time on an annotation previously created
# by annotate_grafana, turning it into a region annotation.
#
# The annotation is looked up in "${SCALETEST_STATE_DIR}/grafana-annotations"
# by its "tags:text:start" record; when several records match, the one with
# the highest id is used.
#
# Arguments:
#   $1  Extra tags, exactly as passed to annotate_grafana (may be empty).
#   $2  Annotation text, exactly as passed to annotate_grafana.
#   $3  Start time in milliseconds (optional; empty matches any start).
#   $4  End time in milliseconds since the Unix epoch (default: now).
#
# Environment: GRAFANA_URL, GRAFANA_API_TOKEN, GRAFANA_EXTRA_TAGS, DRY_RUN,
# SCALETEST_STATE_DIR (same meaning as for annotate_grafana).
#
# Returns 0 even when the annotation cannot be found or patched; annotation
# problems are logged and must not abort the scaletest.
annotate_grafana_end() {
	local tags=${1} text=${2} start=${3:-} end=${4:-$(($(date +%s) * 1000))}
	local id json resp

	if [[ -z $tags ]]; then
		tags="scaletest,runner"
	else
		tags="scaletest,runner,${tags}"
	fi
	if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
		tags="${tags},${GRAFANA_EXTRA_TAGS}"
	fi

	if [[ ${DRY_RUN} == 1 ]]; then
		log "Would have updated Grafana annotation (end=${end}): ${text} [${tags}]"
		return 0
	fi

	# Use a fixed-string match (-F) so characters such as "." or "[" in the
	# text are not interpreted as a regular expression. The lookup result is
	# validated explicitly below, so this works with or without `pipefail`
	# and when the state file does not exist yet.
	id=$(
		grep -F -- ":${tags}:${text}:${start}" "${SCALETEST_STATE_DIR}/grafana-annotations" 2>/dev/null |
			sort -n |
			tail -n1 |
			cut -d: -f1
	) || true
	if [[ -z ${id} ]]; then
		log "NOTICE: Could not find Grafana annotation to end: '${tags}:${text}:${start}', skipping..."
		return 0
	fi

	log "Annotating Grafana (end=${end}): ${text} [${tags}]"

	json="$(
		jq \
			--argjson timeEnd "${end}" \
			'{timeEnd: $timeEnd}' <<<'{}'
	)"

	# NOTE(review): --insecure disables TLS verification while a bearer token
	# is being sent; confirm this is intentional for the target Grafana.
	if ! resp="$(
		curl -sSL \
			--insecure \
			-H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
			-H "Content-Type: application/json" \
			-X PATCH \
			-d "${json}" \
			"${GRAFANA_URL}/api/annotations/${id}"
	)"; then
		# Don't abort scaletest just because we couldn't annotate Grafana.
		log "Failed to annotate Grafana end: ${resp}"
		return 0
	fi

	if [[ $(jq -r '.message' <<<"${resp}") != "Annotation patched" ]]; then
		log "Failed to annotate Grafana end: ${resp}"
		return 0
	fi

	log "Grafana annotation patched!"
}
|
||||
|
||||
wait_baseline() {
|
||||
s=${1:-2}
|
||||
start_phase "Waiting ${s}m to establish baseline"
|
||||
PHASE_TYPE="phase-wait" start_phase "Waiting ${s}m to establish baseline"
|
||||
maybedryrun "$DRY_RUN" sleep $((s * 60))
|
||||
end_phase
|
||||
PHASE_TYPE="phase-wait" end_phase
|
||||
}
|
||||
|
|
|
@ -28,13 +28,6 @@ for dir in "${HOME}/scaletest-"*; do
|
|||
fi
|
||||
done
|
||||
|
||||
log "Cloning coder/coder repo..."
|
||||
|
||||
if [[ ! -d "${HOME}/coder" ]]; then
|
||||
git clone https://github.com/coder/coder.git "${HOME}/coder"
|
||||
fi
|
||||
(cd "${HOME}/coder" && git pull)
|
||||
|
||||
log "Creating coder CLI token (needed for cleanup during shutdown)..."
|
||||
|
||||
mkdir -p "${CODER_CONFIG_DIR}"
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
#!/bin/bash
# Posts a scaletest status report to Slack.
#
# Usage: report.sh <started|completed|failed>
#
# The message contains the Coder server version, a link to the runner
# workspace, the run ID, the workspace's external app links and, for
# "started", the SCALETEST_PARAM_* options of the run.
#
# Environment: SCRIPTS_DIR, CODER_URL, CODER_WORKSPACE, CODER_USER,
# SCALETEST_RUN_ID, SLACK_WEBHOOK_URL, DRY_RUN, VERBOSE (optional).
set -euo pipefail

# VERBOSE is optional; default it so `set -u` does not abort when it is unset.
[[ ${VERBOSE:-0} == 1 ]] && set -x

# Default to empty so a missing argument reaches the "Unknown status" branch
# instead of failing with an unbound variable error.
status=${1:-}
if (($# > 0)); then
	shift
fi

case "${status}" in
started) ;;
completed) ;;
failed) ;;
*)
	echo "Unknown status: ${status}" >&2
	exit 1
	;;
esac

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

# NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`.
# https://github.com/coder/coder/issues/9877
CODER_URL="${CODER_URL%/}"
buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")"
server_version="$(jq -r '.version' <<<"${buildinfo}")"
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"

# Since `coder show` doesn't support JSON output, we list the workspaces instead.
# The listing must always run for real, so dry-run is disabled for this call.
# NOTE(review): this was spelled DRYRUN, while every other script uses
# DRY_RUN; presumably lib.sh's dry-run handling keys on DRY_RUN - confirm.
workspace_json="$(DRY_RUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"

bullet='•'
# jq emits each external app as two lines: display name, then URL.
app_urls_raw="$(jq -r '.latest_build.resources[].agents[]?.apps | map(select(.external == true)) | .[] | .display_name, .url' <<<"${workspace_json}")"
app_urls=()
while read -r app_name; do
	# A name without a URL line means there were no external apps (the
	# here-string still yields one empty line); stop instead of letting the
	# failed read abort the script under `set -e`.
	read -r app_url || break
	bold=
	if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then
		# Update Grafana URL with end stamp and make bold.
		app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
		bold='*'
	fi
	app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
done <<<"${app_urls_raw}"

params=()
header=

case "${status}" in
started)
	created_at="$(jq -r '.latest_build.created_at' <<<"${workspace_json}")"
	params=("${bullet} Options:")
	while read -r param; do
		# Skip the empty line produced when no SCALETEST_PARAM_* is set.
		if [[ -n ${param} ]]; then
			params+=("  ${bullet} ${param}")
		fi
	done <<<"$(jq -r '.latest_build.resources[].agents[]?.environment_variables | to_entries | map(select(.key | startswith("SCALETEST_PARAM_"))) | .[] | "`\(.key)`: `\(.value)`"' <<<"${workspace_json}")"

	header="New scaletest started at \`${created_at}\` by \`${initiator_name}\` on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
	;;
completed)
	completed_at=$(date -Iseconds)
	header="Scaletest completed at \`${completed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
	;;
failed)
	failed_at=$(date -Iseconds)
	header="Scaletest failed at \`${failed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)."
	;;
*)
	echo "Unknown status: ${status}" >&2
	exit 1
	;;
esac

text_arr=(
	"${header}"
	""
	"${bullet} Workspace (runner): ${CODER_URL}/@${owner_name}/${workspace_name}"
	"${bullet} Run ID: ${SCALETEST_RUN_ID}"
	"${app_urls[@]}"
	"${params[@]}"
)

# Join the fields with newlines into one Slack mrkdwn text.
text=
for field in "${text_arr[@]}"; do
	text+="${field}"$'\n'
done

json=$(
	jq -n --arg text "${text}" '{
		blocks: [
			{
				"type": "section",
				"text": {
					"type": "mrkdwn",
					"text": $text
				}
			}
		]
	}'
)

maybedryrun "${DRY_RUN}" curl -X POST -H 'Content-type: application/json' --data "${json}" "${SLACK_WEBHOOK_URL}"
|
|
@ -6,54 +6,61 @@ set -euo pipefail
|
|||
# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
|
||||
. "${SCRIPTS_DIR}/lib.sh"
|
||||
|
||||
mapfile -t scaletest_load_scenarios < <(jq -r '. | join ("\n")' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}")
|
||||
export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}")
|
||||
|
||||
log "Running scaletest..."
|
||||
set_status Running
|
||||
|
||||
start_phase "Creating workspaces"
|
||||
coder exp scaletest create-workspaces \
|
||||
--count "${SCALETEST_NUM_WORKSPACES}" \
|
||||
--template "${SCALETEST_TEMPLATE}" \
|
||||
--concurrency "${SCALETEST_CREATE_CONCURRENCY}" \
|
||||
--job-timeout 15m \
|
||||
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
|
||||
--template "${SCALETEST_PARAM_TEMPLATE}" \
|
||||
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
|
||||
--job-timeout 2h \
|
||||
--no-cleanup \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
|
||||
end_phase
|
||||
|
||||
wait_baseline 5
|
||||
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
|
||||
|
||||
start_phase "SSH traffic"
|
||||
coder exp scaletest workspace-traffic \
|
||||
--ssh \
|
||||
--bytes-per-tick 10240 \
|
||||
--tick-interval 1s \
|
||||
--timeout 5m \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
|
||||
end_phase
|
||||
for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
|
||||
start_phase "Load scenario: ${scenario}"
|
||||
case "${scenario}" in
|
||||
"SSH Traffic")
|
||||
coder exp scaletest workspace-traffic \
|
||||
--ssh \
|
||||
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
|
||||
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \
|
||||
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
|
||||
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
|
||||
;;
|
||||
"Web Terminal Traffic")
|
||||
coder exp scaletest workspace-traffic \
|
||||
--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
|
||||
--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
|
||||
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
|
||||
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
|
||||
;;
|
||||
"Dashboard Traffic")
|
||||
coder exp scaletest dashboard \
|
||||
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
|
||||
--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
|
||||
--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
|
||||
>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
|
||||
;;
|
||||
esac
|
||||
end_phase
|
||||
|
||||
wait_baseline 5
|
||||
|
||||
start_phase "ReconnectingPTY traffic"
|
||||
coder exp scaletest workspace-traffic \
|
||||
--bytes-per-tick 10240 \
|
||||
--tick-interval 1s \
|
||||
--timeout 5m \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json"
|
||||
end_phase
|
||||
|
||||
wait_baseline 5
|
||||
|
||||
start_phase "Dashboard traffic"
|
||||
coder exp scaletest dashboard \
|
||||
--count "${SCALETEST_NUM_WORKSPACES}" \
|
||||
--job-timeout 5m \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
|
||||
end_phase
|
||||
|
||||
wait_baseline 5
|
||||
wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
|
||||
done
|
||||
|
||||
log "Scaletest complete!"
|
||||
set_status Complete
|
||||
|
|
|
@ -11,4 +11,8 @@ cleanup() {
|
|||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
annotate_grafana "workspace" "Agent stopping..."
|
||||
|
||||
"${SCRIPTS_DIR}/cleanup.sh" shutdown
|
||||
|
||||
annotate_grafana_end "workspace" "Agent running"
|
||||
|
|
|
@ -12,41 +12,63 @@ mkdir -p "${SCRIPTS_DIR}"
|
|||
unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}"
|
||||
rm /tmp/scripts.zip
|
||||
|
||||
echo "Cloning coder/coder repo..."
|
||||
if [[ ! -d "${HOME}/coder" ]]; then
|
||||
git clone https://github.com/coder/coder.git "${HOME}/coder"
|
||||
fi
|
||||
(cd "${HOME}/coder" && git pull)
|
||||
|
||||
# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
|
||||
. "${SCRIPTS_DIR}/lib.sh"
|
||||
|
||||
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
|
||||
|
||||
# Show failure in the UI if script exits with error.
|
||||
failed_status=Failed
|
||||
on_exit() {
|
||||
trap - ERR EXIT
|
||||
|
||||
case "${SCALETEST_CLEANUP_STRATEGY}" in
|
||||
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
|
||||
on_stop)
|
||||
# Handled by shutdown script.
|
||||
;;
|
||||
on_success)
|
||||
# Match as a substring: the failure status is recorded with a suffix such as
# " (exit=N)", so an exact comparison would treat a failed run as successful.
if [[ $(get_status) != *"${failed_status}"* ]]; then
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
|
||||
fi
|
||||
;;
|
||||
on_error)
|
||||
# Match as a substring: the failure status is recorded with a suffix such as
# " (exit=N)", so an exact comparison would never trigger cleanup on error.
if [[ $(get_status) = *"${failed_status}"* ]]; then
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
|
||||
;;
|
||||
esac
|
||||
|
||||
annotate_grafana_end "" "Start scaletest"
|
||||
}
|
||||
trap on_exit EXIT
|
||||
|
||||
on_err() {
|
||||
code=${?}
|
||||
trap - ERR
|
||||
set +e
|
||||
|
||||
log "Scaletest failed!"
|
||||
set_status "${failed_status}"
|
||||
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
|
||||
"${SCRIPTS_DIR}/report.sh" failed
|
||||
lock_status # Ensure we never rewrite the status after a failure.
|
||||
}
|
||||
trap on_err ERR
|
||||
|
||||
# Pass session token since `prepare.sh` has not yet run.
|
||||
CODER_SESSION_TOKEN=$CODER_USER_TOKEN "${SCRIPTS_DIR}/report.sh" started
|
||||
annotate_grafana "" "Start scaletest"
|
||||
|
||||
"${SCRIPTS_DIR}/prepare.sh"
|
||||
|
||||
"${SCRIPTS_DIR}/run.sh"
|
||||
|
||||
"${SCRIPTS_DIR}/report.sh" completed
|
||||
|
|
Loading…
Reference in New Issue