From bc97eaa41bc81d40b0adcdab961ca28d41dc0b66 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Fri, 15 Sep 2023 16:23:37 +0300 Subject: [PATCH] feat(scaletest): add scaletest-runner template (#9662) Closes #9571 --- cli/exp_scaletest.go | 2 +- scaletest/setup/scaletest-sa.yaml | 51 ++ .../templates/scaletest-runner/Dockerfile | 36 ++ .../templates/scaletest-runner/README.md | 9 + scaletest/templates/scaletest-runner/main.tf | 531 ++++++++++++++++++ .../scaletest-runner/metadata_phase.sh | 6 + .../metadata_previous_phase.sh | 6 + .../scaletest-runner/metadata_status.sh | 6 + .../scaletest-runner/scripts/cleanup.sh | 34 ++ .../templates/scaletest-runner/scripts/lib.sh | 94 ++++ .../scaletest-runner/scripts/prepare.sh | 57 ++ .../templates/scaletest-runner/scripts/run.sh | 59 ++ .../templates/scaletest-runner/shutdown.sh | 14 + .../templates/scaletest-runner/startup.sh | 52 ++ 14 files changed, 956 insertions(+), 1 deletion(-) create mode 100644 scaletest/setup/scaletest-sa.yaml create mode 100644 scaletest/templates/scaletest-runner/Dockerfile create mode 100644 scaletest/templates/scaletest-runner/README.md create mode 100644 scaletest/templates/scaletest-runner/main.tf create mode 100755 scaletest/templates/scaletest-runner/metadata_phase.sh create mode 100755 scaletest/templates/scaletest-runner/metadata_previous_phase.sh create mode 100755 scaletest/templates/scaletest-runner/metadata_status.sh create mode 100755 scaletest/templates/scaletest-runner/scripts/cleanup.sh create mode 100644 scaletest/templates/scaletest-runner/scripts/lib.sh create mode 100755 scaletest/templates/scaletest-runner/scripts/prepare.sh create mode 100755 scaletest/templates/scaletest-runner/scripts/run.sh create mode 100755 scaletest/templates/scaletest-runner/shutdown.sh create mode 100755 scaletest/templates/scaletest-runner/startup.sh diff --git a/cli/exp_scaletest.go b/cli/exp_scaletest.go index 7b0ffeacdb..e618a51e19 100644 --- a/cli/exp_scaletest.go +++ b/cli/exp_scaletest.go @@ -857,7 +857,7 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd { Flag: "use-host-login", Env: "CODER_SCALETEST_USE_HOST_LOGIN", Default: "false", - Description: "Use the use logged in on the host machine, instead of creating users.", + Description: "Use the user logged in on the host machine, instead of creating users.", Value: clibase.BoolOf(&useHostUser), }, } diff --git a/scaletest/setup/scaletest-sa.yaml b/scaletest/setup/scaletest-sa.yaml new file mode 100644 index 0000000000..96d9747d84 --- /dev/null +++ b/scaletest/setup/scaletest-sa.yaml @@ -0,0 +1,51 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scaletest-sa + namespace: coder-big +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: scaletest-role + namespace: coder-big +rules: + - apiGroups: + - "" + resources: ["*"] + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - deletecollection +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: scaletest-rolebinding + namespace: coder-big +subjects: + - kind: ServiceAccount + name: scaletest-sa +roleRef: + kind: Role + name: scaletest-role +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: coder-provisioner-podmonitor-rolebinding + namespace: coder-big +subjects: + - kind: ServiceAccount + name: coder-provisioner +roleRef: + kind: Role + name: coder-podmonitor +--- + diff --git a/scaletest/templates/scaletest-runner/Dockerfile 
b/scaletest/templates/scaletest-runner/Dockerfile new file mode 100644 index 0000000000..9aa016b534 --- /dev/null +++ b/scaletest/templates/scaletest-runner/Dockerfile @@ -0,0 +1,36 @@ +# This image is used to run scaletest jobs and, although it is inside +# the template directory, it is built separately and pushed to +# gcr.io/coder-dev-1/scaletest-runner:latest. +# +# Future improvements will include versioning and including the version +# in the template push. + +FROM codercom/enterprise-base:ubuntu + +ARG DEBIAN_FRONTEND=noninteractive + +USER root + +# TODO(mafredri): Remove unneeded dependencies once we have a clear idea of what's needed. +RUN wget --quiet -O /tmp/terraform.zip https://releases.hashicorp.com/terraform/1.5.7/terraform_1.5.7_linux_amd64.zip \ + && unzip /tmp/terraform.zip -d /usr/local/bin \ + && rm /tmp/terraform.zip \ + && terraform --version + +RUN wget --quiet -O /tmp/envsubst "https://github.com/a8m/envsubst/releases/download/v1.2.0/envsubst-$(uname -s)-$(uname -m)" \ + && chmod +x /tmp/envsubst \ + && mv /tmp/envsubst /usr/local/bin + +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \ + && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \ + && apt-get update \ + && apt-get install --yes \ + google-cloud-cli \ + jq \ + kubectl \ + zstd \ + && gcloud --version \ + && kubectl version --client \ + && rm -rf /var/lib/apt/lists/* + +USER coder diff --git a/scaletest/templates/scaletest-runner/README.md b/scaletest/templates/scaletest-runner/README.md new file mode 100644 index 0000000000..6c048211e1 --- /dev/null +++ b/scaletest/templates/scaletest-runner/README.md @@ -0,0 +1,9 @@ +--- +name: Scaletest Runner +description: Run a scaletest. +tags: [local] +--- + +# Scaletest Runner + +Run a scaletest. diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf new file mode 100644 index 0000000000..4802c98877 --- /dev/null +++ b/scaletest/templates/scaletest-runner/main.tf @@ -0,0 +1,531 @@ +terraform { + required_providers { + coder = { + source = "coder/coder" + version = "~> 0.11" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.22" + } + } +} + +resource "time_static" "start_time" { + # We con't set `count = data.coder_workspace.me.start_count` here because then + # we can't use this value in `locals`. The permission check is recreated on + # start, which will update the timestamp. + triggers = { + count : length(null_resource.permission_check) + } +} + +resource "null_resource" "permission_check" { + count = data.coder_workspace.me.start_count + + # Limit which users can create a workspace in this template. + # The "default" user and workspace are present because they are needed + # for the plan, and consequently, updating the template. + lifecycle { + precondition { + condition = can(regex("^(default/default|scaletest/runner)$", "${data.coder_workspace.me.owner}/${data.coder_workspace.me.name}")) + error_message = "User and workspace name is not allowed, expected 'scaletest/runner'." 
+ } + } +} + +locals { + workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + service_account_name = "scaletest-sa" + cpu = 2 + memory = 2 + home_disk_size = 10 + scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" + scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" +} + +data "coder_provisioner" "me" { +} + +data "coder_workspace" "me" { +} + +data "coder_parameter" "verbose" { + order = 1 + type = "bool" + name = "Verbose" + default = false + description = "Show debug output." + mutable = true + ephemeral = true +} + +data "coder_parameter" "dry_run" { + order = 2 + type = "bool" + name = "Dry-run" + default = true + description = "Perform a dry-run to see what would happen." + mutable = true + ephemeral = true +} + +data "coder_parameter" "create_concurrency" { + order = 10 + type = "number" + name = "Create concurrency" + default = 10 + description = "The number of workspaces to create concurrently." + mutable = true + + # Setting zero = unlimited, but perhaps not a good idea, + # we can raise this limit instead. + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "job_concurrency" { + order = 11 + type = "number" + name = "Job concurrency" + default = 10 + description = "The number of concurrent jobs (e.g. when producing workspace traffic)." + mutable = true + + # Setting zero = unlimited, but perhaps not a good idea, + # we can raise this limit instead. + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "cleanup_concurrency" { + order = 12 + type = "number" + name = "Cleanup concurrency" + default = 10 + description = "The number of concurrent cleanup jobs." + mutable = true + + # Setting zero = unlimited, but perhaps not a good idea, + # we can raise this limit instead. + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "cleanup_strategy" { + order = 13 + name = "Cleanup strategy" + default = "always" + description = "The strategy used to cleanup workspaces after the scaletest is complete." + mutable = true + ephemeral = true + option { + name = "Always" + value = "always" + description = "Automatically cleanup workspaces after the scaletest ends." + } + option { + name = "On stop" + value = "on_stop" + description = "Cleanup workspaces when the workspace is stopped." + } + option { + name = "On success" + value = "on_success" + description = "Automatically cleanup workspaces after the scaletest is complete if no error occurs." + } + option { + name = "On error" + value = "on_error" + description = "Automatically cleanup workspaces after the scaletest is complete if an error occurs." + } +} + + +data "coder_parameter" "workspace_template" { + order = 20 + name = "workspace_template" + display_name = "Workspace Template" + description = "The template used for workspace creation." + default = "kubernetes-minimal" + icon = "/emojis/1f4dc.png" # Scroll. + mutable = true + option { + name = "Minimal" + value = "kubernetes-minimal" # Feather. + icon = "/emojis/1fab6.png" + description = "Sized to fit approx. 32 per t2d-standard-8 instance." + } + option { + name = "Small" + value = "kubernetes-small" + icon = "/emojis/1f42d.png" # Mouse. + description = "Provisions a small-sized workspace with no persistent storage." + } + option { + name = "Medium" + value = "kubernetes-medium" + icon = "/emojis/1f436.png" # Dog. 
+ description = "Provisions a medium-sized workspace with no persistent storage." + } + option { + name = "Large" + value = "kubernetes-large" + icon = "/emojis/1f434.png" # Horse. + description = "Provisions a large-sized workspace with no persistent storage." + } +} + +data "coder_parameter" "num_workspaces" { + order = 21 + type = "number" + name = "Number of workspaces to create" + default = 100 + description = "The scaletest suite will create this number of workspaces." + mutable = true + + validation { + min = 0 + max = 1000 + } +} + +data "coder_parameter" "namespace" { + order = 999 + type = "string" + name = "Namespace" + default = "coder-big" + description = "The Kubernetes namespace to create the scaletest runner resources in." +} + +data "archive_file" "scripts_zip" { + type = "zip" + output_path = "${path.module}/scripts.zip" + source_dir = "${path.module}/scripts" +} + +resource "coder_agent" "main" { + arch = data.coder_provisioner.me.arch + dir = local.scaletest_run_dir + os = "linux" + env = { + VERBOSE : data.coder_parameter.verbose.value ? "1" : "0", + DRY_RUN : data.coder_parameter.dry_run.value ? "1" : "0", + CODER_CONFIG_DIR : "/home/coder/.config/coderv2", + CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token, + CODER_URL : data.coder_workspace.me.access_url, + + # Global scaletest envs that may affect each `coder exp scaletest` invocation. + CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112", + CODER_SCALETEST_PROMETHEUS_WAIT : "60s", + CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}", + CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}", + + # Local envs passed as arguments to `coder exp scaletest` invocations. + SCALETEST_RUN_ID : local.scaletest_run_id, + SCALETEST_RUN_DIR : local.scaletest_run_dir, + SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value, + SCALETEST_SKIP_CLEANUP : "1", + SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, + SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", + SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + + SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path), + SCRIPTS_DIR : "/tmp/scripts", + } + display_apps { + vscode = false + ssh_helper = false + } + startup_script_timeout = 3600 + shutdown_script_timeout = 1800 + startup_script_behavior = "blocking" + startup_script = file("startup.sh") + shutdown_script = file("shutdown.sh") + + # Scaletest metadata. + metadata { + display_name = "Scaletest status" + key = "00_scaletest_status" + script = file("metadata_status.sh") + interval = 1 + timeout = 1 + } + + metadata { + display_name = "Scaletest phase" + key = "01_scaletest_phase" + script = file("metadata_phase.sh") + interval = 1 + timeout = 1 + } + + metadata { + display_name = "Scaletest phase (previous)" + key = "02_scaletest_previous_phase" + script = file("metadata_previous_phase.sh") + interval = 1 + timeout = 1 + } + + # Misc workspace metadata. 
+ metadata { + display_name = "CPU Usage" + key = "80_cpu_usage" + script = "coder stat cpu" + interval = 10 + timeout = 1 + } + + metadata { + display_name = "RAM Usage" + key = "81_ram_usage" + script = "coder stat mem" + interval = 10 + timeout = 1 + } + + metadata { + display_name = "Home Disk" + key = "82_home_disk" + script = "coder stat disk --path $${HOME}" + interval = 60 + timeout = 1 + } + + metadata { + display_name = "CPU Usage (Host)" + key = "83_cpu_usage_host" + script = "coder stat cpu --host" + interval = 10 + timeout = 1 + } + + metadata { + display_name = "Memory Usage (Host)" + key = "84_mem_usage_host" + script = "coder stat mem --host" + interval = 10 + timeout = 1 + } + + metadata { + display_name = "Load Average (Host)" + key = "85_load_host" + # Get load avg scaled by number of cores. + script = <<-EOS + echo "`cat /proc/loadavg | awk '{ print $1 }'` `nproc`" | awk '{ printf "%0.2f", $1/$2 }' + EOS + interval = 60 + timeout = 1 + } +} + +resource "coder_app" "grafana" { + agent_id = coder_agent.main.id + slug = "00-grafana" + display_name = "Grafana" + url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now" + icon = "https://grafana.com/static/assets/img/fav32.png" + external = true +} + +resource "coder_app" "prometheus" { + agent_id = coder_agent.main.id + slug = "01-prometheus" + display_name = "Prometheus" + // https://stats.dev.c8s.io:9443/classic/graph?g0.range_input=2h&g0.end_input=2023-09-08%2015%3A58&g0.stacked=0&g0.expr=rate(pg_stat_database_xact_commit%7Bcluster%3D%22big%22%2Cdatname%3D%22big-coder%22%7D%5B1m%5D)&g0.tab=0 + url = "https://stats.dev.c8s.io:9443" + icon = "https://prometheus.io/assets/favicons/favicon-32x32.png" + external = true +} + +resource "coder_app" "manual_cleanup" { + agent_id = coder_agent.main.id + slug = "02-manual-cleanup" + display_name = "Manual cleanup" + icon = "/emojis/1f9f9.png" + command = "/tmp/scripts/cleanup.sh manual" +} + +resource "kubernetes_persistent_volume_claim" "home" { + depends_on = [null_resource.permission_check] + metadata { + name = "${local.workspace_pod_name}-home" + namespace = data.coder_parameter.namespace.value + labels = { + "app.kubernetes.io/name" = "coder-pvc" + "app.kubernetes.io/instance" = "coder-pvc-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + "app.kubernetes.io/part-of" = "coder" + // Coder specific labels. + "com.coder.resource" = "true" + "com.coder.workspace.id" = data.coder_workspace.me.id + "com.coder.workspace.name" = data.coder_workspace.me.name + "com.coder.user.id" = data.coder_workspace.me.owner_id + "com.coder.user.username" = data.coder_workspace.me.owner + } + annotations = { + "com.coder.user.email" = data.coder_workspace.me.owner_email + } + } + wait_until_bound = false + spec { + access_modes = ["ReadWriteOnce"] + resources { + requests = { + storage = "${local.home_disk_size}Gi" + } + } + } +} + +resource "kubernetes_pod" "main" { + depends_on = [null_resource.permission_check] + count = data.coder_workspace.me.start_count + metadata { + name = local.workspace_pod_name + namespace = data.coder_parameter.namespace.value + labels = { + "app.kubernetes.io/name" = "coder-workspace" + "app.kubernetes.io/instance" = local.workspace_pod_instance + "app.kubernetes.io/part-of" = "coder" + // Coder specific labels. 
+ "com.coder.resource" = "true" + "com.coder.workspace.id" = data.coder_workspace.me.id + "com.coder.workspace.name" = data.coder_workspace.me.name + "com.coder.user.id" = data.coder_workspace.me.owner_id + "com.coder.user.username" = data.coder_workspace.me.owner + } + annotations = { + "com.coder.user.email" = data.coder_workspace.me.owner_email + } + } + # Set the pod delete timeout to termination_grace_period_seconds + 1m. + timeouts { + delete = "32m" + } + spec { + security_context { + run_as_user = "1000" + fs_group = "1000" + } + + # Allow this pod to perform scale tests. + service_account_name = local.service_account_name + + # Allow the coder agent to perform graceful shutdown and cleanup of + # scaletest resources, 30 minutes (cleanup timeout) + 1 minute. + termination_grace_period_seconds = 1860 + + container { + name = "dev" + image = "gcr.io/coder-dev-1/scaletest-runner:latest" + image_pull_policy = "Always" + command = ["sh", "-c", coder_agent.main.init_script] + security_context { + run_as_user = "1000" + } + env { + name = "CODER_AGENT_TOKEN" + value = coder_agent.main.token + } + env { + name = "CODER_AGENT_LOG_DIR" + value = "${local.scaletest_run_dir}/logs" + } + resources { + # Set requests and limits values such that we can do performant + # execution of `coder scaletest` commands. + requests = { + "cpu" = "250m" + "memory" = "512Mi" + } + limits = { + "cpu" = "${local.cpu}" + "memory" = "${local.memory}Gi" + } + } + volume_mount { + mount_path = "/home/coder" + name = "home" + read_only = false + } + port { + container_port = 21112 + name = "prometheus-http" + protocol = "TCP" + } + } + + volume { + name = "home" + persistent_volume_claim { + claim_name = kubernetes_persistent_volume_claim.home.metadata.0.name + read_only = false + } + } + + affinity { + pod_anti_affinity { + // This affinity attempts to spread out all workspace pods evenly across + // nodes. + preferred_during_scheduling_ignored_during_execution { + weight = 1 + pod_affinity_term { + topology_key = "kubernetes.io/hostname" + label_selector { + match_expressions { + key = "app.kubernetes.io/name" + operator = "In" + values = ["coder-workspace"] + } + } + } + } + } + node_affinity { + required_during_scheduling_ignored_during_execution { + node_selector_term { + match_expressions { + key = "cloud.google.com/gke-nodepool" + operator = "In" + values = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces. + } + } + } + } + } + } +} + +resource "kubernetes_manifest" "pod_monitor" { + count = data.coder_workspace.me.start_count + manifest = { + apiVersion = "monitoring.coreos.com/v1" + kind = "PodMonitor" + metadata = { + namespace = data.coder_parameter.namespace.value + name = "podmonitor-${local.workspace_pod_name}" + } + spec = { + selector = { + matchLabels = { + "app.kubernetes.io/instance" : local.workspace_pod_instance + } + } + podMetricsEndpoints = [ + { + port = "prometheus-http" + interval = "15s" + } + ] + } + } +} diff --git a/scaletest/templates/scaletest-runner/metadata_phase.sh b/scaletest/templates/scaletest-runner/metadata_phase.sh new file mode 100755 index 0000000000..755a8ba084 --- /dev/null +++ b/scaletest/templates/scaletest-runner/metadata_phase.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. 
"${SCRIPTS_DIR}/lib.sh" + +get_phase diff --git a/scaletest/templates/scaletest-runner/metadata_previous_phase.sh b/scaletest/templates/scaletest-runner/metadata_previous_phase.sh new file mode 100755 index 0000000000..c858687b72 --- /dev/null +++ b/scaletest/templates/scaletest-runner/metadata_previous_phase.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" 2>/dev/null || return + +get_previous_phase diff --git a/scaletest/templates/scaletest-runner/metadata_status.sh b/scaletest/templates/scaletest-runner/metadata_status.sh new file mode 100755 index 0000000000..8ec45f0875 --- /dev/null +++ b/scaletest/templates/scaletest-runner/metadata_status.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" 2>/dev/null || return + +get_status diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh new file mode 100755 index 0000000000..a6d29211a0 --- /dev/null +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -euo pipefail + +[[ $VERBOSE == 1 ]] && set -x + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +event=${1:-} + +if [[ -z $event ]]; then + event=manual +fi + +if [[ $event = manual ]]; then + echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) ' + read -r -n 1 + if [[ $REPLY != [yY] ]]; then + echo $'\nAborting...' + exit 1 + fi +fi + +start_phase "Cleanup (${event})" +coder exp scaletest cleanup \ + --cleanup-job-timeout 15m \ + --cleanup-timeout 30m | + tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" +end_phase + +if [[ $event = manual ]]; then + echo 'Press any key to continue...' + read -s -r -n 1 +fi diff --git a/scaletest/templates/scaletest-runner/scripts/lib.sh b/scaletest/templates/scaletest-runner/scripts/lib.sh new file mode 100644 index 0000000000..d392d09681 --- /dev/null +++ b/scaletest/templates/scaletest-runner/scripts/lib.sh @@ -0,0 +1,94 @@ +#!/bin/bash +set -euo pipefail + +# Only source this script once, this env comes from sourcing +# scripts/lib.sh from coder/coder below. +if [[ ${SCRIPTS_LIB_IS_SOURCED:-0} == 1 ]]; then + return 0 +fi + +# Source scripts/lib.sh from coder/coder for common functions. +# shellcheck source=scripts/lib.sh +. "${HOME}/coder/scripts/lib.sh" + +# Make shellcheck happy. +DRY_RUN=${DRY_RUN:-0} + +# Environment variables shared between scripts. +SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state" +SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase" +# shellcheck disable=SC2034 +SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results" + +coder() { + maybedryrun "${DRY_RUN}" command coder "${@}" +} + +show_json() { + maybedryrun "${DRY_RUN}" jq 'del(.. | .logs?)' "${1}" +} + +set_status() { + dry_run= + if [[ ${DRY_RUN} == 1 ]]; then + dry_run=" (dry-ryn)" + fi + echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status" +} +lock_status() { + chmod 0440 "${SCALETEST_STATE_DIR}/status" +} +get_status() { + # Order of importance (reverse of creation). + if [[ -f "${SCALETEST_STATE_DIR}/status" ]]; then + tail -n1 "${SCALETEST_STATE_DIR}/status" | cut -d' ' -f2- + else + echo "Not started" + fi +} + +phase_num=0 +start_phase() { + # This may be incremented from another script, so we read it every time. 
+ if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then + phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")" + fi + phase_num=$((phase_num + 1)) + log "Start phase ${phase_num}: ${*}" + echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}" +} +end_phase() { + phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)" + if [[ -z ${phase} ]]; then + log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}" + exit 1 + fi + log "End phase ${phase_num}: ${phase}" + echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}" +} +get_phase() { + if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then + phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")" + phase="$(echo "${phase_raw}" | cut -d' ' -f3-)" + if [[ ${phase_raw} == *"END:"* ]]; then + phase+=" [done]" + fi + echo "${phase}" + else + echo "None" + fi +} +get_previous_phase() { + if [[ -f "${SCALETEST_PHASE_FILE}" ]] && [[ $(grep -c START: "${SCALETEST_PHASE_FILE}") -gt 1 ]]; then + grep START: "${SCALETEST_PHASE_FILE}" | tail -n2 | head -n1 | cut -d' ' -f3- + else + echo "None" + fi +} + +wait_baseline() { + s=${1:-2} + start_phase "Waiting ${s}m to establish baseline" + maybedryrun "$DRY_RUN" sleep $((s * 60)) + end_phase +} diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh new file mode 100755 index 0000000000..f6fbcb7dd3 --- /dev/null +++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -euo pipefail + +[[ $VERBOSE == 1 ]] && set -x + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +mkdir -p "${SCALETEST_STATE_DIR}" +mkdir -p "${SCALETEST_RESULTS_DIR}" + +log "Preparing scaletest workspace environment..." +set_status Preparing + +log "Compressing previous run logs (if applicable)..." +mkdir -p "${HOME}/archive" +for dir in "${HOME}/scaletest-"*; do + if [[ ${dir} = "${SCALETEST_RUN_DIR}" ]]; then + continue + fi + if [[ -d ${dir} ]]; then + name="$(basename "${dir}")" + ( + cd "$(dirname "${dir}")" + ZSTD_CLEVEL=12 maybedryrun "$DRY_RUN" tar --zstd -cf "${HOME}/archive/${name}.tar.zst" "${name}" + ) + maybedryrun "$DRY_RUN" rm -rf "${dir}" + fi +done + +log "Cloning coder/coder repo..." + +if [[ ! -d "${HOME}/coder" ]]; then + git clone https://github.com/coder/coder.git "${HOME}/coder" +fi +(cd "${HOME}/coder" && git pull) + +log "Creating coder CLI token (needed for cleanup during shutdown)..." + +mkdir -p "${CODER_CONFIG_DIR}" +echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url" + +set +x # Avoid logging the token. +# Persist configuration for shutdown script too since the +# owner token is invalidated immediately on workspace stop. +export CODER_SESSION_TOKEN=$CODER_USER_TOKEN +coder tokens delete scaletest_runner >/dev/null 2>&1 || true +# TODO(mafredri): Set TTL? This could interfere with delayed stop though. +token=$(coder tokens create --name scaletest_runner) +unset CODER_SESSION_TOKEN +echo -n "${token}" >"${CODER_CONFIG_DIR}/session" +[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled). + +log "Cleaning up from previous runs (if applicable)..." +"${SCRIPTS_DIR}/cleanup.sh" "prepare" + +log "Preparation complete!" 
diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh new file mode 100755 index 0000000000..7ebf8c4310 --- /dev/null +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -euo pipefail + +[[ $VERBOSE == 1 ]] && set -x + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +log "Running scaletest..." +set_status Running + +start_phase "Creating workspaces" +coder exp scaletest create-workspaces \ + --count "${SCALETEST_NUM_WORKSPACES}" \ + --template "${SCALETEST_TEMPLATE}" \ + --concurrency "${SCALETEST_CREATE_CONCURRENCY}" \ + --job-timeout 15m \ + --no-cleanup \ + --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" +show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" +end_phase + +wait_baseline 5 + +start_phase "SSH traffic" +coder exp scaletest workspace-traffic \ + --ssh \ + --bytes-per-tick 10240 \ + --tick-interval 1s \ + --timeout 5m \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" +show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" +end_phase + +wait_baseline 5 + +start_phase "ReconnectingPTY traffic" +coder exp scaletest workspace-traffic \ + --bytes-per-tick 10240 \ + --tick-interval 1s \ + --timeout 5m \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json" +show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json" +end_phase + +wait_baseline 5 + +start_phase "Dashboard traffic" +coder exp scaletest dashboard \ + --count "${SCALETEST_NUM_WORKSPACES}" \ + --job-timeout 5m \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" +show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" +end_phase + +wait_baseline 5 + +log "Scaletest complete!" +set_status Complete diff --git a/scaletest/templates/scaletest-runner/shutdown.sh b/scaletest/templates/scaletest-runner/shutdown.sh new file mode 100755 index 0000000000..fe621afe4c --- /dev/null +++ b/scaletest/templates/scaletest-runner/shutdown.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +[[ $VERBOSE == 1 ]] && set -x + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +cleanup() { + coder tokens remove scaletest_runner >/dev/null 2>&1 || true +} +trap cleanup EXIT + +"${SCRIPTS_DIR}/cleanup.sh" shutdown diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh new file mode 100755 index 0000000000..0d7c8fb144 --- /dev/null +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -euo pipefail + +[[ $VERBOSE == 1 ]] && set -x + +# Unzip scripts and add to path. +# shellcheck disable=SC2153 +echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..." +base64 -d <<<"${SCRIPTS_ZIP}" >/tmp/scripts.zip +rm -rf "${SCRIPTS_DIR}" || true +mkdir -p "${SCRIPTS_DIR}" +unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}" +rm /tmp/scripts.zip + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +# Show failure in the UI if script exits with error. +failed_status=Failed +on_exit() { + trap - ERR EXIT + + case "${SCALETEST_CLEANUP_STRATEGY}" in + on_stop) + # Handled by shutdown script. 
+ ;; + on_success) + if [[ $(get_status) != "${failed_status}" ]]; then + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + fi + ;; + on_error) + if [[ $(get_status) = "${failed_status}" ]]; then + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + fi + ;; + *) + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + ;; + esac +} +trap on_exit EXIT + +on_err() { + log "Scaletest failed!" + set_status "${failed_status}" + lock_status # Ensure we never rewrite the status after a failure. +} +trap on_err ERR + +"${SCRIPTS_DIR}/prepare.sh" +"${SCRIPTS_DIR}/run.sh"