feat(scaletest): create automated pprof dumps during scaletest (#9887)

Mathias Fredriksson authored 2023-09-27 15:58:43 +03:00, committed by GitHub
parent fad02081fc
commit 68738771b9
7 changed files with 57 additions and 12 deletions

View File

@@ -37,7 +37,7 @@ resource "null_resource" "permission_check" {
locals {
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
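As a quick sanity check (bash used only for illustration), the new grace-period expression evaluates to 18000 seconds, i.e. the 5 hours that the cleanup timeout below is raised to:
echo $((5 * 60 * 60)) # 18000 seconds = 5 hours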

View File

@@ -23,8 +23,8 @@ fi
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 15m \
--cleanup-timeout 2h |
--cleanup-job-timeout 2h \
--cleanup-timeout 5h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase
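The cleanup timeouts are raised in step with the pod's termination grace period above, presumably so the runner pod is allowed to live at least as long as the whole cleanup can take. A hypothetical sanity check of that relationship (not part of the scripts):
# Hypothetical: per-job timeout <= overall cleanup timeout <= pod grace period.
job_timeout=$((2 * 60 * 60))     # --cleanup-job-timeout 2h
cleanup_timeout=$((5 * 60 * 60)) # --cleanup-timeout 5h
grace_period=$((5 * 60 * 60))    # workspace_pod_termination_grace_period_seconds
((job_timeout <= cleanup_timeout && cleanup_timeout <= grace_period)) &&
  echo "timeouts are consistent"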

View File

@@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
# shellcheck disable=SC2034
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"
SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof"
mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}"
coder() {
maybedryrun "${DRY_RUN}" command coder "${@}"
@@ -142,9 +145,6 @@ annotate_grafana() {
log "Grafana annotation added!"
if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then
mkdir -p "${SCALETEST_STATE_DIR}"
fi
id="$(jq -r '.id' <<<"${resp}")"
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
}
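Each Grafana annotation is recorded as a colon-separated line (id:tags:text:start) in the state directory. The consumer of that file (annotate_grafana_end) is not part of this diff; a minimal sketch of how an entry could be read back, assuming none of the fields contain a colon:
# Hypothetical reader, not code from this PR.
while IFS=: read -r id tags text start; do
  echo "annotation ${id}: tags=${tags} text=${text} start=${start}"
done <"${SCALETEST_STATE_DIR}/grafana-annotations"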

View File

@@ -36,10 +36,13 @@ echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url"
set +x # Avoid logging the token.
# Persist configuration for shutdown script too since the
# owner token is invalidated immediately on workspace stop.
export CODER_SESSION_TOKEN=$CODER_USER_TOKEN
export CODER_SESSION_TOKEN=${CODER_USER_TOKEN}
coder tokens delete scaletest_runner >/dev/null 2>&1 || true
# TODO(mafredri): Set TTL? This could interfere with delayed stop though.
token=$(coder tokens create --name scaletest_runner)
if [[ $DRY_RUN == 1 ]]; then
token=${CODER_SESSION_TOKEN}
fi
unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
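In a dry run the `coder tokens create` call presumably goes through the dry-run wrapper and never reaches the server, so the existing session token is reused as the value to persist. How the shutdown side consumes the persisted files is not shown in this diff; a hypothetical sketch, assuming CODER_CONFIG_DIR points at the same directory:
# Hypothetical shutdown-side counterpart (not in this diff): read the
# persisted URL and token back, since the owner token is already
# invalidated by the time the workspace stops.
export CODER_URL="$(<"${CODER_CONFIG_DIR}/url")"
export CODER_SESSION_TOKEN="$(<"${CODER_CONFIG_DIR}/session")"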

View File

@@ -27,7 +27,11 @@ server_version="$(jq -r '.version' <<<"${buildinfo}")"
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"
# Since `coder show` doesn't support JSON output, we list the workspaces instead.
workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
# Use `command` here to bypass dry run.
workspace_json="$(
command coder list --all --output json |
jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]'
)"
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
@@ -43,7 +47,7 @@ while read -r app_name; do
app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
bold='*'
fi
app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
app_urls+=("${bullet} ${bold}${app_name}${bold}: ${app_url}")
done <<<"${app_urls_raw}"
params=()
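The "Use `command` here to bypass dry run" comment relies on the `coder` shell function defined earlier, which routes every invocation through maybedryrun; prefixing with `command` skips the function, so the list call always runs and its JSON output can be fed to jq even in a dry run. The helper lives in the runner's library script and is not part of this diff; an assumed sketch of its shape:
# Assumed shape only (not the actual implementation): log the command
# instead of running it when the dry-run flag is 1.
maybedryrun() {
  local dry_run=$1
  shift
  if [[ ${dry_run} == 1 ]]; then
    echo "DRYRUN: $*" >&2
  else
    "$@"
  fi
}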

View File

@@ -17,7 +17,7 @@ coder exp scaletest create-workspaces \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--job-timeout 2h \
--job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
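The create-workspaces job timeout grows from 2h to 5h, matching the other timeouts in this change. A hypothetical sizing example (numbers purely illustrative) of why a large run can need that much headroom:
# Illustrative only: 1000 workspaces at a create concurrency of 10 and
# ~3 minutes per build is roughly 1000 / 10 * 3 = 300 minutes, i.e. 5 hours.
echo "$((1000 / 10 * 3)) minutes"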

View File

@@ -23,22 +23,58 @@ fi
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
{
pids=()
ports=()
declare -A pods=()
next_port=6061
for pod in $(kubectl get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do
maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" &
pids+=($!)
ports+=("${next_port}")
pods[${next_port}]="${pod}"
next_port=$((next_port + 1))
done
trap 'trap - EXIT; kill -INT "${pids[@]}"; exit 1' INT EXIT
while :; do
sleep 285 # ~300 when accounting for profile and trace.
log "Grabbing pprof dumps"
start="$(date +%s)"
annotate_grafana "pprof" "Grab pprof dumps (start=${start})"
for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
for port in "${ports[@]}"; do
tidy_type="${type//\?/_}"
tidy_type="${tidy_type//=/_}"
maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}"
done
done
annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})"
done
} &
pprof_pid=$!
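The background loop port-forwards every pod matching the coder label and, roughly every five minutes, saves each /debug/pprof endpoint to a file named pprof-<type>-<pod>-<timestamp>.gz; the two parameter substitutions turn query strings such as profile?seconds=10 into filename-safe profile_seconds_10. The dumps can later be inspected with the standard Go tooling, for example (filenames are illustrative):
go tool pprof -top "${SCALETEST_PPROF_DIR}/pprof-heap-coder-abc123-1695800000.gz"
go tool pprof -http=:8080 "${SCALETEST_PPROF_DIR}/pprof-profile_seconds_10-coder-abc123-1695800000.gz"
go tool trace "${SCALETEST_PPROF_DIR}/pprof-trace_seconds_5-coder-abc123-1695800000.gz"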
# Show failure in the UI if script exits with error.
failed_status=Failed
on_exit() {
code=${?}
trap - ERR EXIT
set +e
kill -INT "${pprof_pid}"
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
on_stop)
# Handled by shutdown script.
;;
on_success)
if [[ $(get_status) != "${failed_status}" ]]; then
if ((code == 0)); then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
on_error)
if [[ $(get_status) = "${failed_status}" ]]; then
if ((code > 0)); then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
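on_exit now branches on the script's own exit code rather than the recorded status: inside an EXIT trap, ${?} is the status the script is terminating with. A minimal, self-contained illustration of that mechanism (not code from this PR):
#!/usr/bin/env bash
set -euo pipefail
on_exit() {
  code=${?}
  if ((code == 0)); then
    echo "success: would run cleanup for the on_success strategy"
  else
    echo "failure (exit=${code}): would run cleanup for the on_error strategy"
  fi
}
trap on_exit EXIT
false # fails under set -e, so on_exit sees code=1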
@@ -60,6 +96,8 @@ on_err() {
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
"${SCRIPTS_DIR}/report.sh" failed
lock_status # Ensure we never rewrite the status after a failure.
exit "${code}"
}
trap on_err ERR
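Because on_err ends by exiting with the failing code, a failure reports itself through the ERR trap first and then flows into on_exit via the EXIT trap, which is what kills the pprof port-forwards. A minimal illustration of that trap ordering (again not code from this PR):
#!/usr/bin/env bash
set -eu
trap 'echo "ERR: report failure (exit=$?)"; exit 1' ERR
trap 'echo "EXIT: kill pprof port-forwards, run cleanup strategy"' EXIT
false # the ERR handler fires first; its exit then fires the EXIT handler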