feat(scaletest): create automated pprof dumps during scaletest (#9887)

Mathias Fredriksson authored 2023-09-27 15:58:43 +03:00, committed by GitHub
parent fad02081fc
commit 68738771b9
7 changed files with 57 additions and 12 deletions

View File

@@ -37,7 +37,7 @@ resource "null_resource" "permission_check" {
locals {
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
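As a quick sanity check (bash used only for illustration), the new grace-period expression evaluates to 18000 seconds, i.e. the 5 hours that the cleanup timeout below is raised to:
echo $((5 * 60 * 60)) # 18000 seconds = 5 hours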

View File

@@ -23,8 +23,8 @@ fi
start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
--cleanup-job-timeout 15m \
--cleanup-timeout 2h |
--cleanup-job-timeout 2h \
--cleanup-timeout 5h |
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase
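The cleanup timeouts are raised in step with the pod's termination grace period above, presumably so the runner pod is allowed to live at least as long as the whole cleanup can take. A hypothetical sanity check of that relationship (not part of the scripts):
# Hypothetical: per-job timeout <= overall cleanup timeout <= pod grace period.
job_timeout=$((2 * 60 * 60))     # --cleanup-job-timeout 2h
cleanup_timeout=$((5 * 60 * 60)) # --cleanup-timeout 5h
grace_period=$((5 * 60 * 60))    # workspace_pod_termination_grace_period_seconds
((job_timeout <= cleanup_timeout && cleanup_timeout <= grace_period)) &&
  echo "timeouts are consistent"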

View File

@@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
# shellcheck disable=SC2034
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"
SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof"
mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}"
coder() {
maybedryrun "${DRY_RUN}" command coder "${@}"
@@ -142,9 +145,6 @@ annotate_grafana() {
log "Grafana annotation added!"
if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then
mkdir -p "${SCALETEST_STATE_DIR}"
fi
id="$(jq -r '.id' <<<"${resp}")"
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
}
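Each Grafana annotation is recorded as a colon-separated line (id:tags:text:start) in the state directory. The consumer of that file (annotate_grafana_end) is not part of this diff; a minimal sketch of how an entry could be read back, assuming none of the fields contain a colon:
# Hypothetical reader, not code from this PR.
while IFS=: read -r id tags text start; do
  echo "annotation ${id}: tags=${tags} text=${text} start=${start}"
done <"${SCALETEST_STATE_DIR}/grafana-annotations"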

View File

@@ -36,10 +36,13 @@ echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url"
set +x # Avoid logging the token.
# Persist configuration for shutdown script too since the
# owner token is invalidated immediately on workspace stop.
export CODER_SESSION_TOKEN=$CODER_USER_TOKEN
export CODER_SESSION_TOKEN=${CODER_USER_TOKEN}
coder tokens delete scaletest_runner >/dev/null 2>&1 || true
# TODO(mafredri): Set TTL? This could interfere with delayed stop though.
token=$(coder tokens create --name scaletest_runner)
if [[ $DRY_RUN == 1 ]]; then
token=${CODER_SESSION_TOKEN}
fi
unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
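In a dry run the `coder tokens create` call presumably goes through the dry-run wrapper and never reaches the server, so the existing session token is reused as the value to persist. How the shutdown side consumes the persisted files is not shown in this diff; a hypothetical sketch, assuming CODER_CONFIG_DIR points at the same directory:
# Hypothetical shutdown-side counterpart (not in this diff): read the
# persisted URL and token back, since the owner token is already
# invalidated by the time the workspace stops.
export CODER_URL="$(<"${CODER_CONFIG_DIR}/url")"
export CODER_SESSION_TOKEN="$(<"${CODER_CONFIG_DIR}/session")"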

View File

@@ -27,7 +27,11 @@ server_version="$(jq -r '.version' <<<"${buildinfo}")"
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"
# Since `coder show` doesn't support JSON output, we list the workspaces instead.
workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
# Use `command` here to bypass dry run.
workspace_json="$(
command coder list --all --output json |
jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]'
)"
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
@@ -43,7 +47,7 @@ while read -r app_name; do
app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
bold='*'
fi
app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
app_urls+=("${bullet} ${bold}${app_name}${bold}: ${app_url}")
done <<<"${app_urls_raw}"
params=()
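The "Use `command` here to bypass dry run" comment relies on the `coder` shell function defined earlier, which routes every invocation through maybedryrun; prefixing with `command` skips the function, so the list call always runs and its JSON output can be fed to jq even in a dry run. The helper lives in the runner's library script and is not part of this diff; an assumed sketch of its shape:
# Assumed shape only (not the actual implementation): log the command
# instead of running it when the dry-run flag is 1.
maybedryrun() {
  local dry_run=$1
  shift
  if [[ ${dry_run} == 1 ]]; then
    echo "DRYRUN: $*" >&2
  else
    "$@"
  fi
}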

View File

@@ -17,7 +17,7 @@ coder exp scaletest create-workspaces \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
--job-timeout 2h \
--job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
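The create-workspaces job timeout grows from 2h to 5h, matching the other timeouts in this change. A hypothetical sizing example (numbers purely illustrative) of why a large run can need that much headroom:
# Illustrative only: 1000 workspaces at a create concurrency of 10 and
# ~3 minutes per build is roughly 1000 / 10 * 3 = 300 minutes, i.e. 5 hours.
echo "$((1000 / 10 * 3)) minutes"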

View File

@@ -23,22 +23,58 @@ fi
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
{
pids=()
ports=()
declare -A pods=()
next_port=6061
for pod in $(kubectl get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do
maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" &
pids+=($!)
ports+=("${next_port}")
pods[${next_port}]="${pod}"
next_port=$((next_port + 1))
done
trap 'trap - EXIT; kill -INT "${pids[@]}"; exit 1' INT EXIT
while :; do
sleep 285 # ~300 when accounting for profile and trace.
log "Grabbing pprof dumps"
start="$(date +%s)"
annotate_grafana "pprof" "Grab pprof dumps (start=${start})"
for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
for port in "${ports[@]}"; do
tidy_type="${type//\?/_}"
tidy_type="${tidy_type//=/_}"
maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}"
done
done
annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})"
done
} &
pprof_pid=$!
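The background loop port-forwards every pod matching the coder label and, roughly every five minutes, saves each /debug/pprof endpoint to a file named pprof-<type>-<pod>-<timestamp>.gz; the two parameter substitutions turn query strings such as profile?seconds=10 into filename-safe profile_seconds_10. The dumps can later be inspected with the standard Go tooling, for example (filenames are illustrative):
go tool pprof -top "${SCALETEST_PPROF_DIR}/pprof-heap-coder-abc123-1695800000.gz"
go tool pprof -http=:8080 "${SCALETEST_PPROF_DIR}/pprof-profile_seconds_10-coder-abc123-1695800000.gz"
go tool trace "${SCALETEST_PPROF_DIR}/pprof-trace_seconds_5-coder-abc123-1695800000.gz"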
# Show failure in the UI if script exits with error.
failed_status=Failed
on_exit() {
code=${?}
trap - ERR EXIT
set +e
kill -INT "${pprof_pid}"
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
on_stop)
# Handled by shutdown script.
;;
on_success)
if [[ $(get_status) != "${failed_status}" ]]; then
if ((code == 0)); then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
on_error)
if [[ $(get_status) = "${failed_status}" ]]; then
if ((code > 0)); then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
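on_exit now branches on the script's own exit code rather than the recorded status: inside an EXIT trap, ${?} is the status the script is terminating with. A minimal, self-contained illustration of that mechanism (not code from this PR):
#!/usr/bin/env bash
set -euo pipefail
on_exit() {
  code=${?}
  if ((code == 0)); then
    echo "success: would run cleanup for the on_success strategy"
  else
    echo "failure (exit=${code}): would run cleanup for the on_error strategy"
  fi
}
trap on_exit EXIT
false # fails under set -e, so on_exit sees code=1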
@@ -60,6 +96,8 @@ on_err() {
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
"${SCRIPTS_DIR}/report.sh" failed
lock_status # Ensure we never rewrite the status after a failure.
exit "${code}"
}
trap on_err ERR
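Because on_err ends by exiting with the failing code, a failure reports itself through the ERR trap first and then flows into on_exit via the EXIT trap, which is what kills the pprof port-forwards. A minimal illustration of that trap ordering (again not code from this PR):
#!/usr/bin/env bash
set -eu
trap 'echo "ERR: report failure (exit=$?)"; exit 1' ERR
trap 'echo "EXIT: kill pprof port-forwards, run cleanup strategy"' EXIT
false # the ERR handler fires first; its exit then fires the EXIT handler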