mirror of https://github.com/coder/coder.git
feat(scaletest): create automated pprof dumps during scaletest (#9887)
This commit is contained in:
parent
fad02081fc
commit
68738771b9
|
@ -37,7 +37,7 @@ resource "null_resource" "permission_check" {
|
|||
locals {
|
||||
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
|
||||
workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
|
||||
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
|
||||
service_account_name = "scaletest-sa"
|
||||
cpu = 16
|
||||
memory = 64
|
||||
|
|
|
@ -23,8 +23,8 @@ fi
|
|||
|
||||
start_phase "Cleanup (${event})"
|
||||
coder exp scaletest cleanup \
|
||||
--cleanup-job-timeout 15m \
|
||||
--cleanup-timeout 2h |
|
||||
--cleanup-job-timeout 2h \
|
||||
--cleanup-timeout 5h |
|
||||
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
|
||||
end_phase
|
||||
|
||||
|
|
|
@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
|
|||
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
|
||||
# shellcheck disable=SC2034
|
||||
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"
|
||||
SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof"
|
||||
|
||||
mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}"
|
||||
|
||||
coder() {
|
||||
maybedryrun "${DRY_RUN}" command coder "${@}"
|
||||
|
@ -142,9 +145,6 @@ annotate_grafana() {
|
|||
|
||||
log "Grafana annotation added!"
|
||||
|
||||
if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then
|
||||
mkdir -p "${SCALETEST_STATE_DIR}"
|
||||
fi
|
||||
id="$(jq -r '.id' <<<"${resp}")"
|
||||
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
|
||||
}
|
||||
|
|
|
@ -36,10 +36,13 @@ echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url"
|
|||
set +x # Avoid logging the token.
|
||||
# Persist configuration for shutdown script too since the
|
||||
# owner token is invalidated immediately on workspace stop.
|
||||
export CODER_SESSION_TOKEN=$CODER_USER_TOKEN
|
||||
export CODER_SESSION_TOKEN=${CODER_USER_TOKEN}
|
||||
coder tokens delete scaletest_runner >/dev/null 2>&1 || true
|
||||
# TODO(mafredri): Set TTL? This could interfere with delayed stop though.
|
||||
token=$(coder tokens create --name scaletest_runner)
|
||||
if [[ $DRY_RUN == 1 ]]; then
|
||||
token=${CODER_SESSION_TOKEN}
|
||||
fi
|
||||
unset CODER_SESSION_TOKEN
|
||||
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
|
||||
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
|
||||
|
|
|
@ -27,7 +27,11 @@ server_version="$(jq -r '.version' <<<"${buildinfo}")"
|
|||
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"
|
||||
|
||||
# Since `coder show` doesn't support JSON output, we list the workspaces instead.
|
||||
workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
|
||||
# Use `command` here to bypass dry run.
|
||||
workspace_json="$(
|
||||
command coder list --all --output json |
|
||||
jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]'
|
||||
)"
|
||||
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
|
||||
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
|
||||
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
|
||||
|
@ -43,7 +47,7 @@ while read -r app_name; do
|
|||
app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
|
||||
bold='*'
|
||||
fi
|
||||
app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
|
||||
app_urls+=("${bullet} ${bold}${app_name}${bold}: ${app_url}")
|
||||
done <<<"${app_urls_raw}"
|
||||
|
||||
params=()
|
||||
|
|
|
@ -17,7 +17,7 @@ coder exp scaletest create-workspaces \
|
|||
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
|
||||
--template "${SCALETEST_PARAM_TEMPLATE}" \
|
||||
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
|
||||
--job-timeout 2h \
|
||||
--job-timeout 5h \
|
||||
--no-cleanup \
|
||||
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
|
||||
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
|
||||
|
|
|
@ -23,22 +23,58 @@ fi
|
|||
|
||||
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
|
||||
|
||||
{
	# Background pprof dumper.
	#
	# Forwards the pprof port (6060) of every coder pod to its own local
	# port, then loops forever: wait, then download a set of pprof
	# profiles from each forwarded port into SCALETEST_PPROF_DIR. The
	# group runs in the background; its PID is stored in pprof_pid so
	# that it can be signalled later.
	pids=()            # PIDs of the background port-forward commands.
	ports=()           # Local ports, in allocation order.
	declare -A pods=() # Local port -> pod name (used in dump file names).
	next_port=6061
	sleep_pid= # PID of the in-flight sleep, empty when not sleeping.

	# NOTE(review): pods are listed without an explicit namespace while the
	# port-forward below uses `-n coder-big`; confirm the current context
	# defaults to that namespace.
	for pod in $(kubectl get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do
		maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" &
		pids+=($!)
		ports+=("${next_port}")
		pods[${next_port}]="${pod}"
		next_port=$((next_port + 1))
	done

	# Stop the port-forwards (and any pending sleep) when this background
	# group is interrupted or exits for any other reason.
	stop_pprof() {
		trap - INT EXIT
		# Guard against an empty list: a bare `kill -INT` is a usage error.
		if ((${#pids[@]} > 0)); then
			kill -INT "${pids[@]}" || true
		fi
		if [[ -n ${sleep_pid} ]]; then
			kill "${sleep_pid}" 2>/dev/null || true
		fi
		exit 1
	}
	trap stop_pprof INT EXIT

	while :; do
		# ~300 when accounting for profile and trace.
		#
		# Sleep in the background and `wait` for it: bash defers a trap
		# until the current foreground command finishes, but `wait`
		# returns immediately when a trapped signal arrives, so the
		# cleanup above runs promptly instead of up to 285s later.
		sleep 285 &
		sleep_pid=$!
		wait "${sleep_pid}" || true
		sleep_pid=

		log "Grabbing pprof dumps"
		start="$(date +%s)"
		annotate_grafana "pprof" "Grab pprof dumps (start=${start})"
		for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
			# File-name friendly variant of the endpoint, e.g.
			# "profile?seconds=10" -> "profile_seconds_10".
			tidy_type="${type//\?/_}"
			tidy_type="${tidy_type//=/_}"
			for port in "${ports[@]}"; do
				maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}"
			done
		done
		annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})"
	done
} &
pprof_pid=$!
|
||||
|
||||
# Show failure in the UI if script exits with error.
|
||||
failed_status=Failed
|
||||
on_exit() {
|
||||
code=${?}
|
||||
trap - ERR EXIT
|
||||
set +e
|
||||
|
||||
kill -INT "${pprof_pid}"
|
||||
|
||||
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
|
||||
on_stop)
|
||||
# Handled by shutdown script.
|
||||
;;
|
||||
on_success)
|
||||
if [[ $(get_status) != "${failed_status}" ]]; then
|
||||
if ((code == 0)); then
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
|
||||
fi
|
||||
;;
|
||||
on_error)
|
||||
if [[ $(get_status) = "${failed_status}" ]]; then
|
||||
if ((code > 0)); then
|
||||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
|
||||
fi
|
||||
;;
|
||||
|
@ -60,6 +96,8 @@ on_err() {
|
|||
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
|
||||
"${SCRIPTS_DIR}/report.sh" failed
|
||||
lock_status # Ensure we never rewrite the status after a failure.
|
||||
|
||||
exit "${code}"
|
||||
}
|
||||
trap on_err ERR
|
||||
|
||||
|
|
Loading…
Reference in New Issue