#!/usr/bin/env bash
# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.
- set -euo pipefail
- . misc/shlib/shlib.bash
- builder=${BUILDKITE_PLUGIN_MZCOMPOSE_CI_BUILDER:-min}
- mzcompose() {
- stdbuf --output=L --error=L bin/ci-builder run "$builder" bin/mzcompose --find "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION" "$@"
- }
- faketty() {
- script -qfc "$(printf "%q " "$@")" /dev/null
- }
- service=${BUILDKITE_PLUGIN_MZCOMPOSE_RUN:-default}
- run_args=("$service")
- if read_list BUILDKITE_PLUGIN_MZCOMPOSE_ARGS; then
- for arg in "${result[@]}"; do
- run_args+=("$arg")
- done
- fi
- STEP_START_TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
- # Clean up cores here so that just killed processes' core files are ignored
- cores="$HOME"/cores
- rm -rf "$cores" parallel-workload-queries.log parallel-workload-queries.log.zst
- mkdir -m 777 "$cores"
- # Max 128 characters, so don't use $PWD which will make it too long
- # Ignore SIGABRT
- sudo sysctl -w kernel.core_pattern="|/usr/bin/ci-filter-core.sh %s $cores/core.%E.%t"
- # 3 attempts to download the ci-builder
- bin/ci-builder run "$builder" echo || bin/ci-builder run "$builder" echo || bin/ci-builder run "$builder" echo
- # Start dependencies under a different heading so that the main heading is less
- # noisy. But not if the service is actually a workflow, in which case it will
- # do its own dependency management.
- # Don't use `grep -q`! It will stop the `grep` process before mzcompose might
- # be finished, thus mzcompose can fail with `write /dev/stdout: broken pipe`.
- # Since we have `pipefail` set in this script, this would lead to a failure and
- # we would attempt to bring up the workflow, which will fail with `no such
- # service: default`.
- if ! mzcompose --mz-quiet list-workflows | grep "$service" > /dev/null; then
- ci_collapsed_heading ":docker: Starting dependencies"
- mzcompose up -d --scale "$service=0" "$service"
- fi
- if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
- ci_uncollapsed_heading ":docker: Fetching binaries for coverage"
- mzcompose create
- mkdir -p coverage/
- chmod 777 coverage/
- # Not all tests contain all of these containers:
- mzcompose --mz-quiet cp sqllogictest:/usr/local/bin/sqllogictest coverage/ || true
- mzcompose --mz-quiet cp sqllogictest:/usr/local/bin/clusterd coverage/ || true
- mzcompose --mz-quiet cp materialized:/usr/local/bin/environmentd coverage/ || true
- mzcompose --mz-quiet cp materialized:/usr/local/bin/clusterd coverage/ || true
- mzcompose --mz-quiet cp testdrive:/usr/local/bin/testdrive coverage/ || true
- mzcompose --mz-quiet cp balancerd:/usr/local/bin/balancerd coverage/ || true
- fi
- if is_truthy "${CI_HEAP_PROFILES:-}"; then
- (while true; do
- sleep 5
- # faketty because otherwise docker will complain about not being inside
- # of a TTY when run in a background job
- faketty bin/ci-builder run stable bin/ci-upload-heap-profiles "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION"
- done
- ) &
- fi
- EXTRA_ARGS=$(echo "${CI_EXTRA_ARGS:-}" | jq -r ".[\"$BUILDKITE_STEP_KEY\"] // \"\"")
- TEST_CMD=""
- if [ "${BUILDKITE_PARALLEL_JOB_COUNT:-1}" -gt 1 ]; then
- TEST_CMD+="BUILDKITE_PARALLEL_JOB=$BUILDKITE_PARALLEL_JOB BUILDKITE_PARALLEL_JOB_COUNT=$BUILDKITE_PARALLEL_JOB_COUNT "
- fi
- if [ "${CI_SYSTEM_PARAMETERS:-}" = "random" ]; then
- TEST_CMD+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS CI_SYSTEM_PARAMETERS_SEED=${CI_SYSTEM_PARAMETERS_SEED:-$BUILDKITE_JOB_ID} "
- elif [ "${CI_SYSTEM_PARAMETERS:-}" = "minimal" ]; then
- TEST_CMD+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS "
- fi
- TEST_CMD+="bin/mzcompose --find $BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION run ${run_args[*]} $EXTRA_ARGS"
- TEST_DESC="$(mzcompose description)"
- ci_uncollapsed_heading ":docker: Running \`$TEST_CMD\`"
- echo "$TEST_DESC"
- cleanup() {
- # Buildkite exposes no way to check if a test timed out (and wasn't cancelled manually), so we have to calculate it ourselves
- START_TIME=$(date -d "$STEP_START_TIMESTAMP" +%s)
- END_TIME=$(date +%s)
- ELAPSED=$((END_TIME - START_TIME))
- if [ $ELAPSED -ge $((BUILDKITE_TIMEOUT * 60)) ]; then
- printf "\n%s" "$BUILDKITE_LABEL: test timed out" >> run.log
- fi
- echo "--- Post command steps"
- # Run before potential "run down" in coverage
- docker ps --all --quiet | xargs --no-run-if-empty docker inspect | jq '
- .[]
- | .Config.Env = ["[REDACTED]"]
- | .Config.Cmd = ["[REDACTED]"]
- | .Config.Entrypoint = ["[REDACTED]"]
- | .Args = ["[REDACTED]"]' > docker-inspect.log
- # services.log might already exist and contain logs from before composition was downed
- time=0
- if [ -f services.log ]; then
- # Don't capture log lines we received already
- time=$(date +%s -r services.log)
- fi
- mzcompose logs --no-color --timestamps --since "$time" >> services.log
- # Sort services.log and remove the timestamps we added to prevent having duplicate timestamps in output. For reference:
- # https://github.com/moby/moby/issues/33673
- # https://github.com/moby/moby/issues/31706
- sort -t"|" -k2 < services.log | sed -E "s/ \| [0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]\:[0-5][0-9]:[0-6][0-9]\.[0-9]{9}Z / \| /" > services-sorted.log
- mv services-sorted.log services.log
- # shellcheck disable=SC2024
- sudo journalctl --merge --since "$STEP_START_TIMESTAMP" > journalctl-merge.log
- netstat -ant > netstat-ant.log
- netstat -panelot > netstat-panelot.log
- ps aux | sed -E "s/\S*mzp_\S*/[REDACTED]/g" > ps-aux.log
- docker stats --all --no-stream > docker-stats.log
- mv "$cores" . || true
- if find cores -name 'core.*' | grep -q .; then
- # Best effort attempt to fetch interesting executables to get backtrace of core files
- mzcompose cp sqllogictest:/usr/local/bin/sqllogictest cores/ || true
- mzcompose cp sqllogictest:/usr/local/bin/clusterd cores/ || true
- mzcompose cp materialized:/usr/local/bin/environmentd cores/ || true
- mzcompose cp materialized:/usr/local/bin/clusterd cores/ || true
- mzcompose cp materialized:/usr/local/bin/materialized cores/ || true
- mzcompose cp balancerd:/usr/local/bin/balancerd cores/ || true
- mzcompose cp testdrive:/usr/local/bin/testdrive cores/ || true
- fi
- echo "Downing docker containers"
- mzcompose down --volumes || true # Ignore failures, we still want the rest of the cleanup
- echo "Finding core files"
- find cores -name 'core.*' | while read -r core; do
- exe=$(echo "$core" | sed -e "s/core\.\(.*\)\.[0-9]*/\1/" -e "s/.*\!//")
- # Core dumps can take a while to be written, so if extracting the info fails, try again later
- bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || (sleep 2m; bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || true)
- buildkite-agent artifact upload "$core".txt
- done
- # can be huge, clean up
- rm -rf cores
- echo "Compressing parallel-workload-queries.log"
- bin/ci-builder run "$builder" zstd --rm parallel-workload-queries.log || true
- mapfile -t artifacts < <(printf "run.log\nservices.log\njournalctl-merge.log\nnetstat-ant.log\nnetstat-panelot.log\nps-aux.log\ndocker-inspect.log\n"; find . -name 'junit_*.xml'; find mz_debug_* -name '*.log')
- artifacts_str=$(IFS=";"; echo "${artifacts[*]}")
- echo "--- Running trufflehog to scan artifacts for secrets & uploading artifacts"
- {
- bin/ci-builder run "$builder" trufflehog --no-update --no-verification --json --exclude-detectors=coda,dockerhub,box,npmtoken,github,snykkey,eightxeight,sumologickey,miro,fmfw,logzio,qase filesystem "${artifacts[@]}" | trufflehog_jq_filter_logs > trufflehog.log
- } &
- unset CI_EXTRA_ARGS # We don't want extra args for the annotation
- # Continue even if ci-annotate-errors fails
- CI_ANNOTATE_ERRORS_RESULT=0
- # We have to upload artifacts before ci-annotate-errors, so that the annotations can link to the artifacts
- {
- buildkite-agent artifact upload "$artifacts_str" || true
- } &
- wait
- echo "--- Annotating errors"
- bin/ci-builder run "$builder" bin/ci-annotate-errors --test-cmd="$TEST_CMD" --test-desc="$TEST_DESC" --test-result="$TEST_RESULT" "${artifacts[@]}" trufflehog.log > ci-annotate-errors.log || CI_ANNOTATE_ERRORS_RESULT=$?
- buildkite-agent artifact upload "ci-annotate-errors.log"
- export_cov() {
- bin/ci-builder run stable rust-cov export \
- --ignore-filename-regex=.cargo/ \
- --ignore-filename-regex=target/release/ \
- --ignore-filename-regex=/cargo/ \
- --ignore-filename-regex=/mnt/build/ \
- --ignore-filename-regex=/rustc/ \
- --format=lcov "$1" --instr-profile=coverage/"$BUILDKITE_JOB_ID".profdata src/ \
- > coverage/"$BUILDKITE_JOB_ID"-"$(basename "$1")".lcov
- }
- if [ -n "${CI_COVERAGE_ENABLED:-}" ] && [ -z "${BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE:-}" ]; then
- echo "Generating coverage information"
- if [ -n "$(find . -name '*.profraw')" ]; then
- # Workaround for "invalid instrumentation profile data (file header is corrupt)"
- rm -rf profraws
- mkdir profraws
- find . -name '*.profraw' | while read -r i; do
- cp "$i" profraws
- rm "$i"
- bin/ci-builder run stable rust-profdata show profraws/"$(basename "$i")" > /dev/null || rm profraws/"$(basename "$i")"
- done
- find profraws -name '*.profraw' -exec bin/ci-builder run stable rust-profdata merge -sparse -o coverage/"$BUILDKITE_JOB_ID".profdata {} +
- find . -name '*.profraw' -delete
- ARGS=()
- for program in clusterd environmentd balancerd sqllogictest testdrive; do
- if [ -f coverage/"$program" ]; then
- export_cov coverage/"$program"
- ARGS+=("-a" coverage/"$BUILDKITE_JOB_ID"-"$program".lcov)
- fi
- done
- rm coverage/"$BUILDKITE_JOB_ID".profdata
- if [ "${#ARGS[@]}" != 0 ]; then
- bin/ci-builder run stable lcov "${ARGS[@]}" -o coverage/"$BUILDKITE_JOB_ID".lcov
- rm coverage/"$BUILDKITE_JOB_ID"-*.lcov
- bin/ci-builder run stable zstd coverage/"$BUILDKITE_JOB_ID".lcov
- buildkite-agent artifact upload coverage/"$BUILDKITE_JOB_ID".lcov.zst
- fi
- fi
- fi
- ci_unimportant_heading ":docker: Cleaning up after mzcompose"
- # docker-compose kill may fail attempting to kill containers
- # that have just exited on their own because of the
- # "shared-fate" mechanism employed by Mz clusters
- sudo systemctl restart docker
- killall -9 clusterd || true # There might be remaining processes from a cargo-test run
- if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
- find . -name '*.profraw' -delete # Remove remaining profraw files from coverage runs
- fi
- ci_collapsed_heading ":docker: Purging all existing docker containers and volumes, regardless of origin"
- docker ps --all --quiet | xargs --no-run-if-empty docker rm --force --volumes
- if [ "$BUILDKITE_STEP_KEY" = "terraform-aws" ]; then
- mzcompose run aws-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
- elif [ "$BUILDKITE_STEP_KEY" = "terraform-aws-upgrade" ]; then
- mzcompose run aws-upgrade --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
- elif [ "$BUILDKITE_STEP_KEY" = "terraform-gcp" ]; then
- mzcompose run gcp-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
- elif [ "$BUILDKITE_STEP_KEY" = "terraform-azure" ]; then
- mzcompose run azure-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
- fi
- rm -rf ~/.kube # Remove potential state from E2E Terraform tests
- echo "Removing mz-debug files"
- find . -maxdepth 1 -type d -name 'mz_debug*' -exec rm -r {} +
- if [ ! -s services.log ] && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of persist" ] && [ "$BUILDKITE_LABEL" != "Long single-node Maelstrom coverage of persist" ] && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of txn-wal" ] && [ "$BUILDKITE_LABEL" != "Mz E2E Test" ] && [ "$BUILDKITE_LABEL" != "Output consistency (version for DFR)" ] && [ "$BUILDKITE_LABEL" != "Output consistency (version for CTF)" ] && [ "$BUILDKITE_LABEL" != "QA Canary Environment Base Load" ] && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Canary Environment" ] && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] && [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]]; then
- echo "+++ services.log is empty, failing"
- exit 1
- fi
- rm -f services.log
- exit "$CI_ANNOTATE_ERRORS_RESULT"
- }
- trap cleanup EXIT SIGTERM SIGINT
- TEST_RESULT=0
- # sed command to filter out ANSI command codes in run.log, while keeping them in Buildkite's view
- { mzcompose run "${run_args[@]}" |& tee >(sed -r "s/\x1B\[[0-9;]*[A-Za-z]//g" > run.log); } || TEST_RESULT=$?
- if [ "$TEST_RESULT" != "0" ]; then
- # Give the logs some time to log panics, otherwise they might be missing later
- sleep 10
- fi
|