#!/usr/bin/env bash

# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

# Runs one mzcompose service or workflow inside the ci-builder image and, on
# exit, collects logs/core dumps/coverage, uploads them as Buildkite artifacts
# and annotates errors.
#
# Environment read by this script:
#   BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION  composition passed to `--find` (required)
#   BUILDKITE_PLUGIN_MZCOMPOSE_RUN          service/workflow to run (default: "default")
#   BUILDKITE_PLUGIN_MZCOMPOSE_ARGS         extra arguments, read via `read_list`
#   BUILDKITE_PLUGIN_MZCOMPOSE_CI_BUILDER   ci-builder flavor (default: "min")
#   CI_COVERAGE_ENABLED, CI_HEAP_PROFILES, CI_EXTRA_ARGS, CI_SYSTEM_PARAMETERS,
#   CI_SYSTEM_PARAMETERS_SEED, BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE
#   BUILDKITE_STEP_KEY, BUILDKITE_LABEL, BUILDKITE_JOB_ID, BUILDKITE_TIMEOUT,
#   BUILDKITE_PARALLEL_JOB, BUILDKITE_PARALLEL_JOB_COUNT

set -euo pipefail

. misc/shlib/shlib.bash

builder=${BUILDKITE_PLUGIN_MZCOMPOSE_CI_BUILDER:-min}

# Runs bin/mzcompose for the configured composition inside the ci-builder
# image, line-buffering stdout/stderr. All arguments are forwarded.
mzcompose() {
    stdbuf --output=L --error=L bin/ci-builder run "$builder" bin/mzcompose --find "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION" "$@"
}

# Runs the given command line through `script`, shell-quoting each argument
# and discarding the typescript to /dev/null.
faketty() {
    script -qfc "$(printf "%q " "$@")" /dev/null
}

service=${BUILDKITE_PLUGIN_MZCOMPOSE_RUN:-default}
run_args=("$service")
if read_list BUILDKITE_PLUGIN_MZCOMPOSE_ARGS; then
    for arg in "${result[@]}"; do
        run_args+=("$arg")
    done
fi

STEP_START_TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")

# Clean up cores here so that just killed processes' core files are ignored
cores="$HOME"/cores
rm -rf "$cores" parallel-workload-queries.log parallel-workload-queries.log.zst
mkdir -m 777 "$cores"
# Max 128 characters, so don't use $PWD which will make it too long
# Ignore SIGABRT
sudo sysctl -w kernel.core_pattern="|/usr/bin/ci-filter-core.sh %s $cores/core.%E.%t"

# 3 attempts to download the ci-builder
bin/ci-builder run "$builder" echo || bin/ci-builder run "$builder" echo || bin/ci-builder run "$builder" echo

# Start dependencies under a different heading so that the main heading is less
# noisy. But not if the service is actually a workflow, in which case it will
# do its own dependency management.

# Don't use `grep -q`! It will stop the `grep` process before mzcompose might
# be finished, thus mzcompose can fail with `write /dev/stdout: broken pipe`.
# Since we have `pipefail` set in this script, this would lead to a failure and
# we would attempt to bring up the workflow, which will fail with `no such
# service: default`.
if ! mzcompose --mz-quiet list-workflows | grep "$service" > /dev/null; then
    ci_collapsed_heading ":docker: Starting dependencies"
    mzcompose up -d --scale "$service=0" "$service"
fi

if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
    ci_uncollapsed_heading ":docker: Fetching binaries for coverage"
    mzcompose create
    mkdir -p coverage/
    chmod 777 coverage/
    # Not all tests contain all of these containers:
    mzcompose --mz-quiet cp sqllogictest:/usr/local/bin/sqllogictest coverage/ || true
    mzcompose --mz-quiet cp sqllogictest:/usr/local/bin/clusterd coverage/ || true
    mzcompose --mz-quiet cp materialized:/usr/local/bin/environmentd coverage/ || true
    mzcompose --mz-quiet cp materialized:/usr/local/bin/clusterd coverage/ || true
    mzcompose --mz-quiet cp testdrive:/usr/local/bin/testdrive coverage/ || true
    mzcompose --mz-quiet cp balancerd:/usr/local/bin/balancerd coverage/ || true
fi

if is_truthy "${CI_HEAP_PROFILES:-}"; then
    (while true; do
        sleep 5
        # faketty because otherwise docker will complain about not being inside
        # of a TTY when run in a background job
        faketty bin/ci-builder run stable bin/ci-upload-heap-profiles "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION"
    done
    ) &
fi

# Look up this step's extra arguments in the CI_EXTRA_ARGS JSON object. The key
# is passed via --arg instead of being spliced into the jq program, so a step
# key containing quotes or backslashes cannot break the filter.
EXTRA_ARGS=$(echo "${CI_EXTRA_ARGS:-}" | jq -r --arg key "$BUILDKITE_STEP_KEY" '.[$key] // ""')

# TEST_CMD is a human-readable reproduction command; it is only displayed and
# handed to ci-annotate-errors, never executed by this script.
TEST_CMD=""
if [ "${BUILDKITE_PARALLEL_JOB_COUNT:-1}" -gt 1 ]; then
    TEST_CMD+="BUILDKITE_PARALLEL_JOB=$BUILDKITE_PARALLEL_JOB BUILDKITE_PARALLEL_JOB_COUNT=$BUILDKITE_PARALLEL_JOB_COUNT "
fi
if [ "${CI_SYSTEM_PARAMETERS:-}" = "random" ]; then
    TEST_CMD+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS CI_SYSTEM_PARAMETERS_SEED=${CI_SYSTEM_PARAMETERS_SEED:-$BUILDKITE_JOB_ID} "
elif [ "${CI_SYSTEM_PARAMETERS:-}" = "minimal" ]; then
    TEST_CMD+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS "
fi
TEST_CMD+="bin/mzcompose --find $BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION run ${run_args[*]} $EXTRA_ARGS"
TEST_DESC="$(mzcompose description)"

ci_uncollapsed_heading ":docker: Running \`$TEST_CMD\`"
echo "$TEST_DESC"

# Set once cleanup has begun, so that it runs at most one time even though it
# is registered for EXIT as well as SIGTERM/SIGINT and itself calls `exit`.
cleanup_started=""

# Post-run handler (installed via `trap` below): gathers diagnostics, uploads
# artifacts, annotates errors, exports coverage, tears down docker state, and
# exits with the ci-annotate-errors result (or 1 if services.log is
# unexpectedly empty).
# Globals read: STEP_START_TIMESTAMP, cores, builder, TEST_CMD, TEST_DESC,
#               TEST_RESULT, BUILDKITE_* / CI_* variables.
cleanup() {
    # Re-entrancy guard: a signal-triggered run ends in `exit`, which fires the
    # EXIT trap again. A second pass would find services.log already deleted
    # and fail spuriously.
    if [ -n "${cleanup_started:-}" ]; then
        return 0
    fi
    cleanup_started=1

    # Buildkite exposes no way to check if a test timed out (and wasn't cancelled manually), so we have to calculate it ourselves
    # BUILDKITE_TIMEOUT may be unset or non-numeric (Buildkite documents the
    # value `false` when the step has no timeout); under `set -u` that would
    # abort the arithmetic below, so only do the check for a positive integer.
    local timeout_minutes=${BUILDKITE_TIMEOUT:-}
    if [[ "$timeout_minutes" =~ ^[0-9]+$ ]] && [ "$timeout_minutes" -gt 0 ]; then
        START_TIME=$(date -d "$STEP_START_TIMESTAMP" +%s)
        END_TIME=$(date +%s)
        ELAPSED=$((END_TIME - START_TIME))
        # 10# forces base 10 so a value with a leading zero is not read as octal
        if [ "$ELAPSED" -ge $((10#$timeout_minutes * 60)) ]; then
            printf "\n%s" "$BUILDKITE_LABEL: test timed out" >> run.log
        fi
    fi

    echo "--- Post command steps"
    # Run before potential "run down" in coverage
    docker ps --all --quiet | xargs --no-run-if-empty docker inspect | jq '
        .[]
        | .Config.Env = ["[REDACTED]"]
        | .Config.Cmd = ["[REDACTED]"]
        | .Config.Entrypoint = ["[REDACTED]"]
        | .Args = ["[REDACTED]"]' > docker-inspect.log
    # services.log might already exist and contain logs from before composition was downed
    time=0
    if [ -f services.log ]; then
        # Don't capture log lines we received already
        time=$(date +%s -r services.log)
    fi
    mzcompose logs --no-color --timestamps --since "$time" >> services.log
    # Sort services.log and remove the timestamps we added to prevent having duplicate timestamps in output. For reference:
    # https://github.com/moby/moby/issues/33673
    # https://github.com/moby/moby/issues/31706
    sort -t"|" -k2 < services.log | sed -E "s/ \| [0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]\:[0-5][0-9]:[0-6][0-9]\.[0-9]{9}Z / \| /" > services-sorted.log
    mv services-sorted.log services.log
    # shellcheck disable=SC2024
    sudo journalctl --merge --since "$STEP_START_TIMESTAMP" > journalctl-merge.log
    netstat -ant > netstat-ant.log
    netstat -panelot > netstat-panelot.log
    ps aux | sed -E "s/\S*mzp_\S*/[REDACTED]/g" > ps-aux.log
    docker stats --all --no-stream > docker-stats.log

    mv "$cores" . || true

    if find cores -name 'core.*' | grep -q .; then
        # Best effort attempt to fetch interesting executables to get backtrace of core files
        mzcompose cp sqllogictest:/usr/local/bin/sqllogictest cores/ || true
        mzcompose cp sqllogictest:/usr/local/bin/clusterd cores/ || true
        mzcompose cp materialized:/usr/local/bin/environmentd cores/ || true
        mzcompose cp materialized:/usr/local/bin/clusterd cores/ || true
        mzcompose cp materialized:/usr/local/bin/materialized cores/ || true
        mzcompose cp balancerd:/usr/local/bin/balancerd cores/ || true
        mzcompose cp testdrive:/usr/local/bin/testdrive cores/ || true
    fi

    echo "Downing docker containers"
    mzcompose down --volumes || true  # Ignore failures, we still want the rest of the cleanup

    echo "Finding core files"
    find cores -name 'core.*' | while read -r core; do
        # Derive the executable name from the core file name: strip the
        # "core." prefix and numeric suffix, then everything up to the last "!".
        exe=$(echo "$core" | sed -e "s/core\.\(.*\)\.[0-9]*/\1/" -e "s/.*\!//")
        # Core dumps can take a while to be written, so if extracting the info fails, try again later
        bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || (sleep 2m; bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || true)
        buildkite-agent artifact upload "$core".txt
    done
    # can be huge, clean up
    rm -rf cores

    echo "Compressing parallel-workload-queries.log"
    bin/ci-builder run "$builder" zstd --rm parallel-workload-queries.log || true

    mapfile -t artifacts < <(printf "run.log\nservices.log\njournalctl-merge.log\nnetstat-ant.log\nnetstat-panelot.log\nps-aux.log\ndocker-inspect.log\n"; find . -name 'junit_*.xml'; find mz_debug_* -name '*.log')
    # buildkite-agent takes a single ";"-separated list of upload paths
    artifacts_str=$(IFS=";"; echo "${artifacts[*]}")

    echo "--- Running trufflehog to scan artifacts for secrets & uploading artifacts"
    {
        bin/ci-builder run "$builder" trufflehog --no-update --no-verification --json --exclude-detectors=coda,dockerhub,box,npmtoken,github,snykkey,eightxeight,sumologickey,miro,fmfw,logzio,qase filesystem "${artifacts[@]}" | trufflehog_jq_filter_logs > trufflehog.log
    } &

    unset CI_EXTRA_ARGS # We don't want extra args for the annotation
    # Continue even if ci-annotate-errors fails
    CI_ANNOTATE_ERRORS_RESULT=0
    # We have to upload artifacts before ci-annotate-errors, so that the annotations can link to the artifacts
    {
        buildkite-agent artifact upload "$artifacts_str" || true
    } &
    # Barrier for the two background jobs above (secret scan and upload)
    wait

    echo "--- Annotating errors"
    bin/ci-builder run "$builder" bin/ci-annotate-errors --test-cmd="$TEST_CMD" --test-desc="$TEST_DESC" --test-result="$TEST_RESULT" "${artifacts[@]}" trufflehog.log > ci-annotate-errors.log || CI_ANNOTATE_ERRORS_RESULT=$?
    buildkite-agent artifact upload "ci-annotate-errors.log"

    # Exports lcov coverage for the binary at $1 from this job's merged
    # profdata into coverage/<job id>-<binary name>.lcov.
    export_cov() {
        bin/ci-builder run stable rust-cov export \
          --ignore-filename-regex=.cargo/ \
          --ignore-filename-regex=target/release/ \
          --ignore-filename-regex=/cargo/ \
          --ignore-filename-regex=/mnt/build/ \
          --ignore-filename-regex=/rustc/ \
          --format=lcov "$1" --instr-profile=coverage/"$BUILDKITE_JOB_ID".profdata src/ \
          > coverage/"$BUILDKITE_JOB_ID"-"$(basename "$1")".lcov
    }

    if [ -n "${CI_COVERAGE_ENABLED:-}" ] && [ -z "${BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE:-}" ]; then
        echo "Generating coverage information"
        if [ -n "$(find . -name '*.profraw')" ]; then
            # Workaround for "invalid instrumentation profile data (file header is corrupt)"
            rm -rf profraws
            mkdir profraws
            find . -name '*.profraw' | while read -r i; do
                cp "$i" profraws
                rm "$i"
                # Drop any profraw that rust-profdata cannot read
                bin/ci-builder run stable rust-profdata show profraws/"$(basename "$i")" > /dev/null || rm profraws/"$(basename "$i")"
            done
            find profraws -name '*.profraw' -exec bin/ci-builder run stable rust-profdata merge -sparse -o coverage/"$BUILDKITE_JOB_ID".profdata {} +
            find . -name '*.profraw' -delete

            ARGS=()
            for program in clusterd environmentd balancerd sqllogictest testdrive; do
                if [ -f coverage/"$program" ]; then
                    export_cov coverage/"$program"
                    ARGS+=("-a" coverage/"$BUILDKITE_JOB_ID"-"$program".lcov)
                fi
            done
            rm coverage/"$BUILDKITE_JOB_ID".profdata
            if [ "${#ARGS[@]}" != 0 ]; then
                bin/ci-builder run stable lcov "${ARGS[@]}" -o coverage/"$BUILDKITE_JOB_ID".lcov
                rm coverage/"$BUILDKITE_JOB_ID"-*.lcov
                bin/ci-builder run stable zstd coverage/"$BUILDKITE_JOB_ID".lcov
                buildkite-agent artifact upload coverage/"$BUILDKITE_JOB_ID".lcov.zst
            fi
        fi
    fi

    ci_unimportant_heading ":docker: Cleaning up after mzcompose"

    # docker-compose kill may fail attempting to kill containers
    # that have just exited on their own because of the
    # "shared-fate" mechanism employed by Mz clusters
    sudo systemctl restart docker
    killall -9 clusterd || true # There might be remaining processes from a cargo-test run
    if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
        find . -name '*.profraw' -delete # Remove remaining profraw files from coverage runs
    fi

    ci_collapsed_heading ":docker: Purging all existing docker containers and volumes, regardless of origin"
    docker ps --all --quiet | xargs --no-run-if-empty docker rm --force --volumes

    # For the Terraform steps, run the matching workflow once more with setup,
    # test and mz-debug disabled; a failure here fails the step.
    if [ "$BUILDKITE_STEP_KEY" = "terraform-aws" ]; then
        mzcompose run aws-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    elif [ "$BUILDKITE_STEP_KEY" = "terraform-aws-upgrade" ]; then
        mzcompose run aws-upgrade --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    elif [ "$BUILDKITE_STEP_KEY" = "terraform-gcp" ]; then
        mzcompose run gcp-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    elif [ "$BUILDKITE_STEP_KEY" = "terraform-azure" ]; then
        mzcompose run azure-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    fi
    rm -rf ~/.kube # Remove potential state from E2E Terraform tests

    echo "Removing mz-debug files"
    find . -maxdepth 1 -type d -name 'mz_debug*' -exec rm -r {} +

    # An empty services.log fails the step, except for the labels listed here
    # and any label matching "Terraform .*".
    if [ ! -s services.log ] \
        && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of persist" ] \
        && [ "$BUILDKITE_LABEL" != "Long single-node Maelstrom coverage of persist" ] \
        && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of txn-wal" ] \
        && [ "$BUILDKITE_LABEL" != "Mz E2E Test" ] \
        && [ "$BUILDKITE_LABEL" != "Output consistency (version for DFR)" ] \
        && [ "$BUILDKITE_LABEL" != "Output consistency (version for CTF)" ] \
        && [ "$BUILDKITE_LABEL" != "QA Canary Environment Base Load" ] \
        && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Canary Environment" ] \
        && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] \
        && [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]]; then
        echo "+++ services.log is empty, failing"
        exit 1
    fi
    rm -f services.log

    exit "$CI_ANNOTATE_ERRORS_RESULT"
}

# Initialise before installing the trap: cleanup reads TEST_RESULT, and with
# `set -u` an early signal would otherwise hit an unbound variable.
TEST_RESULT=0

trap cleanup EXIT SIGTERM SIGINT

# sed command to filter out ANSI command codes in run.log, while keeping them in Buildkite's view
{ mzcompose run "${run_args[@]}" |& tee >(sed -r "s/\x1B\[[0-9;]*[A-Za-z]//g" > run.log); } || TEST_RESULT=$?
if [ "$TEST_RESULT" != "0" ]; then
    # Give the logs some time to log panics, otherwise they might be missing later
    sleep 10
fi