#!/usr/bin/env bash

# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Shared shell helpers. The path is relative to the current working directory.
# Helpers used below but not defined in this file (read_list, is_truthy, the
# ci_*_heading functions, ...) presumably come from here.
source misc/shlib/shlib.bash

# ci-builder flavor passed to `bin/ci-builder run`; defaults to "min" unless the
# plugin configuration overrides it.
builder="${BUILDKITE_PLUGIN_MZCOMPOSE_CI_BUILDER:-min}"

# Run bin/mzcompose for the configured composition through the CI builder.
#
# stdbuf switches stdout and stderr to line buffering.
#
# Globals:   builder, BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION (read)
# Arguments: forwarded verbatim to bin/mzcompose
# Returns:   exit status of the wrapped command
mzcompose() {
    local -a wrapped_cmd=(
        stdbuf --output=L --error=L
        bin/ci-builder run "$builder"
        bin/mzcompose --find "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION"
    )
    "${wrapped_cmd[@]}" "$@"
}

# Run a command under `script`, with the typescript discarded to /dev/null.
#
# Each argument is shell-quoted with %q so the command line survives being
# passed to `script -c` as a single string.
#
# Arguments: the command and its arguments
# Returns:   exit status of `script`
faketty() {
    local quoted_cmd
    printf -v quoted_cmd "%q " "$@"
    script -qfc "$quoted_cmd" /dev/null
}

# Service (or workflow) to run; "default" unless the plugin configuration names
# one. It is always the first argument handed to `mzcompose run`.
service="${BUILDKITE_PLUGIN_MZCOMPOSE_RUN:-default}"
run_args=("$service")

# read_list is not defined in this file; on success it appears to leave the
# parsed plugin arguments in the `result` array, which are appended in order.
if read_list BUILDKITE_PLUGIN_MZCOMPOSE_ARGS; then
    run_args+=("${result[@]}")
fi

# Wall-clock start of this step. The cleanup handler uses it to compute the
# elapsed time and as the lower bound for journalctl output.
STEP_START_TIMESTAMP="$(date '+%Y-%m-%d %H:%M:%S')"

# Clean up cores here so that just killed processes' core files are ignored
cores="${HOME}/cores"
stale_paths=("$cores" parallel-workload-queries.log parallel-workload-queries.log.zst)
rm -rf "${stale_paths[@]}"
mkdir -m 777 "$cores"

# The core pattern is limited to 128 characters, so $PWD is deliberately not
# used in it. %s hands the signal number to the filter script (presumably so it
# can ignore SIGABRT -- see ci-filter-core.sh); %E is the executable path with
# "/" replaced by "!" and %t is the dump time.
core_pattern="|/usr/bin/ci-filter-core.sh %s ${cores}/core.%E.%t"
sudo sysctl -w kernel.core_pattern="$core_pattern"

# 3 attempts to download the ci-builder; the script aborts if all of them fail
bin/ci-builder run "$builder" echo \
    || bin/ci-builder run "$builder" echo \
    || bin/ci-builder run "$builder" echo

# Dependencies are started under their own collapsed heading so that the main
# heading stays readable. This is skipped when "$service" names a workflow,
# because a workflow does its own dependency management.
#
# Do not switch to `grep -q` here: it exits on the first match, possibly before
# mzcompose has finished writing, and mzcompose then dies with
# `write /dev/stdout: broken pipe`. With `pipefail` active that failure would be
# taken as "not a workflow", and bringing the workflow up as a service fails
# with `no such service: default`.
if mzcompose --mz-quiet list-workflows | grep "$service" >/dev/null; then
    : # "$service" is a workflow; nothing to start here
else
    ci_collapsed_heading ":docker: Starting dependencies"
    mzcompose up -d --scale "$service=0" "$service"
fi

# For coverage runs, create the containers and copy the instrumented binaries
# out of them into coverage/ so the cleanup handler can export coverage later.
if [[ -n "${CI_COVERAGE_ENABLED:-}" ]]; then
    ci_uncollapsed_heading ":docker: Fetching binaries for coverage"
    mzcompose create
    mkdir -p coverage/
    chmod 777 coverage/
    # Not all tests contain all of these containers, so every copy is
    # best-effort. The order matters: a later copy overwrites an earlier file
    # of the same name (clusterd).
    for coverage_src in \
        sqllogictest:/usr/local/bin/sqllogictest \
        sqllogictest:/usr/local/bin/clusterd \
        materialized:/usr/local/bin/environmentd \
        materialized:/usr/local/bin/clusterd \
        testdrive:/usr/local/bin/testdrive \
        balancerd:/usr/local/bin/balancerd
    do
        mzcompose --mz-quiet cp "$coverage_src" coverage/ || true
    done
fi

# When heap profiling is requested, start a background loop that invokes the
# heap profile uploader for this composition after every 5 second pause.
if is_truthy "${CI_HEAP_PROFILES:-}"; then
    while :; do
        sleep 5
        # faketty because otherwise docker will complain about not being inside
        # of a TTY when run in a background job
        faketty bin/ci-builder run stable bin/ci-upload-heap-profiles "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION"
    done &
fi

# Extra arguments for this step: CI_EXTRA_ARGS is treated as a JSON object keyed
# by step key; a missing entry yields the empty string.
EXTRA_ARGS=$(echo "${CI_EXTRA_ARGS:-}" | jq -r ".[\"$BUILDKITE_STEP_KEY\"] // \"\"")

# Environment assignments placed in front of the command line in TEST_CMD.
test_env_prefix=""
if [ "${BUILDKITE_PARALLEL_JOB_COUNT:-1}" -gt 1 ]; then
    test_env_prefix+="BUILDKITE_PARALLEL_JOB=$BUILDKITE_PARALLEL_JOB BUILDKITE_PARALLEL_JOB_COUNT=$BUILDKITE_PARALLEL_JOB_COUNT "
fi
case "${CI_SYSTEM_PARAMETERS:-}" in
    random)
        # The seed falls back to the job id when none is given.
        test_env_prefix+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS CI_SYSTEM_PARAMETERS_SEED=${CI_SYSTEM_PARAMETERS_SEED:-$BUILDKITE_JOB_ID} "
        ;;
    minimal)
        test_env_prefix+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS "
        ;;
esac

# TEST_CMD is a display string: it is shown in the heading below and handed to
# ci-annotate-errors by the cleanup handler. It is not executed by this script.
TEST_CMD="${test_env_prefix}bin/mzcompose --find $BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION run ${run_args[*]} $EXTRA_ARGS"
TEST_DESC="$(mzcompose description)"

ci_uncollapsed_heading ":docker: Running \`$TEST_CMD\`"
echo "$TEST_DESC"

# Post-run handler, installed with `trap` for EXIT, SIGTERM and SIGINT.
#
# Collects diagnostics (docker inspect/stats, service logs, journal, netstat,
# ps), extracts backtraces from core dumps, scans and uploads artifacts,
# annotates errors, optionally exports coverage data, and finally tears down
# all docker state.
#
# Globals read: STEP_START_TIMESTAMP, BUILDKITE_TIMEOUT, BUILDKITE_LABEL,
#   BUILDKITE_STEP_KEY, BUILDKITE_JOB_ID, CI_COVERAGE_ENABLED,
#   BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE, cores, builder, TEST_CMD,
#   TEST_DESC, TEST_RESULT.
# Exits with: 1 if services.log is empty for a label that is not exempted
#   below; otherwise the ci-annotate-errors status, forced to 1 when a
#   Terraform teardown workflow fails.
cleanup() {
  # Run at most once. This function ends in `exit`, and an `exit` issued from a
  # signal trap fires the EXIT trap as well, which would otherwise run the whole
  # cleanup a second time against already-removed state. The guard also turns a
  # second signal arriving mid-cleanup into a no-op.
  if [ -n "${cleanup_started:-}" ]; then
    return 0
  fi
  cleanup_started=1

  # Buildkite exposes no way to check if a test timed out (and wasn't cancelled manually), so we have to calculate it ourselves
  START_TIME=$(date -d "$STEP_START_TIMESTAMP" +%s)
  END_TIME=$(date +%s)
  ELAPSED=$((END_TIME - START_TIME))
  # BUILDKITE_TIMEOUT is in minutes. Buildkite documents it as `false` when no
  # timeout is configured; skip the check for anything non-numeric instead of
  # letting the arithmetic abort the cleanup under `set -u`.
  if [[ "${BUILDKITE_TIMEOUT:-}" =~ ^[0-9]+$ ]] && [ "$ELAPSED" -ge $((BUILDKITE_TIMEOUT * 60)) ]; then
    printf "\n%s" "$BUILDKITE_LABEL: test timed out" >> run.log
  fi

  echo "--- Post command steps"
  # Run before potential "run down" in coverage
  # Env, Cmd, Entrypoint and Args are replaced with a placeholder before the
  # inspect output is stored (presumably because they can contain secrets).
  docker ps --all --quiet | xargs --no-run-if-empty docker inspect | jq '
    .[]
    | .Config.Env = ["[REDACTED]"]
    | .Config.Cmd = ["[REDACTED]"]
    | .Config.Entrypoint = ["[REDACTED]"]
    | .Args = ["[REDACTED]"]' > docker-inspect.log
  # services.log might already exist and contain logs from before composition was downed
  time=0
  if [ -f services.log ]; then
      # Don't capture log lines we received already
      time=$(date +%s -r services.log)
  fi
  mzcompose logs --no-color --timestamps --since "$time" >> services.log
  # Sort services.log and remove the timestamps we added to prevent having duplicate timestamps in output. For reference:
  # https://github.com/moby/moby/issues/33673
  # https://github.com/moby/moby/issues/31706
  sort -t"|" -k2 < services.log | sed -E "s/ \| [0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]\:[0-5][0-9]:[0-6][0-9]\.[0-9]{9}Z / \| /" > services-sorted.log
  mv services-sorted.log services.log
  # shellcheck disable=SC2024
  sudo journalctl --merge --since "$STEP_START_TIMESTAMP" > journalctl-merge.log
  netstat -ant > netstat-ant.log
  netstat -panelot > netstat-panelot.log
  # Tokens containing "mzp_" are redacted from the process list.
  ps aux | sed -E "s/\S*mzp_\S*/[REDACTED]/g" > ps-aux.log
  docker stats --all --no-stream > docker-stats.log

  mv "$cores" . || true

  # Not `grep -q`: it exits on the first match, and under `pipefail` a `find`
  # killed by the resulting SIGPIPE would make this test report "no cores".
  if find cores -name 'core.*' | grep . > /dev/null; then
      # Best effort attempt to fetch interesting executables to get backtrace of core files
      mzcompose cp sqllogictest:/usr/local/bin/sqllogictest cores/ || true
      mzcompose cp sqllogictest:/usr/local/bin/clusterd cores/ || true
      mzcompose cp materialized:/usr/local/bin/environmentd cores/ || true
      mzcompose cp materialized:/usr/local/bin/clusterd cores/ || true
      mzcompose cp materialized:/usr/local/bin/materialized cores/ || true
      mzcompose cp balancerd:/usr/local/bin/balancerd cores/ || true
      mzcompose cp testdrive:/usr/local/bin/testdrive cores/ || true
  fi

  echo "Downing docker containers"
  mzcompose down --volumes || true  # Ignore failures, we still want the rest of the cleanup

  echo "Finding core files"
  find cores -name 'core.*' | while read -r core; do
      # Recover the executable name from core.<%E>.<%t>: drop the prefix and the
      # timestamp, then everything up to the last "!" (the encoded directory part).
      exe=$(echo "$core" | sed -e "s/core\.\(.*\)\.[0-9]*/\1/" -e "s/.*\!//")
      # Core dumps can take a while to be written, so if extracting the info fails, try again later
      bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || (sleep 2m; bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || true)
      buildkite-agent artifact upload "$core".txt
  done
  # can be huge, clean up
  rm -rf cores

  echo "Compressing parallel-workload-queries.log"
  bin/ci-builder run "$builder" zstd --rm parallel-workload-queries.log || true

  # Fixed set of logs plus any junit reports and mz_debug logs that exist.
  mapfile -t artifacts < <(printf "run.log\nservices.log\njournalctl-merge.log\nnetstat-ant.log\nnetstat-panelot.log\nps-aux.log\ndocker-inspect.log\n"; find . -name 'junit_*.xml'; find mz_debug_* -name '*.log')
  # The same list joined with ";" for a single `buildkite-agent artifact upload`.
  artifacts_str=$(IFS=";"; echo "${artifacts[*]}")

  echo "--- Running trufflehog to scan artifacts for secrets & uploading artifacts"
  {
    bin/ci-builder run "$builder" trufflehog --no-update --no-verification --json --exclude-detectors=coda,dockerhub,box,npmtoken,github,snykkey,eightxeight,sumologickey,miro,fmfw,logzio,qase filesystem "${artifacts[@]}" | trufflehog_jq_filter_logs > trufflehog.log
  } &
  local trufflehog_pid=$!

  unset CI_EXTRA_ARGS # We don't want extra args for the annotation
  # Continue even if ci-annotate-errors fails
  CI_ANNOTATE_ERRORS_RESULT=0
  # We have to upload artifacts before ci-annotate-errors, so that the annotations can link to the artifacts
  {
    buildkite-agent artifact upload "$artifacts_str" || true
  } &
  local upload_pid=$!
  # Wait only for the two jobs started above. A bare `wait` would also block on
  # every other background job of this shell, such as the heap profile uploader
  # loop, which does not finish on its own. As with a bare `wait`, a failing job
  # must not abort the cleanup.
  wait "$trufflehog_pid" "$upload_pid" || true
  echo "--- Annotating errors"
  bin/ci-builder run "$builder" bin/ci-annotate-errors --test-cmd="$TEST_CMD" --test-desc="$TEST_DESC" --test-result="$TEST_RESULT" "${artifacts[@]}" trufflehog.log > ci-annotate-errors.log || CI_ANNOTATE_ERRORS_RESULT=$?
  buildkite-agent artifact upload "ci-annotate-errors.log"

  # Export lcov coverage for the binary given as $1, using this job's merged
  # profile data; the result is written to coverage/<job id>-<binary name>.lcov.
  export_cov() {
      bin/ci-builder run stable rust-cov export \
        --ignore-filename-regex=.cargo/ \
        --ignore-filename-regex=target/release/ \
        --ignore-filename-regex=/cargo/ \
        --ignore-filename-regex=/mnt/build/ \
        --ignore-filename-regex=/rustc/ \
        --format=lcov "$1" --instr-profile=coverage/"$BUILDKITE_JOB_ID".profdata src/ \
        > coverage/"$BUILDKITE_JOB_ID"-"$(basename "$1")".lcov
  }

  if [ -n "${CI_COVERAGE_ENABLED:-}" ] && [ -z "${BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE:-}" ];  then
      echo "Generating coverage information"
      if [ -n "$(find . -name '*.profraw')" ]; then
          # Workaround for "invalid instrumentation profile data (file header is corrupt)"
          # Move every profraw into profraws/ and drop those rust-profdata cannot read.
          rm -rf profraws
          mkdir profraws
          find . -name '*.profraw' | while read -r i; do
              cp "$i" profraws
              rm "$i"
              bin/ci-builder run stable rust-profdata show profraws/"$(basename "$i")" > /dev/null || rm profraws/"$(basename "$i")"
          done
          find profraws -name '*.profraw' -exec bin/ci-builder run stable rust-profdata merge -sparse -o coverage/"$BUILDKITE_JOB_ID".profdata {} +
          find . -name '*.profraw' -delete

          # One lcov file per binary that was fetched into coverage/; ARGS
          # collects the matching `-a <file>` pairs for the merge below.
          ARGS=()
          for program in clusterd environmentd balancerd sqllogictest testdrive; do
              if [ -f coverage/"$program" ]; then
                  export_cov coverage/"$program"
                  ARGS+=("-a" coverage/"$BUILDKITE_JOB_ID"-"$program".lcov)
              fi
          done
          rm coverage/"$BUILDKITE_JOB_ID".profdata
          if [ "${#ARGS[@]}" != 0 ]; then
              bin/ci-builder run stable lcov "${ARGS[@]}" -o coverage/"$BUILDKITE_JOB_ID".lcov
              rm coverage/"$BUILDKITE_JOB_ID"-*.lcov
              bin/ci-builder run stable zstd coverage/"$BUILDKITE_JOB_ID".lcov
              buildkite-agent artifact upload coverage/"$BUILDKITE_JOB_ID".lcov.zst
          fi
      fi
  fi

  ci_unimportant_heading ":docker: Cleaning up after mzcompose"

  # docker-compose kill may fail attempting to kill containers
  # that have just exited on their own because of the
  # "shared-fate" mechanism employed by Mz clusters
  sudo systemctl restart docker
  killall -9 clusterd || true # There might be remaining processes from a cargo-test run
  if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
    find . -name '*.profraw' -delete # Remove remaining profraw files from coverage runs
  fi

  ci_collapsed_heading ":docker: Purging all existing docker containers and volumes, regardless of origin"
  docker ps --all --quiet | xargs --no-run-if-empty docker rm --force --volumes

  # Terraform steps run their workflow once more with setup, test and mz-debug
  # skipped; a failure there fails the step.
  if [ "$BUILDKITE_STEP_KEY" = "terraform-aws" ]; then
    mzcompose run aws-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
  elif [ "$BUILDKITE_STEP_KEY" = "terraform-aws-upgrade" ]; then
    mzcompose run aws-upgrade --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
  elif [ "$BUILDKITE_STEP_KEY" = "terraform-gcp" ]; then
    mzcompose run gcp-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
  elif [ "$BUILDKITE_STEP_KEY" = "terraform-azure" ]; then
    mzcompose run azure-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
  fi
  rm -rf ~/.kube # Remove potential state from E2E Terraform tests

  echo "Removing mz-debug files"
  find . -maxdepth 1 -type d -name 'mz_debug*' -exec rm -r {} +

  # An empty services.log fails the step, except for the labels listed here.
  if [ ! -s services.log ] && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of persist" ] && [ "$BUILDKITE_LABEL" != "Long single-node Maelstrom coverage of persist" ] && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of txn-wal" ] && [ "$BUILDKITE_LABEL" != "Mz E2E Test" ] && [ "$BUILDKITE_LABEL" != "Output consistency (version for DFR)" ] && [ "$BUILDKITE_LABEL" != "Output consistency (version for CTF)" ] && [ "$BUILDKITE_LABEL" != "QA Canary Environment Base Load" ] && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Canary Environment" ] && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] && [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]]; then
      echo "+++ services.log is empty, failing"
      exit 1
  fi
  rm -f services.log

  exit "$CI_ANNOTATE_ERRORS_RESULT"
}

# Run the cleanup handler when the script exits and on SIGTERM / SIGINT.
trap cleanup EXIT SIGTERM SIGINT

TEST_RESULT=0
# Buildkite's view keeps the ANSI escape sequences; the copy that tee feeds
# through sed into run.log has them filtered out. A failing run is recorded in
# TEST_RESULT rather than aborting the script.
{
    mzcompose run "${run_args[@]}" 2>&1 \
        | tee >(sed -r "s/\x1B\[[0-9;]*[A-Za-z]//g" > run.log)
} || TEST_RESULT=$?

# Give the logs some time to log panics, otherwise they might be missing later
[ "$TEST_RESULT" = "0" ] || sleep 10