#!/usr/bin/env bash
# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.
  # Abort on the first failing command, on any use of an unset variable, and
  # when any stage of a pipeline fails.
  set -euo pipefail

  # Shared shell helpers. The path is relative, so this script has to be run
  # from the directory that contains misc/ (presumably the repository root).
  # NOTE(review): read_list, is_truthy, the ci_*_heading helpers and
  # trufflehog_jq_filter_logs used further down are not defined in this file;
  # presumably they come from this library -- confirm.
  . misc/shlib/shlib.bash

  # ci-builder flavor used for every `bin/ci-builder run "$builder" ...` call;
  # "min" unless the plugin configuration overrides it.
  builder=${BUILDKITE_PLUGIN_MZCOMPOSE_CI_BUILDER:-min}
  #######################################
  # Run bin/mzcompose through bin/ci-builder for the composition configured
  # for this step.
  # Globals:   builder (read), BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION (read)
  # Arguments: forwarded verbatim to bin/mzcompose after `--find <composition>`
  # Outputs:   whatever the wrapped command prints; stdbuf forces line
  #            buffering on both stdout and stderr
  # Returns:   exit status of the wrapped command
  #######################################
  mzcompose() {
    stdbuf --output=L --error=L \
      bin/ci-builder run "$builder" \
      bin/mzcompose --find "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION" "$@"
  }
  #######################################
  # Run a command under script(1), discarding the typescript to /dev/null.
  # The caller in this file uses it so that docker does not complain about a
  # missing TTY when started from a background job.
  # Arguments: the command and its arguments; each one is shell-quoted with
  #            printf %q so it survives being passed to `script -c` as a
  #            single command string
  # Returns:   exit status of script(1)
  #######################################
  faketty() {
    local quoted_cmd
    quoted_cmd=$(printf '%q ' "$@")
    script -qfc "$quoted_cmd" /dev/null
  }
  # Service or workflow to run; "default" unless the plugin configuration
  # names one. It is always the first argument of `mzcompose run`.
  service=${BUILDKITE_PLUGIN_MZCOMPOSE_RUN:-default}
  run_args=("$service")
  # On success read_list leaves the parsed values in the `result` array, which
  # is appended to the run arguments as-is.
  if read_list BUILDKITE_PLUGIN_MZCOMPOSE_ARGS; then
    run_args+=("${result[@]}")
  fi

  # Recorded once up front; used during cleanup to detect timeouts and to
  # bound the journalctl export.
  STEP_START_TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")

  # Clean up cores here so that just killed processes' core files are ignored
  cores="$HOME"/cores
  rm -rf "$cores" parallel-workload-queries.log parallel-workload-queries.log.zst
  mkdir -m 777 "$cores"
  # Max 128 characters, so don't use $PWD which will make it too long
  # Ignore SIGABRT
  # (%s passes the signal number to the filter script, which presumably does
  # the ignoring -- the script itself is not part of this file.)
  sudo sysctl -w kernel.core_pattern="|/usr/bin/ci-filter-core.sh %s $cores/core.%E.%t"

  # 3 attempts to download the ci-builder
  bin/ci-builder run "$builder" echo || bin/ci-builder run "$builder" echo || bin/ci-builder run "$builder" echo

  # Start dependencies under a different heading so that the main heading is less
  # noisy. But not if the service is actually a workflow, in which case it will
  # do its own dependency management.
  # Don't use `grep -q`! It will stop the `grep` process before mzcompose might
  # be finished, thus mzcompose can fail with `write /dev/stdout: broken pipe`.
  # Since we have `pipefail` set in this script, this would lead to a failure and
  # we would attempt to bring up the workflow, which will fail with `no such
  # service: default`.
  if ! mzcompose --mz-quiet list-workflows | grep "$service" > /dev/null; then
    ci_collapsed_heading ":docker: Starting dependencies"
    mzcompose up -d --scale "$service=0" "$service"
  fi
  # Coverage runs: copy the instrumented binaries out of the containers into
  # coverage/ so that the coverage export during cleanup can find them.
  if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
    ci_uncollapsed_heading ":docker: Fetching binaries for coverage"
    mzcompose create
    mkdir -p coverage/
    chmod 777 coverage/
    # Not all tests contain all of these containers:
    mzcompose --mz-quiet cp sqllogictest:/usr/local/bin/sqllogictest coverage/ || true
    mzcompose --mz-quiet cp sqllogictest:/usr/local/bin/clusterd coverage/ || true
    mzcompose --mz-quiet cp materialized:/usr/local/bin/environmentd coverage/ || true
    mzcompose --mz-quiet cp materialized:/usr/local/bin/clusterd coverage/ || true
    mzcompose --mz-quiet cp testdrive:/usr/local/bin/testdrive coverage/ || true
    mzcompose --mz-quiet cp balancerd:/usr/local/bin/balancerd coverage/ || true
  fi

  # Heap profiling: a background subshell that runs the heap-profile uploader
  # every 5 seconds. The loop has no exit condition of its own.
  if is_truthy "${CI_HEAP_PROFILES:-}"; then
    (
      while true; do
        sleep 5
        # faketty because otherwise docker will complain about not being inside
        # of a TTY when run in a background job
        faketty bin/ci-builder run stable bin/ci-upload-heap-profiles "$BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION"
      done
    ) &
  fi
  # Per-step extra arguments: CI_EXTRA_ARGS is treated as a JSON object keyed
  # by Buildkite step key; a missing key yields the empty string. The key is
  # handed to jq with --arg instead of being spliced into the program text, so
  # a step key containing quotes or backslashes cannot break or alter the
  # filter.
  EXTRA_ARGS=$(jq -r --arg key "$BUILDKITE_STEP_KEY" '.[$key] // ""' <<< "${CI_EXTRA_ARGS:-}")

  # TEST_CMD is a human-readable reproduction command line. In this file it is
  # only shown in the heading below and passed to ci-annotate-errors; the
  # actual run uses the run_args array directly.
  TEST_CMD=""
  if [ "${BUILDKITE_PARALLEL_JOB_COUNT:-1}" -gt 1 ]; then
    TEST_CMD+="BUILDKITE_PARALLEL_JOB=$BUILDKITE_PARALLEL_JOB BUILDKITE_PARALLEL_JOB_COUNT=$BUILDKITE_PARALLEL_JOB_COUNT "
  fi
  if [ "${CI_SYSTEM_PARAMETERS:-}" = "random" ]; then
    # Without an explicit seed the job id is shown as the seed.
    TEST_CMD+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS CI_SYSTEM_PARAMETERS_SEED=${CI_SYSTEM_PARAMETERS_SEED:-$BUILDKITE_JOB_ID} "
  elif [ "${CI_SYSTEM_PARAMETERS:-}" = "minimal" ]; then
    TEST_CMD+="CI_SYSTEM_PARAMETERS=$CI_SYSTEM_PARAMETERS "
  fi
  TEST_CMD+="bin/mzcompose --find $BUILDKITE_PLUGIN_MZCOMPOSE_COMPOSITION run ${run_args[*]} $EXTRA_ARGS"

  TEST_DESC="$(mzcompose description)"

  ci_uncollapsed_heading ":docker: Running \`$TEST_CMD\`"
  echo "$TEST_DESC"
  #######################################
  # Post-run handler, installed as the trap for EXIT, SIGTERM and SIGINT.
  # Collects diagnostics (container metadata, service logs, journal, network
  # and process state, core-dump backtraces), scans and uploads artifacts,
  # annotates errors, optionally exports coverage, and finally tears down all
  # docker state on the machine.
  # Globals read: STEP_START_TIMESTAMP, BUILDKITE_TIMEOUT, BUILDKITE_LABEL,
  #               BUILDKITE_STEP_KEY, BUILDKITE_JOB_ID, builder, cores,
  #               TEST_CMD, TEST_DESC, TEST_RESULT, CI_COVERAGE_ENABLED,
  #               BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE
  # Globals written: CI_ANNOTATE_ERRORS_RESULT; CI_EXTRA_ARGS is unset
  # Exits with: the status of ci-annotate-errors (forced to 1 when a terraform
  #             teardown run fails), or 1 when services.log is empty for a
  #             step label that is not on the exemption list.
  #######################################
  cleanup() {
    # This function ends in `exit`. When it was entered through the SIGTERM or
    # SIGINT trap, that exit would fire the EXIT trap and run the whole cleanup
    # a second time, against state that has already been torn down. Disarm the
    # EXIT trap first.
    trap - EXIT

    # Buildkite exposes no way to check if a test timed out (and wasn't cancelled manually), so we have to calculate it ourselves
    START_TIME=$(date -d "$STEP_START_TIMESTAMP" +%s)
    END_TIME=$(date +%s)
    ELAPSED=$((END_TIME - START_TIME))
    # BUILDKITE_TIMEOUT is only numeric when a timeout is configured (Buildkite
    # documents the value `false` otherwise). A non-numeric value inside $(( ))
    # would abort cleanup under `set -u`, so check the shape first. `10#` keeps
    # a value with leading zeros from being read as octal.
    if [[ "${BUILDKITE_TIMEOUT:-}" =~ ^[0-9]+$ ]] && [ "$ELAPSED" -ge $((10#$BUILDKITE_TIMEOUT * 60)) ]; then
      printf "\n%s" "$BUILDKITE_LABEL: test timed out" >> run.log
    fi

    echo "--- Post command steps"
    # Run before potential "run down" in coverage
    # Env, Cmd, Entrypoint and Args are overwritten with a placeholder before
    # the inspect output is written to the log.
    docker ps --all --quiet | xargs --no-run-if-empty docker inspect | jq '
      .[]
      | .Config.Env = ["[REDACTED]"]
      | .Config.Cmd = ["[REDACTED]"]
      | .Config.Entrypoint = ["[REDACTED]"]
      | .Args = ["[REDACTED]"]' > docker-inspect.log

    # services.log might already exist and contain logs from before composition was downed
    time=0
    if [ -f services.log ]; then
      # Don't capture log lines we received already
      time=$(date +%s -r services.log)
    fi
    mzcompose logs --no-color --timestamps --since "$time" >> services.log
    # Sort services.log and remove the timestamps we added to prevent having duplicate timestamps in output. For reference:
    # https://github.com/moby/moby/issues/33673
    # https://github.com/moby/moby/issues/31706
    sort -t"|" -k2 < services.log | sed -E "s/ \| [0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]\:[0-5][0-9]:[0-6][0-9]\.[0-9]{9}Z / \| /" > services-sorted.log
    mv services-sorted.log services.log

    # shellcheck disable=SC2024
    sudo journalctl --merge --since "$STEP_START_TIMESTAMP" > journalctl-merge.log
    netstat -ant > netstat-ant.log
    netstat -panelot > netstat-panelot.log
    # Any whitespace-delimited token containing "mzp_" is masked in the
    # process listing.
    ps aux | sed -E "s/\S*mzp_\S*/[REDACTED]/g" > ps-aux.log
    docker stats --all --no-stream > docker-stats.log

    mv "$cores" . || true
    if find cores -name 'core.*' | grep -q .; then
      # Best effort attempt to fetch interesting executables to get backtrace of core files
      mzcompose cp sqllogictest:/usr/local/bin/sqllogictest cores/ || true
      mzcompose cp sqllogictest:/usr/local/bin/clusterd cores/ || true
      mzcompose cp materialized:/usr/local/bin/environmentd cores/ || true
      mzcompose cp materialized:/usr/local/bin/clusterd cores/ || true
      mzcompose cp materialized:/usr/local/bin/materialized cores/ || true
      mzcompose cp balancerd:/usr/local/bin/balancerd cores/ || true
      mzcompose cp testdrive:/usr/local/bin/testdrive cores/ || true
    fi

    echo "Downing docker containers"
    mzcompose down --volumes || true # Ignore failures, we still want the rest of the cleanup

    echo "Finding core files"
    find cores -name 'core.*' | while read -r core; do
      # Core files are named core.<%E>.<%t> by the core_pattern set at startup.
      # Strip the "core." prefix and the trailing timestamp, then everything up
      # to the last "!" (the kernel writes "/" in %E as "!"), leaving the
      # executable's base name.
      exe=$(echo "$core" | sed -e "s/core\.\(.*\)\.[0-9]*/\1/" -e "s/.*\!//")
      # Core dumps can take a while to be written, so if extracting the info fails, try again later
      bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || (sleep 2m; bin/ci-builder run "$builder" gdb --batch -ex "bt full" -ex "thread apply all bt" -ex "quit" cores/"$exe" "$core" > "$core".txt || true)
      buildkite-agent artifact upload "$core".txt
    done
    # can be huge, clean up
    rm -rf cores

    echo "Compressing parallel-workload-queries.log"
    bin/ci-builder run "$builder" zstd --rm parallel-workload-queries.log || true

    # Fixed log names first, then every junit report and mz-debug log found.
    mapfile -t artifacts < <(printf "run.log\nservices.log\njournalctl-merge.log\nnetstat-ant.log\nnetstat-panelot.log\nps-aux.log\ndocker-inspect.log\n"; find . -name 'junit_*.xml'; find mz_debug_* -name '*.log')
    # buildkite-agent takes one ";"-separated string of upload paths.
    artifacts_str=$(IFS=";"; echo "${artifacts[*]}")

    echo "--- Running trufflehog to scan artifacts for secrets & uploading artifacts"
    {
      bin/ci-builder run "$builder" trufflehog --no-update --no-verification --json --exclude-detectors=coda,dockerhub,box,npmtoken,github,snykkey,eightxeight,sumologickey,miro,fmfw,logzio,qase filesystem "${artifacts[@]}" | trufflehog_jq_filter_logs > trufflehog.log
    } &
    trufflehog_pid=$!

    unset CI_EXTRA_ARGS # We don't want extra args for the annotation

    # Continue even if ci-annotate-errors fails
    CI_ANNOTATE_ERRORS_RESULT=0
    # We have to upload artifacts before ci-annotate-errors, so that the annotations can link to the artifacts
    {
      buildkite-agent artifact upload "$artifacts_str" || true
    } &
    upload_pid=$!

    # Wait only for the two jobs started above. A bare `wait` would also block
    # on every other background job of this script, such as the endless
    # heap-profile upload loop started when CI_HEAP_PROFILES is truthy. As with
    # a bare `wait`, a failure of either job does not abort the cleanup.
    wait "$trufflehog_pid" "$upload_pid" || true

    echo "--- Annotating errors"
    bin/ci-builder run "$builder" bin/ci-annotate-errors --test-cmd="$TEST_CMD" --test-desc="$TEST_DESC" --test-result="$TEST_RESULT" "${artifacts[@]}" trufflehog.log > ci-annotate-errors.log || CI_ANNOTATE_ERRORS_RESULT=$?
    buildkite-agent artifact upload "ci-annotate-errors.log"

    # Export lcov data for one instrumented binary.
    # Arguments: $1 - path to the binary
    # Outputs:   coverage/<job id>-<basename of $1>.lcov, generated from
    #            coverage/<job id>.profdata
    export_cov() {
      bin/ci-builder run stable rust-cov export \
        --ignore-filename-regex=.cargo/ \
        --ignore-filename-regex=target/release/ \
        --ignore-filename-regex=/cargo/ \
        --ignore-filename-regex=/mnt/build/ \
        --ignore-filename-regex=/rustc/ \
        --format=lcov "$1" --instr-profile=coverage/"$BUILDKITE_JOB_ID".profdata src/ \
        > coverage/"$BUILDKITE_JOB_ID"-"$(basename "$1")".lcov
    }

    if [ -n "${CI_COVERAGE_ENABLED:-}" ] && [ -z "${BUILDKITE_MZCOMPOSE_PLUGIN_SKIP_COVERAGE:-}" ]; then
      echo "Generating coverage information"
      if [ -n "$(find . -name '*.profraw')" ]; then
        # Workaround for "invalid instrumentation profile data (file header is corrupt)"
        # Each profile is moved into profraws/ and dropped again if
        # `rust-profdata show` cannot read it, so the merge only sees valid
        # files.
        rm -rf profraws
        mkdir profraws
        find . -name '*.profraw' | while read -r i; do
          cp "$i" profraws
          rm "$i"
          bin/ci-builder run stable rust-profdata show profraws/"$(basename "$i")" > /dev/null || rm profraws/"$(basename "$i")"
        done
        find profraws -name '*.profraw' -exec bin/ci-builder run stable rust-profdata merge -sparse -o coverage/"$BUILDKITE_JOB_ID".profdata {} +
        find . -name '*.profraw' -delete

        # Export per-binary lcov files for whichever binaries were copied into
        # coverage/ earlier, collecting them as `-a <file>` arguments for lcov.
        ARGS=()
        for program in clusterd environmentd balancerd sqllogictest testdrive; do
          if [ -f coverage/"$program" ]; then
            export_cov coverage/"$program"
            ARGS+=("-a" coverage/"$BUILDKITE_JOB_ID"-"$program".lcov)
          fi
        done
        rm coverage/"$BUILDKITE_JOB_ID".profdata
        if [ "${#ARGS[@]}" != 0 ]; then
          bin/ci-builder run stable lcov "${ARGS[@]}" -o coverage/"$BUILDKITE_JOB_ID".lcov
          rm coverage/"$BUILDKITE_JOB_ID"-*.lcov
          bin/ci-builder run stable zstd coverage/"$BUILDKITE_JOB_ID".lcov
          buildkite-agent artifact upload coverage/"$BUILDKITE_JOB_ID".lcov.zst
        fi
      fi
    fi

    ci_unimportant_heading ":docker: Cleaning up after mzcompose"
    # docker-compose kill may fail attempting to kill containers
    # that have just exited on their own because of the
    # "shared-fate" mechanism employed by Mz clusters
    sudo systemctl restart docker
    killall -9 clusterd || true # There might be remaining processes from a cargo-test run
    if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
      find . -name '*.profraw' -delete # Remove remaining profraw files from coverage runs
    fi

    ci_collapsed_heading ":docker: Purging all existing docker containers and volumes, regardless of origin"
    docker ps --all --quiet | xargs --no-run-if-empty docker rm --force --volumes

    # Terraform steps get one extra mzcompose run with setup and test disabled
    # (presumably the teardown of the cloud resources); a failure there fails
    # the step.
    if [ "$BUILDKITE_STEP_KEY" = "terraform-aws" ]; then
      mzcompose run aws-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    elif [ "$BUILDKITE_STEP_KEY" = "terraform-aws-upgrade" ]; then
      mzcompose run aws-upgrade --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    elif [ "$BUILDKITE_STEP_KEY" = "terraform-gcp" ]; then
      mzcompose run gcp-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    elif [ "$BUILDKITE_STEP_KEY" = "terraform-azure" ]; then
      mzcompose run azure-temporary --no-setup --no-test --no-run-mz-debug || CI_ANNOTATE_ERRORS_RESULT=1
    fi
    rm -rf ~/.kube # Remove potential state from E2E Terraform tests

    echo "Removing mz-debug files"
    find . -maxdepth 1 -type d -name 'mz_debug*' -exec rm -r {} +

    # An empty services.log fails the step, except for the labels listed here
    # and any label containing "Terraform ".
    if [ ! -s services.log ] &&
      [ "$BUILDKITE_LABEL" != "Maelstrom coverage of persist" ] &&
      [ "$BUILDKITE_LABEL" != "Long single-node Maelstrom coverage of persist" ] &&
      [ "$BUILDKITE_LABEL" != "Maelstrom coverage of txn-wal" ] &&
      [ "$BUILDKITE_LABEL" != "Mz E2E Test" ] &&
      [ "$BUILDKITE_LABEL" != "Output consistency (version for DFR)" ] &&
      [ "$BUILDKITE_LABEL" != "Output consistency (version for CTF)" ] &&
      [ "$BUILDKITE_LABEL" != "QA Canary Environment Base Load" ] &&
      [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Canary Environment" ] &&
      [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] &&
      [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]]; then
      echo "+++ services.log is empty, failing"
      exit 1
    fi
    rm -f services.log

    exit "$CI_ANNOTATE_ERRORS_RESULT"
  }
  # Initialised before the trap is armed: cleanup reads TEST_RESULT, and with
  # `set -u` an early signal must never find it unset.
  TEST_RESULT=0

  # cleanup runs on normal exit as well as on SIGTERM/SIGINT and decides the
  # final exit status of this script.
  trap cleanup EXIT SIGTERM SIGINT

  # sed command to filter out ANSI command codes in run.log, while keeping them in Buildkite's view
  { mzcompose run "${run_args[@]}" |& tee >(sed -r "s/\x1B\[[0-9;]*[A-Za-z]//g" > run.log); } || TEST_RESULT=$?
  if [ "$TEST_RESULT" != "0" ]; then
    # Give the logs some time to log panics, otherwise they might be missing later
    sleep 10
  fi