command 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. #!/usr/bin/env bash
  2. # Copyright Materialize, Inc. and contributors. All rights reserved.
  3. #
  4. # Use of this software is governed by the Business Source License
  5. # included in the LICENSE file at the root of this repository.
  6. #
  7. # As of the Change Date specified in that file, in accordance with
  8. # the Business Source License, use of this software will be governed
  9. # by the Apache License, Version 2.0.
  # Strict mode: abort on a failing command, on use of an unset variable, and
  # when any stage of a pipeline fails.
  set -o errexit -o nounset -o pipefail

  # Pull in the shared shell library and the cloudtest configuration.
  # NOTE(review): helpers used further down (read_list, ci_*_heading,
  # trufflehog_jq_filter_logs) and K8S_CONTEXT presumably come from these two
  # files -- confirm against their contents.
  source misc/shlib/shlib.bash
  source test/cloudtest/config.bash

  # Arguments passed to pytest. The JUnit report file name embeds the Buildkite
  # job ID.
  run_args=("--junitxml=junit_cloudtest_${BUILDKITE_JOB_ID}.xml")
  # kubectl: run kubectl through `bin/ci-builder run stable`, always targeting
  # the context named by $K8S_CONTEXT.
  # Arguments: forwarded verbatim to kubectl.
  # Returns:   the exit status of the wrapped command.
  kubectl() {
    local -a wrapped_cmd=(bin/ci-builder run stable kubectl "--context=${K8S_CONTEXT}")
    "${wrapped_cmd[@]}" "$@"
  }
  # export_cov: export lcov coverage data for one instrumented binary.
  # Globals:   BUILDKITE_JOB_ID (read) -- names the profdata input and the
  #            lcov output.
  # Arguments: $1 - path to the binary to export coverage for.
  # Outputs:   writes coverage/<job id>-<basename of $1>.lcov
  export_cov() {
    local profiled_binary=$1
    # Paths matching any of these regexes are left out of the report.
    local -a ignore_flags=(
      --ignore-filename-regex=.cargo/
      --ignore-filename-regex=target/release/
      --ignore-filename-regex=/cargo/
      --ignore-filename-regex=/mnt/build/
      --ignore-filename-regex=/rustc/
    )

    bin/ci-builder run stable rust-cov export "${ignore_flags[@]}" \
      --format=lcov "$profiled_binary" \
      "--instr-profile=coverage/${BUILDKITE_JOB_ID}.profdata" src/ \
      > "coverage/${BUILDKITE_JOB_ID}-$(basename "$profiled_binary").lcov"
  }
  # Tests are split across parallel Buildkite jobs unless the plugin arguments
  # contain --no-test-parallelism. Every other plugin argument is passed on to
  # pytest unchanged.
  test_parallelism=true
  if read_list BUILDKITE_PLUGIN_CLOUDTEST_ARGS; then
    for plugin_arg in "${result[@]}"; do
      case "$plugin_arg" in
        --no-test-parallelism) test_parallelism=false ;;
        *) run_args+=("$plugin_arg") ;;
      esac
    done
  fi

  if [[ "$test_parallelism" == true ]]; then
    # BUILDKITE_PARALLEL_JOB is used zero-based here; --group gets the
    # one-based value. Both variables fall back to a single, unsplit group.
    run_args+=("--splits=${BUILDKITE_PARALLEL_JOB_COUNT:-1}")
    run_args+=("--group=$((${BUILDKITE_PARALLEL_JOB:-0} + 1))")
  fi
  # Start of this step. cleanup() uses it to work out whether the step ran into
  # its timeout and as the --since bound of the journalctl export.
  STEP_START_TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

  ci_collapsed_heading "kind: Increase system limits..."
  for inotify_limit in max_user_watches=524288 max_user_instances=512; do
    sudo sysctl "fs.inotify.${inotify_limit}"
  done

  ci_collapsed_heading "kind: Make sure kind is running..."
  bin/ci-builder run stable test/cloudtest/setup

  # A cancelled build may not have cleaned up its cloudtest run properly, so
  # always force a reset before starting.
  ci_collapsed_heading "kind: Purging state from previous builds..."
  bin/ci-builder run stable test/cloudtest/reset
  rm -f kubectl-*.log

  # kail runs detached in a container named "kail"; cleanup() reads its logs
  # and stops it.
  ci_collapsed_heading "kail: Start a new instance"
  NO_COLOR=1 bin/ci-builder run stable --detach --name kail \
    kail --context "$K8S_CONTEXT" --log-level info

  # Printable form of the test command, used for the heading and for
  # ci-annotate-errors.
  TEST_CMD="bin/pytest ${run_args[*]}"
  ci_uncollapsed_heading "cloudtest: Running \`$TEST_CMD\`"

  # Exit status of the pytest run; stays 0 unless the run fails.
  TEST_RESULT=0
  # cleanup: post-command handler, installed with `trap` for EXIT, SIGTERM and
  # SIGINT.
  #
  # Steps, in order:
  #   1. Append a "test timed out" marker to run.log when the step ran for at
  #      least BUILDKITE_TIMEOUT minutes.
  #   2. When CI_COVERAGE_ENABLED is non-empty: copy binaries and coverage data
  #      out of the pods, build a merged lcov report and upload it.
  #   3. Remove mz_debug* directories, save the kail logs and stop kail.
  #   4. Collect kubectl / journalctl diagnostics, scan them with trufflehog and
  #      upload them as Buildkite artifacts.
  #   5. Run ci-annotate-errors, reset cloudtest, purge all docker containers.
  #
  # Globals:   STEP_START_TIMESTAMP, BUILDKITE_TIMEOUT, BUILDKITE_LABEL,
  #            BUILDKITE_JOB_ID, CI_COVERAGE_ENABLED, TEST_CMD, TEST_RESULT
  #            (read); CI_EXTRA_ARGS (unset); cleanup_started (written).
  # Arguments: none.
  # Returns:   does not return on its first invocation -- exits the script with
  #            the exit status of ci-annotate-errors. Later invocations return 0
  #            immediately.
  cleanup() {
    # The handler is registered for EXIT as well as for SIGTERM/SIGINT and ends
    # in `exit`, so after a signal it would be entered a second time through the
    # EXIT trap (and again for every further signal). Run the body only once.
    if [[ -n "${cleanup_started:-}" ]]; then
      return 0
    fi
    cleanup_started=1

    local start_time end_time elapsed pod program
    local artifacts_str ci_annotate_errors_result
    local -a lcov_args artifacts

    echo "--- Post command steps"

    # Buildkite exposes no way to check if a test timed out (and wasn't
    # cancelled manually), so we have to calculate it ourselves.
    start_time=$(date -d "$STEP_START_TIMESTAMP" +%s)
    end_time=$(date +%s)
    elapsed=$((end_time - start_time))
    # Only compare when BUILDKITE_TIMEOUT is a number: an unset value, or the
    # "false" Buildkite documents for jobs without a timeout, would otherwise
    # abort this handler under `set -u`.
    if [[ "${BUILDKITE_TIMEOUT:-}" =~ ^[0-9]+$ ]] && ((elapsed >= BUILDKITE_TIMEOUT * 60)); then
      printf "\n%s" "$BUILDKITE_LABEL: test timed out" >> run.log
    fi

    if [[ -n "${CI_COVERAGE_ENABLED:-}" ]]; then
      ci_uncollapsed_heading "cloudtest: Fetching binaries for coverage"
      mkdir -p coverage/
      chmod 777 coverage/
      kubectl cp environmentd-0:/usr/local/bin/environmentd coverage/environmentd
      kubectl cp environmentd-0:/coverage coverage/
      for pod in $(kubectl get pods -o name | grep -E 'cluster-'); do
        kubectl cp "$pod":/coverage coverage/ || true # Could get deleted
        kubectl cp "$pod":/usr/local/bin/clusterd coverage/clusterd || true
      done

      ci_unimportant_heading "cloudtest: Generate coverage information"
      if [[ -n "$(find . -name '*.profraw')" ]]; then
        # Merge all raw profiles into one profdata file, then drop the raw files.
        find . -name '*.profraw' -exec bin/ci-builder run stable rust-profdata merge -sparse -o coverage/"$BUILDKITE_JOB_ID".profdata {} +
        find . -name '*.profraw' -delete

        # One lcov file per binary that was actually fetched; each one becomes
        # an `-a <file>` argument for the final lcov merge.
        lcov_args=()
        for program in clusterd environmentd; do
          if [[ -f coverage/"$program" ]]; then
            export_cov coverage/"$program"
            lcov_args+=("-a" coverage/"$BUILDKITE_JOB_ID"-"$program".lcov)
          fi
        done
        rm coverage/"$BUILDKITE_JOB_ID".profdata

        if [[ "${#lcov_args[@]}" != 0 ]]; then
          bin/ci-builder run stable lcov "${lcov_args[@]}" -o coverage/"$BUILDKITE_JOB_ID".lcov
          rm coverage/"$BUILDKITE_JOB_ID"-*.lcov
          bin/ci-builder run stable zstd coverage/"$BUILDKITE_JOB_ID".lcov
          buildkite-agent artifact upload coverage/"$BUILDKITE_JOB_ID".lcov.zst
          rm -rf coverage
        fi
      fi
    fi

    ci_unimportant_heading "cloudtest: Cleaning up mz_debug files from test/cloudtest/test_mz_debug_tool.py"
    # -prune keeps find from descending into a directory that is about to be
    # removed; without it a nested match would be handed to `rm -r` after its
    # parent is already gone, failing find and, under `set -e`, this handler.
    find . -type d -name 'mz_debug*' -prune -exec rm -r {} +

    ci_unimportant_heading "kail: Stopping instance..."
    docker logs kail > kail-output.log 2>&1
    docker stop kail

    ci_unimportant_heading "cloudtest: Uploading logs..."
    # Diagnostics are best effort: a failing kubectl call must not stop the
    # remaining collection, hence the `|| true` on each of them.
    for pod in $(kubectl get pods -o name | grep -v -E 'kubernetes|minio|cockroach|redpanda'); do
      kubectl logs --prefix=true "$pod" &>> kubectl-get-logs.log || true
      kubectl logs --previous --prefix=true "$pod" &>> kubectl-get-logs-previous.log || true
    done
    kubectl get events > kubectl-get-events.log || true
    kubectl get all > kubectl-get-all.log || true
    # Replace each "Environment:" section of the describe output with a single
    # "[REDACTED]" line: everything indented deeper than the heading is dropped
    # until a blank line or a line at the heading's indent (or less) appears.
    kubectl describe all | awk '
      BEGIN { redact=0 }
      /^[[:space:]]*Environment:/ {
        indent = match($0, /[^ ]/) - 1
        print substr($0, 1, indent) "Environment: [REDACTED]"
        redact = 1
        next
      }
      redact {
        current_indent = match($0, /[^ ]/) - 1
        if (current_indent <= indent || NF == 0) {
          redact = 0
        } else {
          next
        }
      }
      { print }
    ' > kubectl-describe-all.log || true
    kubectl get pods -o wide > kubectl-pods-with-nodes.log || true
    kubectl -n kube-system get events > kubectl-get-events-kube-system.log || true
    kubectl -n kube-system get all > kubectl-get-all-kube-system.log || true
    kubectl -n kube-system describe all > kubectl-describe-all-kube-system.log || true
    # shellcheck disable=SC2024
    sudo journalctl --merge --since "$STEP_START_TIMESTAMP" > journalctl-merge.log

    # Fixed list of log files plus every JUnit report found below the current
    # directory.
    mapfile -t artifacts < <(printf "run.log\nkubectl-get-logs.log\nkubectl-get-logs-previous.log\nkubectl-get-events.log\nkubectl-get-all.log\nkubectl-describe-all.log\nkubectl-pods-with-nodes.log\nkubectl-get-events-kube-system.log\nkubectl-get-all-kube-system.log\nkubectl-describe-all-kube-system.log\njournalctl-merge.log\nkail-output.log\n"; find . -name 'junit_*.xml')

    # The secret scan and the artifact upload run concurrently in the
    # background; the bare `wait` below blocks until both are done and does
    # not propagate their exit statuses.
    {
      bin/ci-builder run stable trufflehog --no-update --no-verification --json \
        --exclude-detectors=coda,dockerhub,box,npmtoken,github,snykkey,eightxeight,sumologickey,miro,fmfw,logzio,qase \
        filesystem "${artifacts[@]}" | trufflehog_jq_filter_logs > trufflehog.log
    } &

    # buildkite-agent takes the upload list as one ';'-separated string.
    artifacts_str=$(IFS=";"; echo "${artifacts[*]}")
    unset CI_EXTRA_ARGS # We don't want extra args for the annotation

    # Continue even if ci-annotate-errors fails
    ci_annotate_errors_result=0
    # We have to upload artifacts before ci-annotate-errors, so that the
    # annotations can link to the artifacts
    buildkite-agent artifact upload "$artifacts_str" &
    wait
    bin/ci-builder run stable bin/ci-annotate-errors --test-cmd="$TEST_CMD" --test-result="$TEST_RESULT" "${artifacts[@]}" trufflehog.log > ci-annotate-errors.log || ci_annotate_errors_result=$?
    buildkite-agent artifact upload "ci-annotate-errors.log"

    # File should not be empty, see database-issues#7569
    test -s kubectl-get-logs-previous.log

    ci_unimportant_heading "cloudtest: Resetting..."
    bin/ci-builder run stable test/cloudtest/reset

    ci_collapsed_heading ":docker: Purging all existing docker containers and volumes, regardless of origin"
    sudo systemctl restart docker
    docker ps --all --quiet | xargs --no-run-if-empty docker rm --force --volumes

    exit "$ci_annotate_errors_result"
  }
  # Run cleanup on normal exit as well as on SIGTERM and SIGINT.
  trap cleanup EXIT SIGTERM SIGINT

  # Run pytest with line-buffered output. stdout and stderr are shown unchanged
  # in Buildkite's view, while the copy written to run.log has its ANSI command
  # codes stripped by sed. A failing run records its status in TEST_RESULT
  # instead of aborting the script.
  {
    stdbuf --output=L --error=L bin/ci-builder run stable bin/pytest "${run_args[@]}" 2>&1 \
      | tee >(sed -r "s/\x1B\[[0-9;]*[A-Za-z]//g" > run.log)
  } || TEST_RESULT=$?

  if [[ "$TEST_RESULT" != "0" ]]; then
    # Give the logs some time to log panics, otherwise they might be missing later
    sleep 10
  fi