#!/usr/bin/env bash

# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Shared shell helpers and the cloudtest configuration, both resolved relative
# to the current working directory.
source misc/shlib/shlib.bash
source test/cloudtest/config.bash

# Base pytest arguments: the JUnit report is named after the Buildkite job so
# that reports from different jobs do not share a file name.
run_args=("--junitxml=junit_cloudtest_${BUILDKITE_JOB_ID}.xml")

# Shadows the `kubectl` command: every call is routed through
# `bin/ci-builder run stable` and pinned to the context named by $K8S_CONTEXT.
# Arguments: forwarded verbatim to kubectl after the --context flag.
# Returns:   the exit status of the wrapped command.
kubectl() {
    local -a wrapped=(bin/ci-builder run stable kubectl "--context=${K8S_CONTEXT}")
    "${wrapped[@]}" "$@"
}

# Exports lcov coverage data for one instrumented binary.
# Globals:   BUILDKITE_JOB_ID (read) - names the profile and the output file.
# Arguments: $1 - path to the binary to export coverage for.
# Outputs:   writes coverage/<job id>-<basename of $1>.lcov, reading the
#            profile from coverage/<job id>.profdata.
# Returns:   the exit status of the `rust-cov export` invocation.
export_cov() {
    local binary=$1
    local lcov_path
    lcov_path="coverage/${BUILDKITE_JOB_ID}-$(basename "$binary").lcov"

    # Source paths matching any of these patterns are left out of the report.
    local -a ignore_flags=()
    local pattern
    for pattern in .cargo/ target/release/ /cargo/ /mnt/build/ /rustc/; do
        ignore_flags+=("--ignore-filename-regex=${pattern}")
    done

    bin/ci-builder run stable rust-cov export \
        "${ignore_flags[@]}" \
        --format=lcov "$binary" \
        "--instr-profile=coverage/${BUILDKITE_JOB_ID}.profdata" src/ \
        > "$lcov_path"
}

# Fold the plugin-supplied arguments into run_args. `--no-test-parallelism` is
# consumed here as a switch and never forwarded to pytest; every other
# argument is passed through unchanged. The loop reads the values from
# `result` after a successful `read_list` call.
test_parallelism=true
if read_list BUILDKITE_PLUGIN_CLOUDTEST_ARGS; then
    for arg in "${result[@]}"; do
        case "$arg" in
            --no-test-parallelism)
                test_parallelism=false
                ;;
            *)
                run_args+=("$arg")
                ;;
        esac
    done
fi

# Unless parallelism was switched off, select this job's share of the tests:
# the split count is the number of parallel jobs (default 1) and the group is
# the zero-based parallel job index (default 0) shifted to be one-based.
if [ "$test_parallelism" = true ]; then
    run_args+=("--splits=${BUILDKITE_PARALLEL_JOB_COUNT:-1}")
    run_args+=("--group=$((${BUILDKITE_PARALLEL_JOB:-0} + 1))")
fi

# Wall-clock start of this step. cleanup() uses it to compute the elapsed time
# for its timeout check and as the lower bound for `journalctl --since`.
STEP_START_TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")

# Raise the inotify watch/instance limits on the host before using the cluster.
ci_collapsed_heading "kind: Increase system limits..."
sudo sysctl fs.inotify.max_user_watches=524288
sudo sysctl fs.inotify.max_user_instances=512

ci_collapsed_heading "kind: Make sure kind is running..."
bin/ci-builder run stable test/cloudtest/setup

# Sometimes build cancellations prevent us from properly cleaning up the last
# cloudtest run, so force a cleanup just in case.
ci_collapsed_heading "kind: Purging state from previous builds..."
bin/ci-builder run stable test/cloudtest/reset
# Drop kubectl log files left in the working directory by an earlier run.
rm -f kubectl-*.log

# Start kail detached in a container named "kail"; cleanup() later collects
# its output with `docker logs kail` and stops it.
ci_collapsed_heading "kail: Start a new instance"
NO_COLOR=1 bin/ci-builder run stable --detach --name "kail" kail --context "$K8S_CONTEXT" --log-level info

# Human-readable form of the test command, used for the heading below and
# passed to ci-annotate-errors in cleanup(). The actual invocation uses the
# run_args array directly, so this string is for display only.
TEST_CMD="bin/pytest ${run_args[*]}"

ci_uncollapsed_heading "cloudtest: Running \`$TEST_CMD\`"
# Assume success until the pytest run reports otherwise.
TEST_RESULT=0

#######################################
# Post-run handler, installed by this script via `trap` for EXIT, SIGTERM and
# SIGINT. Records a timeout marker, optionally gathers coverage, collects
# Kubernetes/journal/kail logs, uploads them as Buildkite artifacts, runs
# ci-annotate-errors, and finally resets the cluster and purges docker.
# Globals:
#   STEP_START_TIMESTAMP, BUILDKITE_TIMEOUT, BUILDKITE_LABEL, BUILDKITE_JOB_ID,
#   CI_COVERAGE_ENABLED, TEST_CMD, TEST_RESULT (read)
#   CLEANUP_STARTED (written), CI_EXTRA_ARGS (unset)
# Arguments:
#   None
# Returns:
#   Does not return on its first invocation: exits the shell with the status
#   of ci-annotate-errors. Returns 0 immediately on any later invocation.
#######################################
cleanup() {
  # This handler ends in `exit`, and `exit` inside a SIGTERM/SIGINT handler
  # fires the EXIT trap too. Without this guard the whole cleanup would run a
  # second time against an already torn-down environment.
  if [[ -n "${CLEANUP_STARTED:-}" ]]; then
    return 0
  fi
  CLEANUP_STARTED=1

  local start_time end_time elapsed pod program artifacts_str
  local -a coverage_args=() artifacts=()
  local annotate_result=0

  echo "--- Post command steps"
  # Buildkite exposes no way to check if a test timed out (and wasn't cancelled manually), so we have to calculate it ourselves
  start_time=$(date -d "$STEP_START_TIMESTAMP" +%s)
  end_time=$(date +%s)
  elapsed=$((end_time - start_time))
  # Only compare when BUILDKITE_TIMEOUT is a plain number of minutes. An unset
  # or non-numeric value would otherwise abort the handler under `set -u`
  # before any logs are collected.
  if [[ "${BUILDKITE_TIMEOUT:-}" =~ ^[0-9]+$ ]] \
    && ((elapsed >= 10#$BUILDKITE_TIMEOUT * 60)); then
    printf "\n%s" "$BUILDKITE_LABEL: test timed out" >> run.log
  fi

  if [ -n "${CI_COVERAGE_ENABLED:-}" ]; then
    ci_uncollapsed_heading "cloudtest: Fetching binaries for coverage"
    mkdir -p coverage/
    chmod 777 coverage/
    kubectl cp environmentd-0:/usr/local/bin/environmentd coverage/environmentd
    kubectl cp environmentd-0:/coverage coverage/
    for pod in $(kubectl get pods -o name | grep -E 'cluster-'); do
      kubectl cp "$pod":/coverage coverage/ || true # Could get deleted
      kubectl cp "$pod":/usr/local/bin/clusterd coverage/clusterd || true
    done

    ci_unimportant_heading "cloudtest: Generate coverage information"
    if [ -n "$(find . -name '*.profraw')" ]; then
      # Merge every raw profile into one profdata file, then drop the raw files.
      find . -name '*.profraw' -exec bin/ci-builder run stable rust-profdata merge -sparse -o coverage/"$BUILDKITE_JOB_ID".profdata {} +
      find . -name '*.profraw' -delete

      # Export one lcov file per binary that was actually fetched.
      for program in clusterd environmentd; do
        if [ -f coverage/"$program" ]; then
          export_cov coverage/"$program"
          coverage_args+=("-a" coverage/"$BUILDKITE_JOB_ID"-"$program".lcov)
        fi
      done
      rm coverage/"$BUILDKITE_JOB_ID".profdata
      if [ "${#coverage_args[@]}" != 0 ]; then
        # Combine the per-binary reports, compress the result and upload it.
        bin/ci-builder run stable lcov "${coverage_args[@]}" -o coverage/"$BUILDKITE_JOB_ID".lcov
        rm coverage/"$BUILDKITE_JOB_ID"-*.lcov
        bin/ci-builder run stable zstd coverage/"$BUILDKITE_JOB_ID".lcov
        buildkite-agent artifact upload coverage/"$BUILDKITE_JOB_ID".lcov.zst
        rm -rf coverage
      fi
    fi
  fi

  ci_unimportant_heading "cloudtest: Cleaning up mz_debug files from test/cloudtest/test_mz_debug_tool.py"
  find . -type d -name 'mz_debug*' -exec rm -r {} +

  ci_unimportant_heading "kail: Stopping instance..."
  docker logs kail > kail-output.log 2>&1
  docker stop kail

  ci_unimportant_heading "cloudtest: Uploading logs..."
  # Current and previous-container logs for every pod except the excluded
  # infrastructure ones; failures for individual pods are tolerated.
  for pod in $(kubectl get pods -o name | grep -v -E 'kubernetes|minio|cockroach|redpanda'); do
    kubectl logs --prefix=true "$pod" &>> kubectl-get-logs.log || true
    kubectl logs --previous --prefix=true "$pod" &>> kubectl-get-logs-previous.log || true
  done
  kubectl get events > kubectl-get-events.log || true
  kubectl get all > kubectl-get-all.log || true
  # Replace each "Environment:" section of the describe output with a single
  # "[REDACTED]" line: the lines following it are dropped until one is blank or
  # indented no deeper than the "Environment:" line itself.
  kubectl describe all | awk '
    BEGIN { redact=0 }
    /^[[:space:]]*Environment:/ {
      indent = match($0, /[^ ]/) - 1
      print substr($0, 1, indent) "Environment: [REDACTED]"
      redact = 1
      next
    }
    redact {
      current_indent = match($0, /[^ ]/) - 1
      if (current_indent <= indent || NF == 0) {
        redact = 0
      } else {
        next
      }
    }
    { print }
  ' > kubectl-describe-all.log || true
  kubectl get pods -o wide > kubectl-pods-with-nodes.log || true

  kubectl -n kube-system get events > kubectl-get-events-kube-system.log || true
  kubectl -n kube-system get all > kubectl-get-all-kube-system.log || true
  kubectl -n kube-system describe all > kubectl-describe-all-kube-system.log || true

  # shellcheck disable=SC2024
  sudo journalctl --merge --since "$STEP_START_TIMESTAMP" > journalctl-merge.log

  # Fixed log files first, followed by whatever JUnit reports exist.
  mapfile -t artifacts < <(
    printf '%s\n' \
      run.log \
      kubectl-get-logs.log \
      kubectl-get-logs-previous.log \
      kubectl-get-events.log \
      kubectl-get-all.log \
      kubectl-describe-all.log \
      kubectl-pods-with-nodes.log \
      kubectl-get-events-kube-system.log \
      kubectl-get-all-kube-system.log \
      kubectl-describe-all-kube-system.log \
      journalctl-merge.log \
      kail-output.log
    find . -name 'junit_*.xml'
  )

  # Scan the artifacts with trufflehog in the background while they upload.
  {
    bin/ci-builder run stable trufflehog --no-update --no-verification --json --exclude-detectors=coda,dockerhub,box,npmtoken,github,snykkey,eightxeight,sumologickey,miro,fmfw,logzio,qase filesystem "${artifacts[@]}" | trufflehog_jq_filter_logs > trufflehog.log
  } &
  # The artifact list is handed to buildkite-agent as one ";"-joined string.
  artifacts_str=$(
    IFS=";"
    echo "${artifacts[*]}"
  )

  unset CI_EXTRA_ARGS # We don't want extra args for the annotation
  # We have to upload artifacts before ci-annotate-errors, so that the annotations can link to the artifacts
  buildkite-agent artifact upload "$artifacts_str" &
  # Barrier for both background jobs: trufflehog.log must be complete and the
  # artifacts uploaded before annotating.
  wait
  # Continue even if ci-annotate-errors fails; its status becomes the exit code.
  bin/ci-builder run stable bin/ci-annotate-errors --test-cmd="$TEST_CMD" --test-result="$TEST_RESULT" "${artifacts[@]}" trufflehog.log > ci-annotate-errors.log || annotate_result=$?
  buildkite-agent artifact upload "ci-annotate-errors.log"

  # File should not be empty, see database-issues#7569
  test -s kubectl-get-logs-previous.log

  ci_unimportant_heading "cloudtest: Resetting..."
  bin/ci-builder run stable test/cloudtest/reset

  ci_collapsed_heading ":docker: Purging all existing docker containers and volumes, regardless of origin"
  sudo systemctl restart docker
  docker ps --all --quiet | xargs --no-run-if-empty docker rm --force --volumes

  exit "$annotate_result"
}

# Run cleanup on normal exit and also when the step is terminated or
# interrupted.
# NOTE(review): cleanup ends with `exit`, so a signal-triggered run is followed
# by the EXIT trap firing as well.
trap cleanup EXIT SIGTERM SIGINT

# sed command to filter out ANSI command codes in run.log, while keeping them in Buildkite's view
# stdbuf requests line-buffered stdout/stderr; `|&` sends both streams through
# tee. A failing pipeline is captured in TEST_RESULT via `||` rather than
# aborting the script under `set -e`, so the lines below still run.
{ stdbuf --output=L --error=L bin/ci-builder run stable bin/pytest "${run_args[@]}" |& tee >(sed -r "s/\x1B\[[0-9;]*[A-Za-z]//g" > run.log); } || TEST_RESULT=$?
if [ "$TEST_RESULT" != "0" ]; then
    # Give the logs some time to log panics, otherwise they might be missing later
    sleep 10
fi