ci_annotate_errors.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101
  1. # Copyright Materialize, Inc. and contributors. All rights reserved.
  2. #
  3. # Use of this software is governed by the Business Source License
  4. # included in the LICENSE file at the root of this repository.
  5. #
  6. # As of the Change Date specified in that file, in accordance with
  7. # the Business Source License, use of this software will be governed
  8. # by the Apache License, Version 2.0.
  9. #
  10. # ci_annotate_errors.py - Detect errors in junit xml as well as log files
  11. # during CI and find associated open GitHub issues in Materialize repository.
  12. import argparse
  13. import json
  14. import mmap
  15. import os
  16. import re
  17. import sys
  18. import traceback
  19. import urllib.parse
  20. from collections.abc import Sequence
  21. from concurrent.futures import ThreadPoolExecutor
  22. from dataclasses import dataclass, field
  23. from itertools import chain
  24. from textwrap import dedent
  25. from typing import Any
  26. from xml.etree.ElementTree import ParseError
  27. from junitparser.junitparser import Error, Failure, JUnitXml
  28. from materialize import ci_util, ui
  29. from materialize.buildkite import (
  30. add_annotation_raw,
  31. get_artifact_url,
  32. )
  33. from materialize.buildkite_insights.buildkite_api import builds_api, generic_api
  34. from materialize.buildkite_insights.buildkite_api.buildkite_constants import (
  35. BUILDKITE_RELEVANT_COMPLETED_BUILD_STEP_STATES,
  36. )
  37. from materialize.buildkite_insights.data.build_history import (
  38. BuildHistory,
  39. BuildHistoryEntry,
  40. )
  41. from materialize.buildkite_insights.data.build_step import BuildStepMatcher
  42. from materialize.buildkite_insights.util.build_step_utils import (
  43. extract_build_step_outcomes,
  44. )
  45. from materialize.cli.mzcompose import JUNIT_ERROR_DETAILS_SEPARATOR
  46. from materialize.github import (
  47. KnownGitHubIssue,
  48. for_github_re,
  49. get_known_issues_from_github,
  50. )
  51. from materialize.observed_error import ObservedBaseError, WithIssue
  52. from materialize.test_analytics.config.test_analytics_db_config import (
  53. create_test_analytics_config_with_hostname,
  54. )
  55. from materialize.test_analytics.connector.test_analytics_connector import (
  56. TestAnalyticsUploadError,
  57. )
  58. from materialize.test_analytics.data.build_annotation import build_annotation_storage
  59. from materialize.test_analytics.data.build_annotation.build_annotation_storage import (
  60. AnnotationErrorEntry,
  61. )
  62. from materialize.test_analytics.test_analytics_db import TestAnalyticsDb
# Unexpected failures, report them
# Scans raw log bytes line by line (re.MULTILINE anchors ^/$ per line) for
# crash and fatal-error signatures. Most alternatives match a single line; the
# RESULT COMPARISON ISSUE alternative deliberately spans multiple lines via
# [\s\S]*?. Compiled as a bytes pattern because log files are read as bytes.
ERROR_RE = re.compile(
    rb"""
    ^ .*
    ( segfault\ at
    | trap\ invalid\ opcode
    | general\ protection
    | has\ overflowed\ its\ stack
    | internal\ error:
    | \*\ FATAL:
    | was\ provided\ more\ than\ once,\ but\ cannot\ be\ used\ multiple\ times
    | (^|\ )fatal: # used in frontegg-mock
    | [Oo]ut\ [Oo]f\ [Mm]emory
    | memory\ allocation\ of\ [0-9]+\ bytes\ failed
    | cannot\ migrate\ from\ catalog
    | halting\ process: # Rust unwrap
    | fatal\ runtime\ error: # stack overflow
    | \[SQLsmith\] # Unknown errors are logged
    | \[SQLancer\] # Unknown errors are logged
    | environmentd:\ fatal: # startup failure
    | clusterd:\ fatal: # startup failure
    | error:\ Found\ argument\ '.*'\ which\ wasn't\ expected,\ or\ isn't\ valid\ in\ this\ context
    | environmentd\ .*\ unrecognized\ configuration\ parameter
    | cannot\ load\ unknown\ system\ parameter\ from\ catalog\ storage
    | SUMMARY:\ .*Sanitizer
    | primary\ source\ \w+\ seemingly\ dropped\ before\ subsource
    | :\ test\ timed\ out
    # Only notifying on unexpected failures. INT, TRAP, BUS, FPE, SEGV, PIPE
    | \ ANOM_ABEND\ .*\ sig=(2|5|7|8|11|13)
    # \s\S is any character including newlines, so this matches multiline strings
    # non-greedy using ? so that we don't match all the result comparison issues into one block
    | ----------\ RESULT\ COMPARISON\ ISSUE\ START\ ----------[\s\S]*?----------\ RESULT\ COMPARISON\ ISSUE\ END\ ------------
    # output consistency tests
    # | possibly\ invalid\ operation\ specification # disabled
    # for miri test summary
    | (FAIL|TIMEOUT)\s+\[\s*\d+\.\d+s\]
    # parallel-workload
    | worker_.*\ still\ running: [\s\S]* Threads\ have\ not\ stopped\ within\ 5\ minutes,\ exiting\ hard
    # source-table migration
    | source-table-migration\ issue
    )
    .* $
    """,
    re.VERBOSE | re.MULTILINE,
)
# Panics are multiline and our log lines of multiple services are interleaved,
# making them complex to handle in regular expressions, thus handle them
# separately.
# Example 1: launchdarkly-materialized-1 | 2025-02-08T16:40:57.296144Z thread 'coordinator' panicked at [...]
# Example 2: [pod/environmentd-0/environmentd] 2025-02-08T16:40:57.296144Z thread 'coordinator' panicked at [...]
# First line of a Rust panic; captures the emitting service in the "service"
# group. Accepts both the compose shape ("service | ...") and the kubectl
# shape ("[pod/.../container] ...").
PANIC_IN_SERVICE_START_RE = re.compile(
    rb"^(\[)?(?P<service>[^ ]*)(\s*\||\]) \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z thread '.*' panicked at "
)
# A space-delimited UTC timestamp (e.g. " 2025-02-08T16:40:57.296144Z ") as it
# appears embedded in panic continuation lines.
TIMESTAMP_IN_PANIC_RE = re.compile(rb" \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z ")
# journalctl-style line, e.g. "Feb 08 16:40:57 somehost msg..."; the "msg"
# group captures everything after the hostname.
JOURNALCTL_LOG_LINE_RE = re.compile(
    rb"^[A-Z][a-z]{2} \d{2} \d{2}:\d{2}:\d{2} [^\s]+ (?P<msg>.+)"
)
# Example 1: launchdarkly-materialized-1 | global timestamp must always go up
# Example 2: [pod/environmentd-0/environmentd] Unknown collection identifier u2082
# Generic service log line: captures the "service" and the rest as "msg".
SERVICES_LOG_LINE_RE = re.compile(rb"^(\[)?(?P<service>[^ ]*)(\s*\||\]) (?P<msg>.*)$")
  123. # Expected failures, don't report them
  124. IGNORE_RE = re.compile(
  125. rb"""
  126. # Expected in restart test
  127. ( restart-materialized-1\ \ \|\ thread\ 'coordinator'\ panicked\ at\ 'can't\ persist\ timestamp
  128. # Expected in restart test
  129. | restart-materialized-1\ *|\ thread\ 'coordinator'\ panicked\ at\ 'external\ operation\ .*\ failed\ unrecoverably.*
  130. # Expected in cluster test
  131. | cluster-clusterd[12]-1\ .*\ halting\ process:\ new\ timely\ configuration\ does\ not\ match\ existing\ timely\ configuration
  132. | cluster-clusterd1-1\ .*\ replica\ expired
  133. # Emitted by tests employing explicit mz_panic()
  134. | forced\ panic
  135. # Emitted by broken_statements.slt in order to stop panic propagation, as 'forced panic' will unwantedly panic the `environmentd` thread.
  136. | forced\ optimizer\ panic
  137. # Expected once compute cluster has panicked, brings no new information
  138. | timely\ communication\ error:
  139. # Expected once compute cluster has panicked, only happens in CI
  140. | aborting\ because\ propagate_crashes\ is\ enabled
  141. # Expected when CRDB is corrupted
  142. | restart-materialized-1\ .*relation\ \\"fence\\"\ does\ not\ exist
  143. # Expected when CRDB is corrupted
  144. | restart-materialized-1\ .*relation\ "consensus"\ does\ not\ exist
  145. # Will print a separate panic line which will be handled and contains the relevant information (new style)
  146. | internal\ error:\ unexpected\ panic\ during\ query\ optimization
  147. # redpanda INFO logging
  148. | [Ll]arger\ sizes\ prevent\ running\ out\ of\ memory
  149. # Old versions won't support new parameters
  150. | (platform-checks|legacy-upgrade|upgrade-matrix|feature-benchmark)-materialized-.* \| .*cannot\ load\ unknown\ system\ parameter\ from\ catalog\ storage
  151. # Fencing warnings are OK in fencing/0dt tests
  152. | (txn-wal-fencing-mz_first-|platform-checks-mz_|parallel-workload-|data-ingest-|zippy-|legacy-upgrade-).* \| .*unexpected\ fence\ epoch
  153. | (txn-wal-fencing-mz_first-|platform-checks-mz_|parallel-workload-|data-ingest-|zippy-|legacy-upgrade-).* \| .*fenced\ by\ new\ catalog
  154. | (txn-wal-fencing-mz_first-|platform-checks-mz_|parallel-workload-|data-ingest-|zippy-|legacy-upgrade-).* \| .*starting\ catalog\ transaction:\ Durable\(Fence\(Epoch
  155. | (txn-wal-fencing-mz_first-|platform-checks-mz_|parallel-workload-|data-ingest-|zippy-|legacy-upgrade-).* \| .*fenced\ by\ envd
  156. | internal\ error:\ no\ AWS\ external\ ID\ prefix\ configured
  157. # For platform-checks upgrade tests
  158. | platform-checks-.* \| .* received\ persist\ state\ from\ the\ future
  159. | cannot\ load\ unknown\ system\ parameter\ from\ catalog\ storage(\ to\ set\ (default|configured)\ parameter)?
  160. # For tests we purposely trigger this error
  161. | skip-version-upgrade-materialized.* \| .* incompatible\ persist\ version\ \d+\.\d+\.\d+(-dev)?,\ current:\ \d+\.\d+\.\d+(-dev\.\d+)?,\ make\ sure\ to\ upgrade\ the\ catalog\ one\ version\ forward\ at\ a\ time
  162. # For 0dt upgrades
  163. | halting\ process:\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out|dependency\ since\ frontier\ is\ empty\ while\ dependent\ upper\ is\ not\ empty)
  164. | zippy-materialized.* \| .* halting\ process:\ Server\ started\ with\ requested\ generation
  165. | there\ have\ been\ DDL\ that\ we\ need\ to\ react\ to;\ rebooting\ in\ read-only\ mode
  166. # Don't care for ssh problems
  167. | fatal:\ userauth_pubkey
  168. # Expected in Terraform tests if something else failed during the setup
  169. | mz-debug:\ fatal:\ failed\ to\ read\ kubeconfig
  170. # Fences without incrementing deploy generation
  171. | txn-wal-fencing-mz_first-.* \| .* unable\ to\ confirm\ leadership
  172. | txn-wal-fencing-mz_first-.* \| .* fenced\ by\ envd
  173. # 0dt platform-checks have two envds running in parallel, thus high load, tests still succeed, so ignore noise
  174. | platform-checks-mz_.* \| .* was\ expired\ due\ to\ inactivity\.\ Did\ the\ machine\ go\ to\ sleep\?
  175. # This can happen in "K8s recovery: envd on failing node", but the test still succeeds, old environmentd will just be crashed, see database-issues#8749
  176. | \[pod/environmentd-0/environmentd\]\ .*\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out)
  177. | cannot\ load\ unknown\ system\ parameter
  178. )
  179. """,
  180. re.VERBOSE | re.MULTILINE,
  181. )
# Product limits (finding new limits) intentionally runs into the *actual*
# product limits. The normal IGNORE_RE doesn't allow filtering "out of memory"
# errors specific to a test, since it doesn't contain the test name
# Applied in addition to IGNORE_RE for the product-limits suite only, so OOM /
# stack-overflow lines from the limits-materialized service are not reported.
PRODUCT_LIMITS_FIND_IGNORE_RE = re.compile(
    rb"""
    ( Memory\ cgroup\ out\ of\ memory
    | [Oo]ut\ [Oo]f\ [Mm]emory
    | limits-materialized .* \| .* fatal\ runtime\ error:\ stack\ overflow
    | limits-materialized .* \| .* has\ overflowed\ its\ stack
    )
    """,
    re.VERBOSE | re.MULTILINE,
)
@dataclass
class ErrorLog:
    """An error found by regex-scanning a log file."""

    # the matched error text, raw bytes from the log
    match: bytes
    # path of the log file the match was found in
    file: str
@dataclass
class Secret:
    """A potential credential leak detected in a log/artifact file."""

    # the detected secret value
    secret: str
    # path of the file the secret was found in
    file: str
    # line number of the finding within the file
    line: int
    # name of the detector that flagged the secret
    detector_name: str
@dataclass
class JunitError:
    """A failure/error entry parsed from a junit XML report."""

    # class name of the failing test
    testclass: str
    # name of the failing test case
    testcase: str
    # short failure message
    message: str
    # full failure text (may embed extra sections separated by
    # JUNIT_ERROR_DETAILS_SEPARATOR)
    text: str
  211. @dataclass(kw_only=True, unsafe_hash=True)
  212. class ObservedError(ObservedBaseError):
  213. # abstract class, do not instantiate
  214. error_message: str
  215. error_details: str | None = None
  216. additional_collapsed_error_details_header: str | None = None
  217. additional_collapsed_error_details: str | None = None
  218. error_type: str
  219. location: str
  220. location_url: str | None = None
  221. max_error_length: int = 10000
  222. max_details_length: int = 10000
  223. occurrences: int = 1
  224. def error_message_as_markdown(self) -> str:
  225. return format_message_as_code_block(self.error_message, self.max_error_length)
  226. def error_details_as_markdown(self) -> str:
  227. if self.error_details is None:
  228. return ""
  229. return f"\n{format_message_as_code_block(self.error_details, self.max_details_length)}"
  230. def error_message_as_text(self) -> str:
  231. return crop_text(self.error_message, self.max_error_length)
  232. def error_details_as_text(self) -> str:
  233. if self.error_details is None:
  234. return ""
  235. return f"\n{crop_text(self.error_details, self.max_details_length)}"
  236. def location_as_markdown(self) -> str:
  237. if self.location_url is None:
  238. return self.location
  239. else:
  240. return f'<a href="{self.location_url}">{self.location}</a>'
  241. def additional_collapsed_error_details_as_markdown(self) -> str:
  242. if self.additional_collapsed_error_details is None:
  243. return ""
  244. assert self.additional_collapsed_error_details_header is not None
  245. return (
  246. "\n"
  247. + dedent(
  248. f"""
  249. <details>
  250. <summary>{self.additional_collapsed_error_details_header}</summary>
  251. <pre>{self.additional_collapsed_error_details}</pre>
  252. </details>
  253. """
  254. ).strip()
  255. + "\n\n"
  256. )
  257. @dataclass(kw_only=True, unsafe_hash=True)
  258. class ObservedErrorWithIssue(ObservedError, WithIssue):
  259. issue_is_closed: bool
  260. issue_ignore_failure: bool
  261. def _get_issue_presentation(self) -> str:
  262. issue_presentation = f"#{self.issue_number}"
  263. if self.issue_is_closed:
  264. issue_presentation = f"{issue_presentation}, closed"
  265. return issue_presentation
  266. def to_text(self) -> str:
  267. return f"{self.error_type} {self.issue_title} ({self._get_issue_presentation()}) in {self.location}: {self.error_message_as_text()}{self.error_details_as_text()}"
  268. def to_markdown(self) -> str:
  269. filters = [{"id": "issue", "value": f"database-issues/{self.issue_number} "}]
  270. ci_failures_url = f"https://ci-failures.dev.materialize.com/?key=test-failures&tfFilters={urllib.parse.quote(json.dumps(filters), safe='')}"
  271. return f'<a href="{ci_failures_url}">{self.error_type}</a> <a href="{self.issue_url}">{self.issue_title} ({self._get_issue_presentation()})</a> in {self.location_as_markdown()}:\n{self.error_message_as_markdown()}{self.error_details_as_markdown()}{self.additional_collapsed_error_details_as_markdown()}'
  272. @dataclass(kw_only=True, unsafe_hash=True)
  273. class ObservedErrorWithLocation(ObservedError):
  274. def to_text(self) -> str:
  275. return f"{self.error_type} in {self.location}: {self.error_message_as_text()}{self.error_details_as_text()}"
  276. def to_markdown(self) -> str:
  277. filters = [{"id": "content", "value": self.error_message_as_text()[:2000]}]
  278. ci_failures_url = f"https://ci-failures.dev.materialize.com/?key=test-failures&tfFilters={urllib.parse.quote(json.dumps(filters), safe='')}"
  279. return f'<a href="{ci_failures_url}">{self.error_type}</a> in {self.location_as_markdown()}:\n{self.error_message_as_markdown()}{self.error_details_as_markdown()}{self.additional_collapsed_error_details_as_markdown()}'
  280. @dataclass(kw_only=True, unsafe_hash=True)
  281. class FailureInCoverageRun(ObservedError):
  282. def to_text(self) -> str:
  283. return f"{self.location}: {self.error_message_as_text()}"
  284. def to_markdown(self) -> str:
  285. return f"{self.location}:\n{self.error_message_as_markdown()}{self.additional_collapsed_error_details_as_markdown()}"
  286. @dataclass
  287. class Annotation:
  288. suite_name: str
  289. buildkite_job_id: str
  290. is_failure: bool
  291. ignore_failure: bool
  292. build_history_on_main: BuildHistory
  293. test_cmd: str
  294. test_desc: str
  295. unknown_errors: Sequence[ObservedBaseError] = field(default_factory=list)
  296. known_errors: Sequence[ObservedBaseError] = field(default_factory=list)
  297. def to_markdown(self, approx_max_length: int = 900_000) -> str:
  298. only_known_errors = len(self.unknown_errors) == 0 and len(self.known_errors) > 0
  299. no_errors = len(self.unknown_errors) == 0 and len(self.known_errors) == 0
  300. wrap_in_details = only_known_errors
  301. wrap_in_summary = only_known_errors
  302. build_link = f'<a href="#{self.buildkite_job_id}">{self.suite_name}</a>'
  303. outcome = (
  304. ("would have failed" if self.ignore_failure else "failed")
  305. if self.is_failure
  306. else "succeeded"
  307. )
  308. title = f"{build_link} {outcome}"
  309. if only_known_errors:
  310. title = f"{title} with known error logs"
  311. elif no_errors:
  312. title = f"{title}, but no error in logs found"
  313. markdown = title
  314. if self.build_history_on_main.has_entries():
  315. markdown += ", " + self.build_history_on_main.to_markdown()
  316. if wrap_in_summary:
  317. markdown = f"<summary>{markdown}</summary>\n"
  318. def errors_to_markdown(
  319. errors: Sequence[ObservedBaseError], available_length: int
  320. ) -> str:
  321. if len(errors) == 0:
  322. return ""
  323. error_markdown = ""
  324. for error in errors:
  325. if len(error_markdown) > available_length:
  326. error_markdown = error_markdown + "* Further errors exist!\n"
  327. break
  328. error_markdown = error_markdown + (
  329. f"* {error.to_markdown()}{error.occurrences_to_markdown()}\n"
  330. )
  331. return "\n" + error_markdown.strip()
  332. markdown += errors_to_markdown(
  333. self.unknown_errors, approx_max_length - len(markdown)
  334. )
  335. markdown += errors_to_markdown(
  336. self.known_errors, approx_max_length - len(markdown)
  337. )
  338. markdown += f"\n<details><summary>Test details & reproducer</summary>\n{self.test_desc}\n<pre>{self.test_cmd}</pre>\n</details>\n"
  339. if wrap_in_details:
  340. markdown = f"<details>{markdown}\n</details>"
  341. return markdown
  342. def main() -> int:
  343. parser = argparse.ArgumentParser(
  344. prog="ci-annotate-errors",
  345. formatter_class=argparse.RawDescriptionHelpFormatter,
  346. description="""
  347. ci-annotate-errors detects errors in junit xml as well as log files during CI
  348. and finds associated open GitHub issues in Materialize repository.""",
  349. )
  350. parser.add_argument("--cloud-hostname", type=str)
  351. parser.add_argument("--test-cmd", type=str)
  352. parser.add_argument("--test-desc", type=str, default="")
  353. parser.add_argument("--test-result", type=int, default=0)
  354. parser.add_argument("log_files", nargs="+", help="log files to search in")
  355. args = parser.parse_args()
  356. test_analytics_config = create_test_analytics_config_with_hostname(
  357. args.cloud_hostname
  358. )
  359. test_analytics = TestAnalyticsDb(test_analytics_config)
  360. # always insert a build job regardless whether it has annotations or not
  361. test_analytics.builds.add_build_job(was_successful=args.test_result == 0)
  362. number_of_unknown_errors, ignore_failure = annotate_logged_errors(
  363. args.log_files,
  364. test_analytics,
  365. args.test_cmd,
  366. args.test_desc,
  367. args.test_result,
  368. )
  369. try:
  370. test_analytics.submit_updates()
  371. except Exception as e:
  372. if not isinstance(e, TestAnalyticsUploadError):
  373. print(traceback.format_exc())
  374. # An error during an upload must never cause the build to fail
  375. test_analytics.on_upload_failed(e)
  376. if ignore_failure and args.test_result:
  377. print(
  378. "+++ IGNORING TEST FAILURE: Caused by a known issue annotated with `ci-ignore-failure: true`",
  379. file=sys.stderr,
  380. )
  381. return 0
  382. if not args.test_result and number_of_unknown_errors:
  383. print(
  384. "+++ Test succeeded, but unknown errors found in logs, marking as failed",
  385. file=sys.stderr,
  386. )
  387. return 1
  388. return args.test_result
  389. def annotate_errors(
  390. unknown_errors: Sequence[ObservedBaseError],
  391. known_errors: Sequence[ObservedBaseError],
  392. build_history_on_main: BuildHistory,
  393. test_analytics_db: TestAnalyticsDb,
  394. test_cmd: str,
  395. test_desc: str,
  396. test_result: int,
  397. ignore_failure: bool,
  398. ) -> None:
  399. assert len(unknown_errors) > 0 or len(known_errors) > 0
  400. annotation_style = "info" if not unknown_errors else "error"
  401. unknown_errors = group_identical_errors(unknown_errors)
  402. known_errors = group_identical_errors(known_errors)
  403. is_failure = len(unknown_errors) > 0 or test_result != 0
  404. annotation = Annotation(
  405. suite_name=get_suite_name(),
  406. buildkite_job_id=os.getenv("BUILDKITE_JOB_ID", ""),
  407. is_failure=is_failure,
  408. ignore_failure=ignore_failure,
  409. build_history_on_main=build_history_on_main,
  410. unknown_errors=unknown_errors,
  411. known_errors=known_errors,
  412. test_cmd=test_cmd,
  413. test_desc=test_desc,
  414. )
  415. add_annotation_raw(style=annotation_style, markdown=annotation.to_markdown())
  416. store_annotation_in_test_analytics(test_analytics_db, annotation)
  417. def group_identical_errors(
  418. errors: Sequence[ObservedBaseError],
  419. ) -> Sequence[ObservedBaseError]:
  420. errors_with_counts: dict[ObservedBaseError, int] = {}
  421. for error in errors:
  422. errors_with_counts[error] = 1 + errors_with_counts.get(error, 0)
  423. consolidated_errors = []
  424. for error, count in errors_with_counts.items():
  425. error.occurrences = count
  426. consolidated_errors.append(error)
  427. return consolidated_errors
def annotate_logged_errors(
    log_files: list[str],
    test_analytics: TestAnalyticsDb,
    test_cmd: str,
    test_desc: str,
    test_result: int,
) -> tuple[int, bool]:
    """
    Returns the number of unknown errors, 0 when all errors are known or there
    were no errors logged as well as whether to ignore the test having failed.
    This will be used to fail a test even if the test itself succeeded, as long
    as it had any unknown error logs.
    """
    # Fetch the Buildkite artifact list in the background while the logs are
    # being scanned; the result is only needed once errors were found.
    # NOTE(review): the executor is never shut down — harmless in a short-lived
    # CLI, but a `with ThreadPoolExecutor() as executor:` would be cleaner.
    executor = ThreadPoolExecutor()
    artifacts_future = executor.submit(ci_util.get_artifacts)
    errors = get_errors(log_files)
    if not errors:
        return (0, False)
    step_key: str = os.getenv("BUILDKITE_STEP_KEY", "")
    buildkite_label: str = os.getenv("BUILDKITE_LABEL", "")
    token = os.getenv("GITHUB_CI_ISSUE_REFERENCE_CHECKER_TOKEN") or os.getenv(
        "GITHUB_TOKEN"
    )
    (known_issues, issues_with_invalid_regex) = get_known_issues_from_github(token)
    unknown_errors: list[ObservedBaseError] = []
    # Issues whose ci-regexp failed to compile are themselves surfaced as
    # unknown errors so they get fixed.
    unknown_errors.extend(issues_with_invalid_regex)
    job = os.getenv("BUILDKITE_JOB_ID")
    known_errors: list[ObservedBaseError] = []
    ignore_failure: bool = False
    # Keep track of known errors so we log each only once
    already_reported_issue_numbers: set[int] = set()

    def handle_error(
        error_message: str,
        error_details: str | None,
        location: str,
        location_url: str | None,
        additional_collapsed_error_details_header: str | None = None,
        additional_collapsed_error_details: str | None = None,
    ):
        # Classify one observed error: first look for a matching *open* known
        # issue, then a *closed* one (potential regression), else record it as
        # unknown. Appends into the enclosing known_errors/unknown_errors.
        search_string = error_message.encode()
        if error_details is not None:
            search_string += ("\n" + error_details).encode()
        for issue in known_issues:
            match = issue.regex.search(for_github_re(search_string))
            if match and issue.info["state"] == "open":
                if issue.apply_to and issue.apply_to not in (
                    step_key.lower(),
                    # strip trailing digits/spaces so sharded labels
                    # (e.g. "Foo 3") still match the issue's apply-to value
                    buildkite_label.lower().rstrip("01234567889 "),
                ):
                    continue
                if issue.location and issue.location != location:
                    continue
                if issue.info["number"] not in already_reported_issue_numbers:
                    known_errors.append(
                        ObservedErrorWithIssue(
                            error_message=error_message,
                            error_details=error_details,
                            error_type="Known issue",
                            internal_error_type="KNOWN_ISSUE",
                            issue_url=issue.info["html_url"],
                            issue_title=issue.info["title"],
                            issue_number=issue.info["number"],
                            issue_is_closed=False,
                            issue_ignore_failure=issue.ignore_failure,
                            location=location,
                            location_url=location_url,
                            additional_collapsed_error_details=additional_collapsed_error_details,
                            additional_collapsed_error_details_header=additional_collapsed_error_details_header,
                        )
                    )
                    already_reported_issue_numbers.add(issue.info["number"])
                # break even when already reported: the error is still known
                break
        else:
            # No open issue matched (for/else): try closed issues next.
            # NOTE(review): unlike the open-issue branch, this one neither
            # strips trailing digits from the label nor checks issue.location
            # — confirm that asymmetry is intentional.
            for issue in known_issues:
                match = issue.regex.search(for_github_re(search_string))
                if match and issue.info["state"] == "closed":
                    if issue.apply_to and issue.apply_to not in (
                        step_key.lower(),
                        buildkite_label.lower(),
                    ):
                        continue
                    if issue.info["number"] not in already_reported_issue_numbers:
                        unknown_errors.append(
                            ObservedErrorWithIssue(
                                error_message=error_message,
                                error_details=error_details,
                                error_type="Potential regression",
                                internal_error_type="POTENTIAL_REGRESSION",
                                issue_url=issue.info["html_url"],
                                issue_title=issue.info["title"],
                                issue_number=issue.info["number"],
                                issue_is_closed=True,
                                issue_ignore_failure=False,  # Never ignore regressions
                                location=location,
                                location_url=location_url,
                                additional_collapsed_error_details=additional_collapsed_error_details,
                                additional_collapsed_error_details_header=additional_collapsed_error_details_header,
                            )
                        )
                        already_reported_issue_numbers.add(issue.info["number"])
                    break
            else:
                # Neither an open nor a closed issue matched: unknown error.
                unknown_errors.append(
                    ObservedErrorWithLocation(
                        error_message=error_message,
                        error_details=error_details,
                        location=location,
                        location_url=location_url,
                        error_type="Unknown error",
                        internal_error_type="UNKNOWN ERROR",
                        additional_collapsed_error_details=additional_collapsed_error_details,
                        additional_collapsed_error_details_header=additional_collapsed_error_details_header,
                    )
                )

    # (errors is always non-empty here due to the early return above)
    if errors:
        artifacts = artifacts_future.result()
        for error in errors:
            if isinstance(error, ErrorLog):
                # Link the log file to its uploaded artifact when possible.
                for artifact in artifacts:
                    if artifact["job_id"] == job and artifact["path"] == error.file:
                        location: str = error.file
                        location_url = get_artifact_url(artifact)
                        break
                else:
                    location: str = error.file
                    location_url = None
                handle_error(error.match.decode("utf-8"), None, location, location_url)
            elif isinstance(error, JunitError):
                if "in Code Coverage" in error.text or "covered" in error.message:
                    msg = "\n".join(filter(None, [error.message, error.text]))
                    # Don't bother looking up known issues for code coverage report, just print it verbatim as an info message
                    known_errors.append(
                        FailureInCoverageRun(
                            error_type="Failure",
                            internal_error_type="FAILURE_IN_COVERAGE_MODE",
                            error_message=msg,
                            location=error.testcase,
                        )
                    )
                else:
                    # JUnit error
                    # The text may carry 1 part (details only) or 3 parts
                    # (details, collapsed header, collapsed body) separated by
                    # JUNIT_ERROR_DETAILS_SEPARATOR.
                    all_error_details_raw = error.text
                    all_error_detail_parts = all_error_details_raw.split(
                        JUNIT_ERROR_DETAILS_SEPARATOR
                    )
                    error_details = all_error_detail_parts[0]
                    if len(all_error_detail_parts) == 3:
                        additional_collapsed_error_details_header = (
                            all_error_detail_parts[1]
                        )
                        additional_collapsed_error_details = all_error_detail_parts[2]
                    elif len(all_error_detail_parts) == 1:
                        additional_collapsed_error_details_header = None
                        additional_collapsed_error_details = None
                    else:
                        raise RuntimeError(
                            f"Unexpected error details format: {all_error_details_raw}"
                        )
                    handle_error(
                        error_message=error.message,
                        error_details=error_details,
                        location=error.testcase,
                        location_url=None,
                        additional_collapsed_error_details_header=additional_collapsed_error_details_header,
                        additional_collapsed_error_details=additional_collapsed_error_details,
                    )
            elif isinstance(error, Secret):
                # Link the file containing the secret to its artifact.
                for artifact in artifacts:
                    if artifact["job_id"] == job and artifact["path"] == error.file:
                        location: str = error.file
                        location_url = get_artifact_url(artifact)
                        break
                else:
                    location: str = error.file
                    location_url = None
                handle_error(
                    f"Secret found on line {error.line}: {error.secret}",
                    f"Detector: {error.detector_name}. Don't print out secrets in tests/logs and revoke them immediately. Mark false positives in misc/shlib/shlib.bash's trufflehog_jq_filter_(logs|common)",
                    location,
                    location_url,
                )
            else:
                raise RuntimeError(f"Unexpected error type: {type(error)}")

    # Ignore the failure only when there are no unknown errors and every known
    # error maps to an issue explicitly annotated with ci-ignore-failure.
    ignore_failure = bool(known_errors) and not unknown_errors
    if not unknown_errors:
        for error in known_errors:
            if not isinstance(error, ObservedErrorWithIssue):
                ignore_failure = False
                break
            elif error.issue_ignore_failure == False:
                ignore_failure = False
                break

    build_history_on_main = get_failures_on_main(test_analytics)
    try:
        annotate_errors(
            unknown_errors,
            known_errors,
            build_history_on_main,
            test_analytics,
            test_cmd,
            test_desc,
            test_result,
            ignore_failure,
        )
    except Exception as e:
        # Annotating is best-effort; never fail the job because of it.
        print(f"Annotating failed, continuing: {e}")

    # No need for rest of the logic as no error logs were found, but since
    # this script was called the test still failed, so showing the current
    # failures on main branch.
    # Only fetch the main branch status when we are running in CI, but no on
    # main, so inside of a PR or a release branch instead.
    if (
        len(unknown_errors) == 0
        and len(known_errors) == 0
        and ui.env_is_truthy("BUILDKITE")
        and os.getenv("BUILDKITE_BRANCH") != "main"
        and test_result != 0
        and get_job_state() not in ("canceling", "canceled")
    ):
        annotation = Annotation(
            suite_name=get_suite_name(),
            buildkite_job_id=os.getenv("BUILDKITE_JOB_ID", ""),
            is_failure=True,
            build_history_on_main=build_history_on_main,
            unknown_errors=[],
            known_errors=[],
            test_cmd=test_cmd,
            test_desc=test_desc,
            ignore_failure=False,
        )
        add_annotation_raw(style="error", markdown=annotation.to_markdown())
        store_annotation_in_test_analytics(test_analytics, annotation)

    store_known_issues_in_test_analytics(test_analytics, known_issues)
    return (len(unknown_errors), ignore_failure)
  662. def get_errors(log_file_names: list[str]) -> list[ErrorLog | JunitError | Secret]:
  663. error_logs = []
  664. for log_file_name in log_file_names:
  665. if "junit_" in log_file_name:
  666. error_logs.extend(_get_errors_from_junit_file(log_file_name))
  667. elif log_file_name == "trufflehog.log":
  668. error_logs.extend(_get_errors_from_trufflehog(log_file_name))
  669. else:
  670. error_logs.extend(_get_errors_from_log_file(log_file_name))
  671. return error_logs
  672. def _get_errors_from_junit_file(log_file_name: str) -> list[JunitError]:
  673. error_logs = []
  674. try:
  675. xml = JUnitXml.fromfile(log_file_name)
  676. except ParseError as e:
  677. # Ignore empty files
  678. if "no element found: line 1, column 0" in str(e):
  679. return error_logs
  680. else:
  681. raise
  682. for suite in xml:
  683. for testcase in suite:
  684. for result in testcase.result:
  685. if not isinstance(result, Error) and not isinstance(result, Failure):
  686. continue
  687. error_logs.append(
  688. JunitError(
  689. testcase.classname,
  690. testcase.name,
  691. (result.message or "").replace("&#10;", "\n"),
  692. result.text or "",
  693. )
  694. )
  695. return error_logs
  696. def _get_errors_from_trufflehog(log_file_name: str) -> list[Secret]:
  697. error_logs = []
  698. with open(log_file_name) as f:
  699. for line in f:
  700. data = json.loads(line)
  701. # Only report each secret once
  702. for error_log in error_logs:
  703. if error_log.secret == data["Raw"]:
  704. break
  705. else:
  706. error_logs.append(
  707. Secret(
  708. data["Raw"],
  709. data["SourceMetadata"]["Data"]["Filesystem"]["file"],
  710. data["SourceMetadata"]["Data"]["Filesystem"]["line"],
  711. data["DetectorName"],
  712. )
  713. )
  714. return error_logs
  715. def _get_errors_from_log_file(log_file_name: str) -> list[ErrorLog]:
  716. error_logs = []
  717. with open(log_file_name, "r+") as f:
  718. try:
  719. data: Any = mmap.mmap(f.fileno(), 0)
  720. except ValueError:
  721. # empty file, ignore
  722. return error_logs
  723. error_logs.extend(_collect_errors_in_logs(data, log_file_name))
  724. data.seek(0)
  725. error_logs.extend(_collect_service_panics_in_logs(data, log_file_name))
  726. return error_logs
  727. def _collect_errors_in_logs(data: Any, log_file_name: str) -> list[ErrorLog]:
  728. collected_errors = []
  729. for match in ERROR_RE.finditer(data):
  730. error = match.group(0)
  731. if IGNORE_RE.search(error):
  732. continue
  733. label = os.getenv("BUILDKITE_LABEL")
  734. if (
  735. label
  736. and label.startswith("Product limits (finding new limits) ")
  737. and PRODUCT_LIMITS_FIND_IGNORE_RE.search(error)
  738. ):
  739. continue
  740. # environmentd segfaults during normal shutdown in coverage builds, see database-issues#5980
  741. # Ignoring this in regular ways would still be quite spammy.
  742. if (
  743. b"environmentd" in error
  744. and b"segfault at" in error
  745. and ui.env_is_truthy("CI_COVERAGE_ENABLED")
  746. ):
  747. continue
  748. if log_file_name == "journalctl-merge.log":
  749. if journalctl_match := JOURNALCTL_LOG_LINE_RE.match(error):
  750. error = journalctl_match.group("msg")
  751. collected_errors.append(ErrorLog(error, log_file_name))
  752. return collected_errors
def _collect_service_panics_in_logs(data: Any, log_file_name: str) -> list[ErrorLog]:
    """Pair panic-start lines with the follow-up service log line carrying the
    panic message, returning one ErrorLog per completed panic.

    A panic is "open" from the line that starts it until the next log line of
    the same service; that next line is taken to contain the panic message.
    Raises AssertionError if panics of the same service interleave or a panic
    is never followed by another line of its service.
    """
    collected_panics = []
    # Maps service name -> the line that opened a still-unfinished panic.
    open_panics = {}
    for line in iter(data.readline, b""):
        # Don't try to match regexes on HUGE lines, since it can take too long
        line = line.rstrip(b"\n")[:8192]
        if match := PANIC_IN_SERVICE_START_RE.match(line):
            service = match.group("service")
            assert (
                service not in open_panics
            ), f"Two panics of same service {service} interleaving: {line}"
            open_panics[service] = line
        elif open_panics:
            if match := SERVICES_LOG_LINE_RE.match(line):
                # Handling every services.log line here, filter to
                # handle only the ones which are currently in a panic
                # handler:
                if panic_start := open_panics.get(match.group("service")):
                    # Strip timestamps so otherwise-identical panics do not
                    # look distinct when reported.
                    panic_without_ts = TIMESTAMP_IN_PANIC_RE.sub(b"", panic_start)
                    del open_panics[match.group("service")]
                    # The ignore list applies to the message line as well.
                    if IGNORE_RE.search(match.group(0)):
                        continue
                    collected_panics.append(
                        ErrorLog(
                            panic_without_ts + b" " + match.group("msg"), log_file_name
                        )
                    )
    # Every panic must have been closed by a follow-up line of its service.
    assert not open_panics, f"Panic log never finished: {open_panics}"
    return collected_panics
  782. def crop_text(text: str | None, max_length: int = 10_000) -> str:
  783. if text is None:
  784. return ""
  785. if len(text) > max_length:
  786. text = text[:max_length] + " [...]"
  787. return text
  788. def sanitize_text(text: str, max_length: int = 4_000) -> str:
  789. text = crop_text(text, max_length)
  790. text = text.replace("```", r"\`\`\`")
  791. return text
  792. def get_failures_on_main(test_analytics: TestAnalyticsDb) -> BuildHistory:
  793. pipeline_slug = os.getenv("BUILDKITE_PIPELINE_SLUG")
  794. step_key = os.getenv("BUILDKITE_STEP_KEY")
  795. parallel_job = os.getenv("BUILDKITE_PARALLEL_JOB")
  796. assert pipeline_slug is not None
  797. assert step_key is not None
  798. if parallel_job is not None:
  799. parallel_job = int(parallel_job)
  800. try:
  801. build_history = (
  802. test_analytics.build_history.get_recent_build_job_failures_on_main(
  803. pipeline=pipeline_slug,
  804. step_key=step_key,
  805. parallel_job_index=parallel_job,
  806. )
  807. )
  808. if len(build_history.last_build_step_outcomes) < 5:
  809. print(
  810. f"Loading build history from test analytics did not provide enough data ({len(build_history.last_build_step_outcomes)} entries)"
  811. )
  812. else:
  813. return build_history
  814. except Exception as e:
  815. test_analytics.on_data_retrieval_failed(e)
  816. print("Loading build history from buildkite instead")
  817. return _get_failures_on_main_from_buildkite(
  818. pipeline_slug=pipeline_slug, step_key=step_key, parallel_job=parallel_job
  819. )
def _get_failures_on_main_from_buildkite(
    pipeline_slug: str, step_key: str, parallel_job: int | None
) -> BuildHistory:
    """Fetch recent main-branch outcomes of the matching build step via the
    Buildkite API, excluding the current build.

    Returns a BuildHistory with empty outcomes when no builds or no matching
    completed build steps are found.
    """
    # NOTE(review): step_name is only asserted, never used afterwards.
    step_name = os.getenv("BUILDKITE_LABEL") or step_key
    current_build_number = os.getenv("BUILDKITE_BUILD_NUMBER")
    assert step_name is not None
    assert current_build_number is not None
    # This is only supposed to be invoked when the build step failed.
    builds_data = builds_api.get_builds(
        pipeline_slug=pipeline_slug,
        max_fetches=1,
        branch="main",
        # also include builds that are still running (the relevant build step may already have completed)
        build_states=["running", "passed", "failing", "failed"],
        # assume and account that at most one build is still running
        items_per_page=5 + 1,
    )
    no_entries_result = BuildHistory(
        pipeline=pipeline_slug, branch="main", last_build_step_outcomes=[]
    )
    if not builds_data:
        print(f"Got no finished builds of pipeline {pipeline_slug}")
        return no_entries_result
    else:
        print(f"Fetched {len(builds_data)} builds of pipeline {pipeline_slug}")
    build_step_matcher = BuildStepMatcher(step_key, parallel_job)
    last_build_step_outcomes = extract_build_step_outcomes(
        builds_data,
        selected_build_steps=[build_step_matcher],
        build_step_states=BUILDKITE_RELEVANT_COMPLETED_BUILD_STEP_STATES,
    )
    # remove the current build
    # NOTE(review): current_build_number is a string from the environment;
    # confirm outcome.build_number is also a string, otherwise this filter
    # never excludes anything.
    last_build_step_outcomes = [
        outcome
        for outcome in last_build_step_outcomes
        if outcome.build_number != current_build_number
    ]
    if len(last_build_step_outcomes) > 8:
        # the number of build steps might be higher than the number of requested builds due to retries
        last_build_step_outcomes = last_build_step_outcomes[:8]
    if not last_build_step_outcomes:
        print(
            f"The {len(builds_data)} last fetched builds do not contain a completed build step matching {build_step_matcher}"
        )
        return no_entries_result
    return BuildHistory(
        pipeline=pipeline_slug,
        branch="main",
        last_build_step_outcomes=[
            BuildHistoryEntry(entry.web_url_to_job, entry.passed)
            for entry in last_build_step_outcomes
        ],
    )
  873. def get_job_state() -> str:
  874. pipeline_slug = os.getenv("BUILDKITE_PIPELINE_SLUG")
  875. build_number = os.getenv("BUILDKITE_BUILD_NUMBER")
  876. job_id = os.getenv("BUILDKITE_JOB_ID")
  877. url = f"organizations/materialize/pipelines/{pipeline_slug}/builds/{build_number}"
  878. build = generic_api.get(url, {})
  879. for job in build["jobs"]:
  880. if job["id"] == job_id:
  881. return job["state"]
  882. raise ValueError("Job not found")
  883. def get_suite_name(include_retry_info: bool = True) -> str:
  884. suite_name = os.getenv("BUILDKITE_LABEL", "Unknown Test")
  885. if include_retry_info:
  886. retry_count = get_retry_count()
  887. if retry_count > 0:
  888. suite_name += f" (#{retry_count + 1})"
  889. return suite_name
  890. def get_retry_count() -> int:
  891. return int(os.getenv("BUILDKITE_RETRY_COUNT", "0"))
  892. def format_message_as_code_block(
  893. error_message: str | None, max_length: int = 10_000
  894. ) -> str:
  895. if not error_message:
  896. return ""
  897. # Don't have too huge output, so truncate
  898. return f"```\n{sanitize_text(error_message, max_length)}\n```"
  899. def store_known_issues_in_test_analytics(
  900. test_analytics: TestAnalyticsDb, known_issues: list[KnownGitHubIssue]
  901. ) -> None:
  902. if os.getenv("BUILDKITE_PIPELINE_SLUG") == "test":
  903. # Too slow, run only on slower pipelines. We don't need the updates to be immediate anyway.
  904. return
  905. for issue in known_issues:
  906. test_analytics.known_issues.add_or_update_issue(issue)
  907. def store_annotation_in_test_analytics(
  908. test_analytics: TestAnalyticsDb, annotation: Annotation
  909. ) -> None:
  910. # the build step was already inserted before
  911. # the buildkite status may have been successful but the build may still fail due to unknown errors in the log
  912. test_analytics.builds.update_build_job_success(
  913. was_successful=not annotation.is_failure
  914. )
  915. error_entries = [
  916. AnnotationErrorEntry(
  917. error_type=error.internal_error_type,
  918. message=error.to_text(),
  919. issue=(
  920. f"database-issues/{error.issue_number}"
  921. if isinstance(error, WithIssue)
  922. else None
  923. ),
  924. occurrence_count=error.occurrences,
  925. )
  926. for error in chain(annotation.known_errors, annotation.unknown_errors)
  927. ]
  928. test_analytics.build_annotations.add_annotation(
  929. build_annotation_storage.AnnotationEntry(
  930. test_suite=get_suite_name(include_retry_info=False),
  931. test_retry_count=get_retry_count(),
  932. is_failure=annotation.is_failure,
  933. errors=error_entries,
  934. ),
  935. )
# Entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())