ci_closed_issues_detect.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. # Copyright Materialize, Inc. and contributors. All rights reserved.
  2. #
  3. # Use of this software is governed by the Business Source License
  4. # included in the LICENSE file at the root of this repository.
  5. #
  6. # As of the Change Date specified in that file, in accordance with
  7. # the Business Source License, use of this software will be governed
  8. # by the Apache License, Version 2.0.
  9. #
  10. # ci_closed_issues_detect.py - Detect references to already closed issues.
  11. import argparse
  12. import os
  13. import re
  14. import sys
  15. from collections.abc import Iterator
  16. from dataclasses import dataclass
  17. from typing import IO
  18. import requests
  19. from materialize import buildkite, spawn
# Matches a single issue reference inside a chunk of text. Each alternative
# captures the issue number in its own named group; the group names are the
# keys of GROUP_REPO. Exactly one named group is expected to be set per match.
# The pattern is compiled with re.VERBOSE, so whitespace and `#` comments
# inside the literal are not part of the pattern (literal `#` is escaped).
ISSUE_RE = re.compile(
    r"""
    ( TimelyDataflow/timely-dataflow\#(?P<timelydataflow>[0-9]+)
    | ( materialize\# | materialize/issues/ ) (?P<materialize>[0-9]+)
    | ( cloud\# | cloud/issues/ ) (?P<cloud>[0-9]+)
    | ( incidents-and-escalations\# | incidents-and-escalations/issues/ ) (?P<incidentsandescalations>[0-9]+)
    | ( database-issues\# | database-issues/issues/ ) (?P<databaseissues>[0-9]+)
    # only match from the beginning of the line or after a space character to avoid matching Buildkite URLs
    | (^|\s) \# (?P<ambiguous>[0-9]+)
    )
    """,
    re.VERBOSE,
)
# Maps each named group of ISSUE_RE (and FILENAME_REFERENCE_RE) to the GitHub
# "owner/repo" the captured issue number belongs to. A bare "#123" reference
# carries no repository, hence None for the "ambiguous" group.
GROUP_REPO: dict[str, str | None] = {
    "timelydataflow": "TimelyDataflow/timely-dataflow",
    "materialize": "MaterializeInc/materialize",
    "cloud": "MaterializeInc/cloud",
    "incidentsandescalations": "MaterializeInc/incidents-and-escalations",
    "databaseissues": "MaterializeInc/database-issues",
    "ambiguous": None,
}
  41. REFERENCE_RE = re.compile(
  42. r"""
  43. ( reenable
  44. | re-enable
  45. | reconsider
  46. | TODO
  47. # Used in Buildkite pipeline config files
  48. | skip:
  49. # Used in platform-checks
  50. | @disabled
  51. # Used in pytest
  52. | @pytest.mark.skip
  53. # Used in output-consistency framework
  54. | YesIgnore
  55. | tracked\ with
  56. # Used in proto files
  57. | //\ buf\ breaking:\ ignore
  58. # Used in documentation
  59. | in\ the\ future
  60. )
  61. """,
  62. re.VERBOSE | re.IGNORECASE,
  63. )
  64. IGNORE_RE = re.compile(
  65. r"""
  66. ( discussion\ of\ this\ in
  67. | discussed\ in
  68. | [sS]ee\ \<
  69. # is_null_propagation.slt
  70. | isnull\(\#0\)
  71. # src/transform/tests/test_transforms/column_knowledge.spec
  72. | \(\#1\)\ IS\ NULL
  73. # test/sqllogictest/cockroach/*.slt
  74. | cockroach\#
  75. | Liquibase
  76. # ci/test/lint-buf/README.md
  77. | Ignore\ because\ of\ database-issues#99999
  78. # src/storage-client/src/controller.rs
  79. | issues/20211\>
  80. # src/sql/src/plan/statement.rs
  81. | issues/20019\>
  82. # src/storage/src/storage_state.rs
  83. | \#19907$
  84. )
  85. """,
  86. re.VERBOSE | re.IGNORECASE,
  87. )
# Comment markers recognized by comment_blocks(): `#` or `//`, searched for
# anywhere in a line.
COMMENT_RE = re.compile(r"#|//")
# Files whose path matches this pattern (applied with `.match`, i.e. anchored
# at the start of the path only) are not scanned for issue references:
# a set of binary/image extensions plus one explicitly excluded design doc.
IGNORE_FILENAME_RE = re.compile(
    r"""
    ( .*\.(svg|png|jpg|jpeg|avro|ico)
    | doc/developer/design/20230223_stabilize_with_mutually_recursive.md
    )
    """,
    re.VERBOSE,
)
# Matches file names that embed an issue number as `.<td|slt|test>.gh<N>`.
# The number is captured in the "ambiguous" group because the file name does
# not say which repository the issue belongs to (see GROUP_REPO).
FILENAME_REFERENCE_RE = re.compile(r".*\.(td|slt|test)\.gh(?P<ambiguous>[0-9]+)")
  98. @dataclass
  99. class IssueRef:
  100. repository: str | None
  101. issue_id: int
  102. filename: str
  103. line_number: int
  104. text: str | None
  105. @dataclass
  106. class CommentBlock:
  107. char: str
  108. pos: int
  109. text: str
  110. line_number: int
  111. def comment_blocks(file: IO) -> Iterator[tuple[int, str]]:
  112. comment: CommentBlock | None = None
  113. for line_number, line in enumerate(file):
  114. if comment_match := COMMENT_RE.search(line):
  115. char = comment_match.group(0)
  116. pos = comment_match.span()[0]
  117. if comment is None:
  118. comment = CommentBlock(char, pos, line, line_number + 1)
  119. continue
  120. if char == comment.char and pos == comment.pos:
  121. comment.text += line
  122. continue
  123. yield (comment.line_number, comment.text)
  124. comment = CommentBlock(char, pos, line, line_number + 1)
  125. continue
  126. if comment is not None:
  127. yield (comment.line_number, comment.text)
  128. comment = None
  129. yield (line_number + 1, line)
  130. if comment is not None:
  131. yield (comment.line_number, comment.text)
  132. def detect_referenced_issues(filename: str) -> list[IssueRef]:
  133. issue_refs: list[IssueRef] = []
  134. with open(filename) as file:
  135. for line_number, text in comment_blocks(file):
  136. if not REFERENCE_RE.search(text) or IGNORE_RE.search(text):
  137. continue
  138. offset = 0
  139. while issue_match := ISSUE_RE.search(text, offset):
  140. offset = issue_match.span()[1]
  141. groups = [
  142. (key, value)
  143. for key, value in issue_match.groupdict().items()
  144. if value
  145. ]
  146. assert len(groups) == 1, f"Expected only 1 element in {groups}"
  147. group, issue_id = groups[0]
  148. is_referenced_with_url = "issues/" in issue_match.group(0)
  149. # Explain plans can look like issue references
  150. if (
  151. group == "ambiguous"
  152. and int(issue_id) < 100
  153. and not is_referenced_with_url
  154. ):
  155. continue
  156. issue_refs.append(
  157. IssueRef(
  158. GROUP_REPO[group],
  159. int(issue_id),
  160. filename,
  161. line_number,
  162. text.strip(),
  163. )
  164. )
  165. return issue_refs
  166. def is_issue_closed_on_github(repository: str | None, issue_id: int) -> bool:
  167. assert repository
  168. headers = {
  169. "Accept": "application/vnd.github+json",
  170. "X-GitHub-Api-Version": "2022-11-28",
  171. }
  172. if token := os.getenv("GITHUB_CI_ISSUE_REFERENCE_CHECKER_TOKEN") or os.getenv(
  173. "GITHUB_TOKEN"
  174. ):
  175. headers["Authorization"] = f"Bearer {token}"
  176. url = f"https://api.github.com/repos/{repository}/issues/{issue_id}"
  177. response = requests.get(url, headers=headers)
  178. if response.status_code == 404 and not os.getenv("CI"):
  179. print(
  180. f"Can't check issue #{issue_id} in {repository} repo, set GITHUB_TOKEN environment variable or run this check in CI"
  181. )
  182. return False
  183. if response.status_code != 200:
  184. raise ValueError(
  185. f"Bad return code from GitHub on {url}: {response.status_code}: {response.text}"
  186. )
  187. issue_json = response.json()
  188. # We can't check the issue number anymore because issues can have moved
  189. return issue_json["state"] == "closed"
  190. def filter_changed_lines(issue_refs: list[IssueRef]) -> list[IssueRef]:
  191. changed_lines = buildkite.find_modified_lines()
  192. return [
  193. issue_ref
  194. for issue_ref in issue_refs
  195. if issue_ref.text is not None
  196. and any(
  197. (issue_ref.filename, issue_ref.line_number + i) in changed_lines
  198. for i in range(issue_ref.text.count("\n"))
  199. )
  200. ]
  201. def filter_ambiguous_issues(
  202. issue_refs: list[IssueRef],
  203. ) -> tuple[list[IssueRef], list[IssueRef]]:
  204. return [issue_ref for issue_ref in issue_refs if issue_ref.repository], [
  205. issue_ref for issue_ref in issue_refs if not issue_ref.repository
  206. ]
  207. def filter_closed_issues(issue_refs: list[IssueRef]) -> list[IssueRef]:
  208. issues = {(issue_ref.repository, issue_ref.issue_id) for issue_ref in issue_refs}
  209. closed_issues = {
  210. (repository, issue)
  211. for repository, issue in issues
  212. if is_issue_closed_on_github(repository, issue)
  213. }
  214. return [
  215. issue_ref
  216. for issue_ref in issue_refs
  217. if (issue_ref.repository, issue_ref.issue_id) in closed_issues
  218. ]
  219. def main() -> int:
  220. parser = argparse.ArgumentParser(
  221. prog="ci-closed-issues-detect",
  222. formatter_class=argparse.RawDescriptionHelpFormatter,
  223. description="ci-closed-issues-detect detects references to already closed GitHub issues.",
  224. )
  225. parser.add_argument(
  226. "--changed-lines-only",
  227. action="store_true",
  228. help="only report issues in changed files/lines",
  229. )
  230. args = parser.parse_args()
  231. filenames = spawn.capture(
  232. ["git", "ls-tree", "--full-tree", "-r", "--name-only", "HEAD"]
  233. )
  234. issue_refs: list[IssueRef] = []
  235. for filename in filenames.splitlines():
  236. if issue_match := FILENAME_REFERENCE_RE.search(filename):
  237. groups = [
  238. (key, value) for key, value in issue_match.groupdict().items() if value
  239. ]
  240. assert len(groups) == 1, f"Expected only 1 element in {groups}"
  241. group, issue_id = groups[0]
  242. issue_refs.append(
  243. IssueRef(
  244. GROUP_REPO[group],
  245. int(issue_id),
  246. filename,
  247. 0,
  248. None,
  249. )
  250. )
  251. # Files without any ending can be interesting datadriven test files
  252. if (
  253. not IGNORE_FILENAME_RE.match(filename)
  254. and not os.path.isdir(filename)
  255. and not os.path.islink(filename)
  256. ):
  257. issue_refs.extend(detect_referenced_issues(filename))
  258. issue_refs, ambiguous_refs = filter_ambiguous_issues(issue_refs)
  259. if args.changed_lines_only:
  260. issue_refs = filter_changed_lines(issue_refs)
  261. ambiguous_refs = filter_changed_lines(ambiguous_refs)
  262. issue_refs = filter_closed_issues(issue_refs)
  263. for issue_ref in ambiguous_refs:
  264. print(f"--- Ambiguous issue reference: #{issue_ref.issue_id}")
  265. if issue_ref.text is not None:
  266. print(f"{issue_ref.filename}:{issue_ref.line_number}:")
  267. print(issue_ref.text)
  268. else:
  269. print(f"{issue_ref.filename} (filename)")
  270. print(
  271. f"Use database-issues#{issue_ref.issue_id} or materialize#{issue_ref.issue_id} instead to have an unambiguous reference"
  272. )
  273. for issue_ref in issue_refs:
  274. url = buildkite.inline_link(
  275. f"https://github.com/{issue_ref.repository}/issues/{issue_ref.issue_id}",
  276. f"{issue_ref.repository}#{issue_ref.issue_id}",
  277. )
  278. print(f"--- Issue is referenced in comment but already closed: {url}")
  279. if issue_ref.text is not None:
  280. print(f"{issue_ref.filename}:{issue_ref.line_number}:")
  281. print(issue_ref.text)
  282. else:
  283. print(f"{issue_ref.filename} (filename)")
  284. return 1 if issue_refs + ambiguous_refs else 0
# When run as a script, execute the check and use main()'s return value as
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())