pg_result_comparator.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. # Copyright Materialize, Inc. and contributors. All rights reserved.
  2. #
  3. # Use of this software is governed by the Business Source License
  4. # included in the LICENSE file at the root of this repository.
  5. #
  6. # As of the Change Date specified in that file, in accordance with
  7. # the Business Source License, use of this software will be governed
  8. # by the Apache License, Version 2.0.
  9. import math
  10. import re
  11. from decimal import Decimal
  12. from functools import partial
  13. from typing import Any
  14. from materialize.output_consistency.expression.expression import Expression
  15. from materialize.output_consistency.ignore_filter.expression_matchers import (
  16. is_operation_tagged,
  17. is_table_function,
  18. )
  19. from materialize.output_consistency.ignore_filter.inconsistency_ignore_filter import (
  20. GenericInconsistencyIgnoreFilter,
  21. )
  22. from materialize.output_consistency.input_data.operations.jsonb_operations_provider import (
  23. TAG_JSONB_OBJECT_GENERATION,
  24. )
  25. from materialize.output_consistency.query.query_result import QueryExecution
  26. from materialize.output_consistency.validation.error_message_normalizer import (
  27. ErrorMessageNormalizer,
  28. )
  29. from materialize.output_consistency.validation.result_comparator import ResultComparator
  30. # Examples:
  31. # * 2038-01-19 03:14:18
  32. # * 2038-01-19 03:14:18.123
  33. # * 2038-01-19 03:14:18.123+00
  34. # * 2038-01-19 03:14:18.123+00 BC
  35. # * 2038-01-19 03:14:18+00
  36. # * 2038-01-19 03:14:18-03:00
  37. # * 2038-01-19T03:14:18+00 (when used in JSONB)
  38. TIMESTAMP_PATTERN = re.compile(
  39. r"^\d{4,}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\.\d+)?([+-]\d+(:\d+)?)?( (BC|AC))?$"
  40. )
  41. # Examples:
  42. # * NaN
  43. # * 1
  44. # * -1.23
  45. # * 1.23e-3
  46. # * 1.23e+3
  47. DECIMAL_PATTERN = re.compile(r"^NaN|[+-]?\d+(\.\d+)?(e[+-]?\d+)?$")
  48. # Examples:
  49. # * ["1","2"]
  50. # * [1,2]
  51. # * [1, 2]
  52. # * [1, [1, 2]]
  53. SIMPLIFIED_ARRAY_PATTERN = re.compile(r"\[(.*),(.*)\]")
  54. # Examples:
  55. # * {"a": 1, "c": 3}
  56. JSON_OBJECT_PATTERN = re.compile(r"\{(.*)[:,](.*)\}")
  57. class PostgresResultComparator(ResultComparator):
  58. """Compares the outcome (result or failure) of multiple query executions"""
  59. def __init__(
  60. self,
  61. ignore_filter: GenericInconsistencyIgnoreFilter,
  62. error_message_normalizer: ErrorMessageNormalizer,
  63. ):
  64. super().__init__(ignore_filter, error_message_normalizer)
  65. self.floating_precision = 1e-03
  66. def shall_validate_error_message(self, query_execution: QueryExecution) -> bool:
  67. # do not compare error messages at all
  68. return False
  69. def is_value_equal(
  70. self,
  71. value1: Any,
  72. value2: Any,
  73. expression: Expression,
  74. is_tolerant: bool = False,
  75. ) -> bool:
  76. if super().is_value_equal(value1, value2, expression, is_tolerant=is_tolerant):
  77. return True
  78. if isinstance(value1, Decimal):
  79. if isinstance(value2, Decimal):
  80. return self.is_decimal_equal(value1, value2)
  81. if isinstance(value2, float):
  82. return self.is_decimal_equal(value1, Decimal(value2))
  83. if isinstance(value1, float):
  84. if isinstance(value2, float):
  85. return self.is_float_equal(value1, value2)
  86. if isinstance(value2, Decimal):
  87. return self.is_decimal_equal(Decimal(value1), value2)
  88. if isinstance(value1, str) and isinstance(value2, str):
  89. return self.is_str_equal(value1, value2, is_tolerant)
  90. if is_tolerant:
  91. type1 = type(value1)
  92. type2 = type(value2)
  93. if type1 in {int, float, Decimal} and type2 in {int, float, Decimal}:
  94. # This is needed for imprecise results of floating type operations that are returned as int or float
  95. # values in dicts of JSONB data.
  96. return self.is_decimal_equal(Decimal(value1), Decimal(value2))
  97. return False
  98. def is_decimal_equal(self, value1: Decimal, value2: Decimal) -> bool:
  99. if value1.is_nan():
  100. return value2.is_nan()
  101. return math.isclose(value1, value2, rel_tol=self.floating_precision)
  102. def is_float_equal(self, value1: float, value2: float) -> bool:
  103. if math.isnan(value1):
  104. return math.isnan(value2)
  105. return math.isclose(value1, value2, rel_tol=self.floating_precision)
  106. def is_str_equal(self, value1: str, value2: str, is_tolerant: bool) -> bool:
  107. if self.is_timestamp(value1) and self.is_timestamp(value2):
  108. return self.is_timestamp_equal(value1, value2)
  109. if (
  110. SIMPLIFIED_ARRAY_PATTERN.search(value1)
  111. or JSON_OBJECT_PATTERN.search(value1)
  112. ) and (
  113. SIMPLIFIED_ARRAY_PATTERN.search(value2)
  114. or JSON_OBJECT_PATTERN.search(value2)
  115. ):
  116. # This is a rather eager pattern to also match concatenated strings.
  117. # tracked with database-issues#7085
  118. value1 = value1.replace(", ", ",").replace(": ", ":")
  119. value2 = value2.replace(", ", ",").replace(": ", ":")
  120. # Postgres uses 'mon' / 'mons' instead of 'month' / 'months'
  121. value1 = value1.replace(" month", " mon")
  122. value2 = value2.replace(" month", " mon")
  123. if is_tolerant and self.is_decimal(value1) and self.is_decimal(value2):
  124. try:
  125. return self.is_decimal_equal(Decimal(value1), Decimal(value2))
  126. except Exception:
  127. return True
  128. return value1 == value2
  129. def is_decimal(self, value: str):
  130. return DECIMAL_PATTERN.match(value) is not None
  131. def is_timestamp(self, value: str) -> bool:
  132. return TIMESTAMP_PATTERN.match(value) is not None
  133. def is_timestamp_equal(self, value1: str, value2: str) -> bool:
  134. # try to match any of these
  135. last_second_and_milliseconds_regex = r"(\d\.\d+)"
  136. last_second_before_timezone_regex = r"(?<=:\d)(\d)(?=\+)"
  137. last_second_at_the_end_regex = r"(?<=:\d)(\d$)"
  138. last_second_and_milliseconds_pattern = re.compile(
  139. f"{last_second_and_milliseconds_regex}|{last_second_before_timezone_regex}|{last_second_at_the_end_regex}"
  140. )
  141. if last_second_and_milliseconds_pattern.search(
  142. value1
  143. ) and last_second_and_milliseconds_pattern.search(value2):
  144. # drop milliseconds and, if present, trunc last digit of second
  145. value1 = last_second_and_milliseconds_pattern.sub("0", value1)
  146. value2 = last_second_and_milliseconds_pattern.sub("0", value2)
  147. value1 = self._normalize_jsonb_timestamp(value1)
  148. value2 = self._normalize_jsonb_timestamp(value2)
  149. assert self.is_timestamp(value1)
  150. assert self.is_timestamp(value2)
  151. return value1 == value2
  152. def _normalize_jsonb_timestamp(self, value: str) -> str:
  153. # this is due to database-issues#8247
  154. pattern_for_date = r"\d+-\d+-\d+"
  155. pattern_for_time = r"\d+:\d+:\d+[+-]\d+"
  156. pattern_for_value_without_t_sep_and_timezone_mins = (
  157. rf"({pattern_for_date})T({pattern_for_time}):\d+"
  158. )
  159. match = re.match(pattern_for_value_without_t_sep_and_timezone_mins, value)
  160. if match is None:
  161. return value
  162. return match.group(1) + " " + match.group(2)
  163. def ignore_row_order(self, expression: Expression) -> bool:
  164. if expression.matches(
  165. is_table_function,
  166. True,
  167. ):
  168. # inconsistent sort order
  169. return True
  170. return False
  171. def ignore_order_when_comparing_collection(self, expression: Expression) -> bool:
  172. if expression.matches(
  173. partial(is_operation_tagged, tag=TAG_JSONB_OBJECT_GENERATION),
  174. True,
  175. ):
  176. # this is because of database-issues#8266
  177. return True
  178. return False