benchmark_result_evaluator.py

# Copyright Materialize, Inc. and contributors. All rights reserved.
#
# Use of this software is governed by the Business Source License
# included in the LICENSE file at the root of this repository.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0.

from __future__ import annotations

from typing import Generic, TypeVar

from materialize.feature_benchmark.benchmark_result import BenchmarkScenarioMetric
from materialize.feature_benchmark.measurement import MeasurementType
from materialize.feature_benchmark.scenario import Scenario
from materialize.terminal import (
    COLOR_BAD,
    COLOR_GOOD,
    with_conditional_formatting,
)

T = TypeVar("T")


class BenchmarkResultEvaluator(Generic[T]):
    """Abstract interface for evaluating a BenchmarkScenarioMetric.

    Subclasses provide the comparison ratio, the regression checks, and a
    human-readable rendering of the result.
    """

    def ratio(self, metric: BenchmarkScenarioMetric) -> float | None:
        raise RuntimeError

    def is_regression(
        self, metric: BenchmarkScenarioMetric, threshold: float | None = None
    ) -> bool:
        raise RuntimeError

    def is_strong_regression(self, metric: BenchmarkScenarioMetric) -> bool:
        raise RuntimeError

    def human_readable(self, metric: BenchmarkScenarioMetric, use_colors: bool) -> str:
        raise RuntimeError
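

# Illustration only (not part of the original module): a concrete evaluator
# subclasses BenchmarkResultEvaluator and overrides all four methods, e.g. a
# hypothetical evaluator with its own notion of thresholds:
#
#   class FixedBudgetEvaluator(BenchmarkResultEvaluator[float | None]):  # hypothetical name
#       def ratio(self, metric: BenchmarkScenarioMetric) -> float | None:
#           ...
#
# RelativeThresholdEvaluator below is the implementation this file actually ships.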


class RelativeThresholdEvaluator(BenchmarkResultEvaluator[float | None]):
    def __init__(self, scenario_class: type[Scenario]) -> None:
        self.threshold_by_measurement_type: dict[MeasurementType, float] = (
            scenario_class.RELATIVE_THRESHOLD
        )

    def get_threshold(self, metric: BenchmarkScenarioMetric) -> float:
        return self.threshold_by_measurement_type[metric.measurement_type]

    def ratio(self, metric: BenchmarkScenarioMetric) -> float | None:
        if metric._points[0] is None or metric._points[1] is None:
            return None
        else:
            return metric._points[0] / metric._points[1]
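
    # Worked example of the ratio: assuming _points holds the two measurements
    # being compared (presumably the current build first and the baseline
    # second), a wallclock pair of 1.3s vs. 1.0s yields ratio = 1.3, i.e. the
    # current build is 30% slower, while 0.8s vs. 1.0s yields ratio = 0.8,
    # i.e. 20% faster. A missing point on either side makes the ratio None.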

    def is_regression(
        self, metric: BenchmarkScenarioMetric, threshold: float | None = None
    ) -> bool:
        if threshold is None:
            threshold = self.get_threshold(metric)

        ratio = self.ratio(metric)

        if ratio is None:
            return False
        if ratio > 1:
            return ratio - 1 > threshold
        else:
            return False

    def is_strong_regression(self, metric: BenchmarkScenarioMetric) -> bool:
        return self.is_regression(
            metric,
            threshold=self.get_threshold(metric) * 2,
        )
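
    # Worked example of the regression checks: with a RELATIVE_THRESHOLD of
    # 0.10 for a measurement type (an assumed value for illustration), a ratio
    # of 1.05 is within tolerance (0.05 <= 0.10), 1.15 is a regression
    # (0.15 > 0.10), and 1.25 is additionally a strong regression because it
    # also exceeds the doubled threshold (0.25 > 0.20). Ratios at or below 1
    # are never flagged.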

    def human_readable(self, metric: BenchmarkScenarioMetric, use_colors: bool) -> str:
        assert metric.measurement_type.is_lower_value_better(), "unexpected metric"

        if metric.measurement_type.is_amount():
            improvement = "less"
            deterioration = "more"
        else:
            improvement = "faster"
            deterioration = "slower"

        ratio = self.ratio(metric)

        # Bucket the ratio into a verdict: ratios >= 2 or <= 0.5 are reported
        # as multiples, everything in between as a percentage, and exactly 1
        # as "the same".
        if ratio is None:
            return "not comparable"
        if ratio >= 2:
            return with_conditional_formatting(
                f"worse: {ratio:4.1f} TIMES {deterioration}",
                COLOR_BAD,
                condition=use_colors,
            )
        elif ratio > 1:
            return with_conditional_formatting(
                f"worse: {-(1-ratio)*100:4.1f}% {deterioration}",
                COLOR_BAD,
                condition=use_colors,
            )
        elif ratio == 1:
            return "the same"
        elif ratio > 0.5:
            return with_conditional_formatting(
                f"better: {(1-ratio)*100:4.1f}% {improvement}",
                COLOR_GOOD,
                condition=use_colors,
            )
        else:
            return with_conditional_formatting(
                f"better: {(1/ratio):4.1f} times {improvement}",
                COLOR_GOOD,
                condition=use_colors,
            )
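

# Hedged usage sketch (not part of the original module): given a Scenario
# subclass that defines RELATIVE_THRESHOLD (here called MyScenario, a
# hypothetical name) and a populated BenchmarkScenarioMetric `metric`, a
# caller would do something like:
#
#   evaluator = RelativeThresholdEvaluator(MyScenario)
#   if evaluator.is_regression(metric):
#       print(evaluator.human_readable(metric, use_colors=True))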