plot.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. # Copyright Materialize, Inc. and contributors. All rights reserved.
  2. #
  3. # Use of this software is governed by the Business Source License
  4. # included in the LICENSE file at the root of this repository.
  5. #
  6. # As of the Change Date specified in that file, in accordance with
  7. # the Business Source License, use of this software will be governed
  8. # by the Apache License, Version 2.0.
  9. import math
  10. from enum import Enum
  11. from typing import Any
  12. import numpy as np
  13. from matplotlib.axes import Axes
  14. from matplotlib.figure import SubFigure
  15. from matplotlib.markers import MarkerStyle
  16. from materialize.scalability.df.df_details import DfDetails
  17. from materialize.scalability.df.df_totals import DfTotals
  18. from materialize.scalability.endpoint.endpoints import endpoint_name_to_description
# Marker styles used by the plots in this module.
# Default marker for scatter series (see _get_plot_marker).
PLOT_MARKER_POINT = MarkerStyle("o")
# Marker used for the baseline endpoint (see _get_plot_marker).
# NOTE(review): matplotlib documents "," as the pixel marker and "s" as the
# square marker; confirm that "," is the intended shape.
PLOT_MARKER_SQUARE = MarkerStyle(",")
# Horizontal line marker; used to draw the median on violin plots.
PLOT_MARKER_HLINE = MarkerStyle("_")
# Color of the median and quartile overlays on violin plots.
PLOT_COLOR_DARK_BLUE = "darkblue"
class DistributionPlotType(Enum):
    """Kind of chart used to draw a distribution of durations."""

    # Dispatched to _plot_violinplot by _plot_distribution.
    VIOLIN = 1
    # Dispatched to _plot_boxplot by _plot_distribution.
    BOX = 2


# Plot type used by the duration plots when the caller does not choose one.
DEFAULT_DISTRIBUTION_PLOT_TYPE = DistributionPlotType.VIOLIN
  27. def plot_tps_per_connections(
  28. workload_name: str,
  29. figure: SubFigure,
  30. df_totals_by_endpoint_name: dict[str, DfTotals],
  31. baseline_version_name: str | None,
  32. include_zero_in_y_axis: bool,
  33. include_workload_in_title: bool = False,
  34. ) -> None:
  35. """This uses a scatter plot to plot the TPS per connections."""
  36. legend = []
  37. plot: Axes = figure.subplots(1, 1)
  38. max_concurrency = 1
  39. for endpoint_version_name, df_totals in df_totals_by_endpoint_name.items():
  40. legend.append(endpoint_name_to_description(endpoint_version_name))
  41. plot.scatter(
  42. df_totals.get_concurrency_values(),
  43. df_totals.get_tps_values(),
  44. label="tps",
  45. marker=_get_plot_marker(endpoint_version_name, baseline_version_name),
  46. )
  47. max_concurrency = max(max_concurrency, df_totals.get_max_concurrency())
  48. plot.set_ylabel("Transactions Per Second (tps)")
  49. plot.set_xlabel("Concurrent SQL Connections")
  50. if include_zero_in_y_axis:
  51. plot.set_ylim(ymin=0)
  52. if include_workload_in_title:
  53. plot.set_title(workload_name)
  54. plot.legend(legend)
  55. def plot_duration_by_connections_for_workload(
  56. workload_name: str,
  57. figure: SubFigure,
  58. df_details_by_endpoint_name: dict[str, DfDetails],
  59. include_zero_in_y_axis: bool,
  60. include_workload_in_title: bool = False,
  61. plot_type: DistributionPlotType = DEFAULT_DISTRIBUTION_PLOT_TYPE,
  62. ) -> None:
  63. """This uses a boxplot or violin plot for the distribution of the duration."""
  64. if len(df_details_by_endpoint_name) == 0:
  65. return
  66. concurrencies = next(
  67. iter(df_details_by_endpoint_name.values())
  68. ).get_unique_concurrency_values()
  69. endpoint_version_names = df_details_by_endpoint_name.keys()
  70. use_short_names = len(endpoint_version_names) > 2
  71. num_rows, num_cols = _compute_plot_grid(len(concurrencies), 3)
  72. subplots = figure.subplots(num_rows, num_cols, sharey=False)
  73. for concurrency_index, concurrency in enumerate(concurrencies):
  74. plot, is_in_first_column, is_in_last_row = _get_subplot_in_grid(
  75. subplots, concurrency_index, num_rows, num_cols
  76. )
  77. legend = []
  78. durations: list[list[float]] = []
  79. for endpoint_version_name, df_details in df_details_by_endpoint_name.items():
  80. df_details_of_concurrency = df_details.to_filtered_by_concurrency(
  81. concurrency
  82. )
  83. if not df_details_of_concurrency.has_values():
  84. continue
  85. durations.append(df_details_of_concurrency.get_wallclock_values())
  86. formatted_endpoint_name = (
  87. endpoint_version_name
  88. if not use_short_names
  89. else _shorten_endpoint_version_name(endpoint_version_name)
  90. )
  91. legend.append(formatted_endpoint_name)
  92. _plot_distribution(plot, data=durations, labels=legend, plot_type=plot_type)
  93. if is_in_first_column:
  94. plot.set_ylabel("Duration (seconds)")
  95. if include_zero_in_y_axis:
  96. plot.set_ylim(ymin=0)
  97. title = f"{concurrency} connections"
  98. if include_workload_in_title:
  99. title = f"{workload_name}, {title}"
  100. plot.set_title(title)
  101. def plot_duration_by_endpoints_for_workload(
  102. workload_name: str,
  103. figure: SubFigure,
  104. df_details_by_endpoint_name: dict[str, DfDetails],
  105. include_zero_in_y_axis: bool,
  106. include_workload_in_title: bool = False,
  107. plot_type: DistributionPlotType = DEFAULT_DISTRIBUTION_PLOT_TYPE,
  108. ) -> None:
  109. """This uses a boxplot or violin plot for the distribution of the duration."""
  110. if len(df_details_by_endpoint_name) == 0:
  111. return
  112. num_rows, num_cols = _compute_plot_grid(len(df_details_by_endpoint_name.keys()), 1)
  113. subplots = figure.subplots(num_rows, num_cols, sharey=False)
  114. for endpoint_index, (endpoint_version_name, df_details) in enumerate(
  115. df_details_by_endpoint_name.items()
  116. ):
  117. plot, is_in_first_column, is_in_last_row = _get_subplot_in_grid(
  118. subplots, endpoint_index, num_rows, num_cols
  119. )
  120. concurrencies = df_details.get_unique_concurrency_values()
  121. legend = []
  122. durations: list[list[float]] = []
  123. for concurrency in concurrencies:
  124. df_details_of_concurrency = df_details.to_filtered_by_concurrency(
  125. concurrency
  126. )
  127. if not df_details_of_concurrency.has_values():
  128. continue
  129. durations.append(df_details_of_concurrency.get_wallclock_values())
  130. legend.append(concurrency)
  131. _plot_distribution(plot, data=durations, labels=legend, plot_type=plot_type)
  132. if is_in_first_column and is_in_last_row:
  133. plot.set_ylabel("Duration (seconds)")
  134. if is_in_last_row:
  135. plot.set_xlabel("Concurrencies")
  136. if include_zero_in_y_axis:
  137. plot.set_ylim(ymin=0)
  138. title = endpoint_version_name
  139. if include_workload_in_title:
  140. title = f"{workload_name}, {title}"
  141. plot.set_title(title)
  142. def _shorten_endpoint_version_name(endpoint_version_name: str) -> str:
  143. if " " not in endpoint_version_name:
  144. return endpoint_version_name
  145. return endpoint_version_name.split(" ")[0]
  146. def _get_plot_marker(
  147. endpoint_version_name: str, baseline_version_name: str | None
  148. ) -> MarkerStyle:
  149. if (
  150. baseline_version_name is not None
  151. and endpoint_version_name == baseline_version_name
  152. ):
  153. return PLOT_MARKER_SQUARE
  154. return PLOT_MARKER_POINT
  155. def _compute_plot_grid(num_subplots: int, max_subplots_per_row: int) -> tuple[int, int]:
  156. num_rows = math.ceil(num_subplots / max_subplots_per_row)
  157. num_cols = math.ceil(num_subplots / num_rows)
  158. return num_rows, num_cols
  159. def _get_subplot_in_grid(
  160. subplots: Any,
  161. index: int,
  162. num_rows: int,
  163. num_cols: int,
  164. ) -> tuple[Axes, bool, bool]:
  165. use_no_grid = num_rows == 1 and num_cols == 1
  166. use_single_dimension = (num_rows == 1 and num_cols > 1) or (
  167. num_cols == 1 and num_rows > 1
  168. )
  169. if use_no_grid:
  170. plot: Axes = subplots
  171. is_in_first_column = True
  172. is_in_last_row = True
  173. elif use_single_dimension:
  174. plot: Axes = subplots[index]
  175. is_in_first_column = index == 0 or num_cols == 1
  176. is_in_last_row = index == (num_rows - 1) or num_rows == 1
  177. else:
  178. row = math.floor(index / num_cols)
  179. column = index % num_cols
  180. plot: Axes = subplots[row][column]
  181. is_in_first_column = column == 0
  182. is_in_last_row = row == (num_rows - 1)
  183. assert type(plot) == Axes
  184. return plot, is_in_first_column, is_in_last_row
  185. def _plot_distribution(
  186. plot: Axes,
  187. data: list[list[float]],
  188. labels: list[str],
  189. plot_type: DistributionPlotType,
  190. ) -> None:
  191. if plot_type == DistributionPlotType.VIOLIN:
  192. _plot_violinplot(plot, data, labels)
  193. elif plot_type == DistributionPlotType.BOX:
  194. _plot_boxplot(plot, data, labels)
  195. else:
  196. raise RuntimeError(f"Unexpected plot type: {plot_type}")
  197. def _plot_violinplot(plot: Axes, data: list[list[float]], labels: list[str]) -> None:
  198. xpos = np.arange(1, len(data) + 1)
  199. plot.violinplot(data)
  200. plot.set_xticks(xpos, labels=labels)
  201. for i, data_col in enumerate(data):
  202. quartile1, medians, quartile3 = np.percentile(
  203. data[i],
  204. [25, 50, 75],
  205. )
  206. # plot median line
  207. plot.scatter(
  208. xpos[i],
  209. medians,
  210. marker=PLOT_MARKER_HLINE,
  211. color=PLOT_COLOR_DARK_BLUE,
  212. s=300,
  213. )
  214. # plot 25% - 75% area
  215. plot.vlines(
  216. xpos[i],
  217. quartile1,
  218. quartile3,
  219. color=PLOT_COLOR_DARK_BLUE,
  220. linestyle="-",
  221. lw=5,
  222. )
def _plot_boxplot(plot: Axes, data: list[list[float]], labels: list[str]) -> None:
    """Draw one box per entry in ``data`` on ``plot``, labelled with ``labels``."""
    # NOTE(review): newer matplotlib releases appear to rename the `labels`
    # keyword of boxplot to `tick_labels` — confirm against the pinned version.
    plot.boxplot(data, labels=labels)