123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294 |
- # Copyright Materialize, Inc. and contributors. All rights reserved.
- #
- # Use of this software is governed by the Business Source License
- # included in the LICENSE file at the root of this repository.
- #
- # As of the Change Date specified in that file, in accordance with
- # the Business Source License, use of this software will be governed
- # by the Apache License, Version 2.0.
- import math
- from enum import Enum
- from typing import Any
- import numpy as np
- from matplotlib.axes import Axes
- from matplotlib.figure import SubFigure
- from matplotlib.markers import MarkerStyle
- from materialize.scalability.df.df_details import DfDetails
- from materialize.scalability.df.df_totals import DfTotals
- from materialize.scalability.endpoint.endpoints import endpoint_name_to_description
# Marker used in the TPS scatter plot for every endpoint that is not the baseline.
PLOT_MARKER_POINT = MarkerStyle("o")
# Marker used in the TPS scatter plot for the baseline endpoint.
# NOTE(review): matplotlib documents "," as the "pixel" marker ("s" is "square");
# confirm that this renders as intended for the baseline series.
PLOT_MARKER_SQUARE = MarkerStyle(",")
# Horizontal-line marker; drawn at the median of each violin.
PLOT_MARKER_HLINE = MarkerStyle("_")
# Color of the median marker and the quartile bar drawn on top of each violin.
PLOT_COLOR_DARK_BLUE = "darkblue"
class DistributionPlotType(Enum):
    """Kind of chart used to visualize a distribution of durations."""

    VIOLIN = 1
    BOX = 2
# Plot type used by the duration plots when the caller does not pick one.
DEFAULT_DISTRIBUTION_PLOT_TYPE = DistributionPlotType.VIOLIN
def plot_tps_per_connections(
    workload_name: str,
    figure: SubFigure,
    df_totals_by_endpoint_name: dict[str, DfTotals],
    baseline_version_name: str | None,
    include_zero_in_y_axis: bool,
    include_workload_in_title: bool = False,
) -> None:
    """Draw a scatter plot of the TPS per number of connections.

    One series is plotted per endpoint into a single axes created on `figure`.

    Args:
        workload_name: Used as plot title if `include_workload_in_title` is set.
        figure: The (sub)figure that receives the plot.
        df_totals_by_endpoint_name: The totals to plot, keyed by endpoint
            version name.
        baseline_version_name: Endpoint version that is drawn with the baseline
            marker; `None` if there is no baseline.
        include_zero_in_y_axis: Whether to force the y-axis to start at zero.
        include_workload_in_title: Whether to show the workload name as title.
    """
    plot: Axes = figure.subplots(1, 1)
    legend = []

    for endpoint_version_name, df_totals in df_totals_by_endpoint_name.items():
        legend.append(endpoint_name_to_description(endpoint_version_name))

        plot.scatter(
            df_totals.get_concurrency_values(),
            df_totals.get_tps_values(),
            # the entries passed to plot.legend() below are what the legend shows
            label="tps",
            marker=_get_plot_marker(endpoint_version_name, baseline_version_name),
        )

    plot.set_ylabel("Transactions Per Second (tps)")
    plot.set_xlabel("Concurrent SQL Connections")

    if include_zero_in_y_axis:
        plot.set_ylim(ymin=0)

    if include_workload_in_title:
        plot.set_title(workload_name)

    plot.legend(legend)
def plot_duration_by_connections_for_workload(
    workload_name: str,
    figure: SubFigure,
    df_details_by_endpoint_name: dict[str, DfDetails],
    include_zero_in_y_axis: bool,
    include_workload_in_title: bool = False,
    plot_type: DistributionPlotType = DEFAULT_DISTRIBUTION_PLOT_TYPE,
) -> None:
    """Draw the distribution of the durations, one subplot per concurrency.

    Each subplot shows one box or violin per endpoint that has values for that
    concurrency. Subplots are arranged in a grid with at most three per row.
    Nothing is drawn if no endpoints are given.

    Args:
        workload_name: Prefixed to the titles if `include_workload_in_title` is set.
        figure: The (sub)figure that receives the subplots.
        df_details_by_endpoint_name: The details to plot, keyed by endpoint
            version name.
        include_zero_in_y_axis: Whether to force the y-axes to start at zero.
        include_workload_in_title: Whether to include the workload name in the
            subplot titles.
        plot_type: Whether to draw violin plots or box plots.
    """
    if not df_details_by_endpoint_name:
        return

    # The concurrencies (and thereby the subplots) are taken from the first
    # endpoint only.
    first_df_details = next(iter(df_details_by_endpoint_name.values()))
    concurrencies = first_df_details.get_unique_concurrency_values()

    # With more than two endpoints, only the part of the name before the first
    # space is used as label.
    use_short_names = len(df_details_by_endpoint_name) > 2

    num_rows, num_cols = _compute_plot_grid(len(concurrencies), 3)
    subplots = figure.subplots(num_rows, num_cols, sharey=False)

    for concurrency_index, concurrency in enumerate(concurrencies):
        plot, is_in_first_column, _ = _get_subplot_in_grid(
            subplots, concurrency_index, num_rows, num_cols
        )

        legend = []
        durations: list[list[float]] = []

        for endpoint_version_name, df_details in df_details_by_endpoint_name.items():
            df_details_of_concurrency = df_details.to_filtered_by_concurrency(
                concurrency
            )

            if not df_details_of_concurrency.has_values():
                continue

            durations.append(df_details_of_concurrency.get_wallclock_values())
            legend.append(
                _shorten_endpoint_version_name(endpoint_version_name)
                if use_short_names
                else endpoint_version_name
            )

        _plot_distribution(plot, data=durations, labels=legend, plot_type=plot_type)

        if is_in_first_column:
            plot.set_ylabel("Duration (seconds)")

        if include_zero_in_y_axis:
            plot.set_ylim(ymin=0)

        title = f"{concurrency} connections"
        if include_workload_in_title:
            title = f"{workload_name}, {title}"
        plot.set_title(title)
def plot_duration_by_endpoints_for_workload(
    workload_name: str,
    figure: SubFigure,
    df_details_by_endpoint_name: dict[str, DfDetails],
    include_zero_in_y_axis: bool,
    include_workload_in_title: bool = False,
    plot_type: DistributionPlotType = DEFAULT_DISTRIBUTION_PLOT_TYPE,
) -> None:
    """Draw the distribution of the durations, one subplot per endpoint.

    Each subplot shows one box or violin per concurrency of that endpoint that
    has values. The subplots are stacked with one subplot per row. Nothing is
    drawn if no endpoints are given.

    Args:
        workload_name: Prefixed to the titles if `include_workload_in_title` is set.
        figure: The (sub)figure that receives the subplots.
        df_details_by_endpoint_name: The details to plot, keyed by endpoint
            version name.
        include_zero_in_y_axis: Whether to force the y-axes to start at zero.
        include_workload_in_title: Whether to include the workload name in the
            subplot titles.
        plot_type: Whether to draw violin plots or box plots.
    """
    if not df_details_by_endpoint_name:
        return

    num_rows, num_cols = _compute_plot_grid(len(df_details_by_endpoint_name), 1)
    subplots = figure.subplots(num_rows, num_cols, sharey=False)

    for endpoint_index, (endpoint_version_name, df_details) in enumerate(
        df_details_by_endpoint_name.items()
    ):
        plot, is_in_first_column, is_in_last_row = _get_subplot_in_grid(
            subplots, endpoint_index, num_rows, num_cols
        )

        legend: list[str] = []
        durations: list[list[float]] = []

        for concurrency in df_details.get_unique_concurrency_values():
            df_details_of_concurrency = df_details.to_filtered_by_concurrency(
                concurrency
            )

            if not df_details_of_concurrency.has_values():
                continue

            durations.append(df_details_of_concurrency.get_wallclock_values())
            # _plot_distribution expects string labels
            legend.append(str(concurrency))

        _plot_distribution(plot, data=durations, labels=legend, plot_type=plot_type)

        # Only the bottom subplot carries the axis labels.
        if is_in_first_column and is_in_last_row:
            plot.set_ylabel("Duration (seconds)")

        if is_in_last_row:
            plot.set_xlabel("Concurrencies")

        if include_zero_in_y_axis:
            plot.set_ylim(ymin=0)

        title = endpoint_version_name
        if include_workload_in_title:
            title = f"{workload_name}, {title}"
        plot.set_title(title)
- def _shorten_endpoint_version_name(endpoint_version_name: str) -> str:
- if " " not in endpoint_version_name:
- return endpoint_version_name
- return endpoint_version_name.split(" ")[0]
def _get_plot_marker(
    endpoint_version_name: str, baseline_version_name: str | None
) -> MarkerStyle:
    """Return the marker for an endpoint: the baseline gets a distinct marker.

    Without a baseline (`None`), every endpoint gets the regular point marker.
    """
    is_baseline = (
        baseline_version_name is not None
        and endpoint_version_name == baseline_version_name
    )
    return PLOT_MARKER_SQUARE if is_baseline else PLOT_MARKER_POINT
- def _compute_plot_grid(num_subplots: int, max_subplots_per_row: int) -> tuple[int, int]:
- num_rows = math.ceil(num_subplots / max_subplots_per_row)
- num_cols = math.ceil(num_subplots / num_rows)
- return num_rows, num_cols
def _get_subplot_in_grid(
    subplots: Any,
    index: int,
    num_rows: int,
    num_cols: int,
) -> tuple[Axes, bool, bool]:
    """Pick the subplot with the given index out of a subplot grid.

    The grid is filled row by row. `subplots` is expected in the shape in which
    the callers obtain it from `figure.subplots(num_rows, num_cols)`: a single
    axes for a 1x1 grid, a one-dimensional sequence for a single row or a
    single column, and a two-dimensional structure otherwise.

    Args:
        subplots: The axes or the collection of axes.
        index: Zero-based position of the subplot.
        num_rows: Number of rows of the grid.
        num_cols: Number of columns of the grid.

    Returns:
        A tuple of the axes, whether it is in the first column, and whether it
        is in the last row.
    """
    if num_rows == 1 and num_cols == 1:
        row, column = 0, 0
        plot = subplots
    elif num_rows == 1:
        # single row: the index is the column
        row, column = 0, index
        plot = subplots[index]
    elif num_cols == 1:
        # single column: the index is the row
        row, column = index, 0
        plot = subplots[index]
    else:
        row, column = divmod(index, num_cols)
        plot = subplots[row][column]

    # isinstance instead of an exact type comparison to also accept Axes subclasses
    assert isinstance(plot, Axes)

    is_in_first_column = column == 0
    is_in_last_row = row == num_rows - 1
    return plot, is_in_first_column, is_in_last_row
def _plot_distribution(
    plot: Axes,
    data: list[list[float]],
    labels: list[str],
    plot_type: DistributionPlotType,
) -> None:
    """Draw the given data series as violin plot or box plot.

    Args:
        plot: The axes to draw on.
        data: One list of values per series.
        labels: One tick label per series, in the order of `data`.
        plot_type: The kind of distribution plot to draw.

    Raises:
        RuntimeError: If the plot type is not supported.
    """
    if plot_type == DistributionPlotType.VIOLIN:
        draw = _plot_violinplot
    elif plot_type == DistributionPlotType.BOX:
        draw = _plot_boxplot
    else:
        raise RuntimeError(f"Unexpected plot type: {plot_type}")

    if not data:
        # nothing to draw; do not hand an empty dataset to matplotlib
        return

    draw(plot, data, labels)
def _plot_violinplot(plot: Axes, data: list[list[float]], labels: list[str]) -> None:
    """Draw one violin per data series and mark its median and quartiles.

    Args:
        plot: The axes to draw on.
        data: One list of values per violin.
        labels: One tick label per violin, in the order of `data`.
    """
    # x positions 1..n, one per violin
    xpos = np.arange(1, len(data) + 1)

    plot.violinplot(data)
    plot.set_xticks(xpos, labels=labels)

    for x, values in zip(xpos, data):
        quartile1, median, quartile3 = np.percentile(values, [25, 50, 75])

        # plot median line
        plot.scatter(
            x,
            median,
            marker=PLOT_MARKER_HLINE,
            color=PLOT_COLOR_DARK_BLUE,
            s=300,
        )

        # plot 25% - 75% area
        plot.vlines(
            x,
            quartile1,
            quartile3,
            color=PLOT_COLOR_DARK_BLUE,
            linestyle="-",
            lw=5,
        )
def _plot_boxplot(plot: Axes, data: list[list[float]], labels: list[str]) -> None:
    """Draw one box per data series, labeled with the given tick labels.

    Args:
        plot: The axes to draw on.
        data: One list of values per box.
        labels: One tick label per box, in the order of `data`.
    """
    # NOTE(review): matplotlib 3.9 renamed `labels` to `tick_labels`; confirm the
    # pinned matplotlib version before switching the keyword.
    plot.boxplot(data, labels=labels)
|