from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Benchmark CSVs are read from a "results" directory located next to this script.
results_dir = Path(__file__).resolve().parent / "results"

# Directory listings come back in a filesystem-dependent order. Sorting keeps
# the row order of the combined data (and anything derived from first
# appearance, such as the order of unlisted backends) stable across machines.
csv_files = sorted(results_dir.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {results_dir}")

# Size labels accepted from filenames, listed in the order they are displayed.
size_order = ["Small", "Medium", "Large"]

# Maps the backend token found in a filename to the name shown in the plot.
BACKEND_DISPLAY_NAMES = {
    "simplestorage": "SimpleStorage",
    "yuanrong": "openYuanrong",
    "mooncakestore": "MooncakeStore",
    "ray_baseline": "Ray",
}
def format_size(size_gb: float) -> str:
    """Render a size given in GB using the largest unit whose value is >= 1.

    Values of at least 1 GB are shown in GB, values of at least 1 MB in MB,
    and everything smaller in KB. Units are binary (factor 1024) and the
    number is always printed with two decimals.
    """
    if size_gb >= 1.0:
        value, unit = size_gb, "GB"
    else:
        megabytes = size_gb * 1024
        if megabytes >= 1.0:
            value, unit = megabytes, "MB"
        else:
            value, unit = megabytes * 1024, "KB"
    return f"{value:.2f} {unit}"
# Load every result file named "<backend>_<size>.csv" and tag its rows with
# the backend display name and the normalized size label taken from the name.
dfs = []
for csv_file in csv_files:
    # Validate the filename first so unrelated CSVs in the directory are
    # skipped without being parsed (and cannot crash the script if malformed).
    raw_backend, separator, raw_size = csv_file.stem.rpartition("_")
    if not separator:
        print(f"Warning: skipping {csv_file.name}, unexpected filename format")
        continue

    size_label = raw_size.capitalize()
    if size_label not in size_order:
        print(f"Warning: skipping {csv_file.name}, unrecognized size label '{raw_size}'")
        continue

    frame = pd.read_csv(csv_file)
    # Unknown backends keep their raw filename token as the display name.
    frame["backend_parsed"] = BACKEND_DISPLAY_NAMES.get(raw_backend, raw_backend)
    frame["size_label"] = size_label
    dfs.append(frame)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with the same exception type but an actionable message instead.
if not dfs:
    raise ValueError(
        f"No usable result files in {results_dir}: expected names like "
        f"'<backend>_<size>.csv' with size one of {size_order}"
    )
df = pd.concat(dfs, ignore_index=True)
# Size labels that actually occur in the loaded data, kept in size_order order.
loaded_labels = set(df["size_label"].unique())
existing_sizes = [label for label in size_order if label in loaded_labels]

# One data volume per size label: whatever groupby(...).first() yields for
# the "total_data_size_gb" column of that label's rows.
first_volume_per_label = df.groupby("size_label")["total_data_size_gb"].first()
size_to_gb = first_volume_per_label.to_dict()
def make_xlabel(size_label: str) -> str:
    """Return a two-line tick label: the size label, then its formatted volume.

    The volume is looked up in ``size_to_gb``; a label missing from that
    mapping is formatted as a size of 0.
    """
    volume_text = format_size(size_to_gb.get(size_label, 0))
    return f"{size_label}\n{volume_text}"
# X-axis tick labels as an ordered categorical whose category order follows
# existing_sizes.
x_label_categories = [make_xlabel(size) for size in existing_sizes]
df["X_label"] = pd.Categorical(
    df["size_label"].apply(make_xlabel),
    categories=x_label_categories,
    ordered=True,
)

# Column name used for the plotted metric.
df["Bandwidth"] = df["total_gbit_per_sec"]

# Backend order: the preferred names that are present come first, followed by
# any other backend in the order it first appears in the data.
preferred_backend_order = ["Ray", "SimpleStorage", "openYuanrong", "MooncakeStore"]
actual_backends = df["backend_parsed"].unique().tolist()
preferred_present = [name for name in preferred_backend_order if name in actual_backends]
unlisted = [name for name in actual_backends if name not in preferred_backend_order]
backend_order = preferred_present + unlisted
df["Scenario"] = pd.Categorical(
    df["backend_parsed"],
    categories=backend_order,
    ordered=True,
)
# Draw a grouped bar chart: one group per data volume, one bar per backend.
sns.set_theme(style="white", palette="husl")
fig, ax = plt.subplots(figsize=(12, 7))
palette = sns.color_palette("Set2", n_colors=len(backend_order))
barplot = sns.barplot(
    data=df,
    x="X_label",
    y="Bandwidth",
    hue="Scenario",
    palette=palette,
    alpha=0.8,
    ax=ax,
)

# Replace the in-axes legend with one figure-level legend centered at the top.
# The handles must be collected before the axes legend is removed.
handles, labels = ax.get_legend_handles_labels()
ax.get_legend().remove()
legend_box_style = {"frameon": True, "fancybox": True, "shadow": True, "fontsize": 13}
fig.legend(
    handles,
    labels,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.0),
    ncol=len(handles),
    title="",
    **legend_box_style,
)

# Write each bar's value just above it; patches whose height is not strictly
# positive (including NaN) get no label.
for bar in ax.patches:
    bar_height = bar.get_height()
    if not bar_height > 0:
        continue
    bar_center_x = bar.get_x() + bar.get_width() / 2.0
    ax.annotate(
        f"{bar_height:.3f}",
        (bar_center_x, bar_height),
        ha="center",
        va="bottom",
        fontsize=11,
        rotation=0,
    )

# Titles and axis labels; the shared x caption is added as figure text below.
ax.set_title("Performance Comparison (Total Throughput)", fontsize=16, fontweight="bold")
ax.set_xlabel("")
ax.set_ylabel("Bandwidth (Gbps)", fontsize=16)

# Leave 15% headroom above the largest raw bandwidth value.
y_max = df["Bandwidth"].max() * 1.15
ax.set_ylim(0, y_max)
ax.grid(True, alpha=0.3)
for axis_name, tick_font_size in (("x", 14), ("y", 13)):
    ax.tick_params(axis=axis_name, labelsize=tick_font_size)

fig.text(0.5, 0.02, "Data Volume", ha="center", fontsize=20)
# Reserve a strip at the bottom for the caption and at the top for the legend.
plt.tight_layout(rect=[0, 0.04, 1, 0.95])

output_path = results_dir / "performance_comparison.pdf"
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print("Performance comparison plot generated and saved as 'performance_comparison.pdf'")