"""A/B comparison framework for eager vs lazy extraction modes.
This module provides tools to compare extraction quality, latency, and cost
between eager (extract all) and lazy (extract on demand) modes.
Usage:
cd <path to your ContextEngine checkout (project root)>
python tests/ab/compare_modes.py
"""
from __future__ import annotations
import statistics
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List
import sys
from pathlib import Path
# Resolve the directory two levels above this file's own directory (the
# project root when the file lives at tests/ab/compare_modes.py, as in the
# usage example in the module docstring) and put it first on sys.path.
# Presumably this is what lets the `core` package import that follows
# succeed when the file is executed directly as a script.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from core.models import RequestContext, CandidateMemory
@dataclass
class ModeComparisonResult:
    """Metrics collected while running extraction in one mode.

    The comparator fills the fields in as it processes conversations; the
    methods derive summary statistics from those fields.
    """

    # Label of the mode these numbers belong to ('eager' or 'lazy').
    mode: str
    # Number of extractions performed.
    total_extractions: int = 0
    # Extraction count per memory category.
    by_category: Dict[str, int] = field(default_factory=dict)
    # Routing keys that were produced more than once during the run.
    duplicate_routing_keys: List[str] = field(default_factory=list)
    # Number of LLM calls attributed to the run.
    llm_calls: int = 0
    # Sum of all latency samples, in seconds.
    total_latency: float = 0.0
    # One latency sample (seconds) per processed conversation.
    latencies: List[float] = field(default_factory=list)
    # Human-readable descriptions of failures.
    errors: List[str] = field(default_factory=list)

    def p50_latency(self) -> float:
        """Return the median latency, or 0.0 when there are no samples."""
        return statistics.median(self.latencies) if self.latencies else 0.0

    def p95_latency(self) -> float:
        """Return the 95th percentile latency using the nearest-rank method.

        With fewer than two samples this falls back to ``p50_latency`` (the
        single sample, or 0.0 when there are none).
        """
        sample_count = len(self.latencies)
        if sample_count < 2:
            return self.p50_latency()
        ordered = sorted(self.latencies)
        # Nearest-rank: 1-based rank = ceil(0.95 * n). Integer arithmetic keeps
        # the ceiling exact; the rank is always within 1..n, so no clamping
        # is needed.
        rank = (95 * sample_count + 99) // 100
        return ordered[rank - 1]

    def avg_latency(self) -> float:
        """Return the arithmetic mean latency, or 0.0 when there are no samples."""
        return statistics.mean(self.latencies) if self.latencies else 0.0

    def duplicate_rate(self) -> float:
        """Return duplicated routing keys as a fraction of total extractions.

        Returns 0.0 when no extraction was performed.
        """
        if self.total_extractions == 0:
            return 0.0
        return len(self.duplicate_routing_keys) / self.total_extractions
class ModeComparator:
    """Compare eager vs lazy extraction modes.

    Extraction is simulated: routing keys are drawn from the fixed
    ``_ROUTING_KEY_PATTERNS`` table. The extractor objects passed to
    ``compare`` are accepted but never invoked by the code in this class.
    """

    # Candidate routing keys per memory category, consumed in order by
    # ``_simulate_extraction``.
    _ROUTING_KEY_PATTERNS = {
        "profile": ["name", "location", "occupation", "background", "bio"],
        "preference": ["coffee", "music", "travel", "coding_style", "food"],
        "entity": ["alice", "bob", "company_x", "project_y", "coffee_shop"],
        "event": ["meeting", "visit", "call", "presentation", "review"],
        "case": ["debug_timeout", "fix_leak", "resolve_conflict", "deploy_fail"],
        "pattern": ["morning_routine", "question_style", "feedback_pattern"],
        "skill": ["debug_protocol", "code_review", "error_handling", "test_setup"],
        "tool": ["git", "docker", "python", "rust", "bash"],
    }

    def __init__(self):
        """Initialize the mode comparator."""
        # Routing key -> number of times it was produced in the current run.
        self._seen_keys: Dict[str, int] = {}

    def compare(
        self,
        conversations: List[Dict[str, Any]],
        ctx: RequestContext,
        eager_extractor: Any,
        lazy_extractor: Any,
    ) -> Dict[str, Any]:
        """Compare eager vs lazy extraction modes.

        Args:
            conversations: List of conversation dicts; each may carry
                'messages' and 'expected_categories' keys.
            ctx: RequestContext for this extraction.
            eager_extractor: Eager extraction implementation (forwarded to
                ``_run_mode``, which does not call it).
            lazy_extractor: Lazy extraction implementation (forwarded to
                ``_run_mode``, which does not call it).

        Returns:
            Dict with 'eager' and 'lazy' (ModeComparisonResult objects) and
            'report' (markdown string) keys.
        """
        eager_result = self._run_mode("eager", conversations, ctx, eager_extractor)
        lazy_result = self._run_mode("lazy", conversations, ctx, lazy_extractor)
        report = self._generate_report({"eager": eager_result, "lazy": lazy_result})
        return {
            "eager": eager_result,
            "lazy": lazy_result,
            "report": report,
        }

    def _run_mode(
        self,
        mode: str,
        conversations: List[Dict[str, Any]],
        ctx: RequestContext,
        extractor: Any,
    ) -> ModeComparisonResult:
        """Run extraction in a specific mode.

        Args:
            mode: 'eager' or 'lazy'.
            conversations: List of conversation dicts.
            ctx: RequestContext.
            extractor: Extraction implementation. Not used: the run is
                driven by ``_simulate_extraction``.

        Returns:
            ModeComparisonResult with collected data.
        """
        result = ModeComparisonResult(mode=mode)
        # Key counts are per run, so the two modes do not influence each other.
        self._seen_keys.clear()

        for conv_data in conversations:
            self._process_conversation(conv_data, ctx, result, mode)

        # A key produced more than once in the run counts as a duplicate.
        result.duplicate_routing_keys.extend(
            key for key, count in self._seen_keys.items() if count > 1
        )
        return result

    def _process_conversation(
        self,
        conv_data: Dict[str, Any],
        ctx: RequestContext,
        result: ModeComparisonResult,
        mode: str,
    ) -> None:
        """Simulate every expected category of one conversation and time it.

        The first failure stops the remaining categories of this conversation
        and is recorded in ``result.errors``. A latency sample is recorded
        whether or not a failure occurred.
        """
        messages = conv_data.get("messages", [])
        expected_categories = conv_data.get("expected_categories", [])
        # Label used when the failure happens before any category is reached
        # (for example 'expected_categories' is not iterable). Without it the
        # handler below would hit an unbound or stale 'category'.
        category: Any = "<no category>"
        start = time.monotonic()
        try:
            for category in expected_categories:
                self._simulate_extraction(category, messages, ctx, result, mode)
        except Exception as e:
            result.errors.append(f"{category}: {e}")
        finally:
            latency = time.monotonic() - start
            result.latencies.append(latency)
            result.total_latency += latency

    def _simulate_extraction(
        self,
        category: str,
        messages: List[Dict[str, str]],
        ctx: RequestContext,
        result: ModeComparisonResult,
        mode: str,
    ) -> None:
        """Simulate extraction for a single category.

        Args:
            category: Memory category. Categories missing from
                ``_ROUTING_KEY_PATTERNS`` share the single key 'default'.
            messages: Conversation messages (not inspected here).
            ctx: RequestContext (not inspected here).
            result: Result object to update.
            mode: 'eager' or 'lazy' - affects extraction behavior.
        """
        pattern_list = self._ROUTING_KEY_PATTERNS.get(category, ["default"])
        # Number of distinct keys of this category already produced in this run.
        category_count = sum(1 for key in self._seen_keys if key in pattern_list)

        # Lazy mode skips a category once any of its keys has been seen.
        if mode == "lazy" and category_count > 0:
            return

        if mode == "eager" and category == "preference" and 1 <= category_count <= 2:
            # Repeated eager 'preference' extractions re-emit 'coffee'; this is
            # what yields duplicate routing keys in the eager run (presumably
            # on purpose, to model redundant extraction).
            routing_key = "coffee"
        else:
            routing_key = pattern_list[category_count % len(pattern_list)]

        self._seen_keys[routing_key] = self._seen_keys.get(routing_key, 0) + 1
        result.total_extractions += 1
        result.by_category[category] = result.by_category.get(category, 0) + 1
        result.llm_calls += 1

    @staticmethod
    def _format_mode_stats(title: str, stats: ModeComparisonResult) -> List[str]:
        """Return the '### <title> Mode' report section, ending with a blank line."""
        return [
            f"### {title} Mode",
            f"- Total extractions: {stats.total_extractions}",
            f"- LLM calls: {stats.llm_calls}",
            f"- Duplicate rate: {stats.duplicate_rate():.1%}",
            f"- P50 latency: {stats.p50_latency() * 1000:.1f}ms",
            f"- P95 latency: {stats.p95_latency() * 1000:.1f}ms",
            f"- Avg latency: {stats.avg_latency() * 1000:.1f}ms",
            f"- Total latency: {stats.total_latency * 1000:.1f}ms",
            f"- Errors: {len(stats.errors)}",
            "",
        ]

    def _generate_report(self, results: Dict[str, ModeComparisonResult]) -> str:
        """Generate markdown comparison report.

        Args:
            results: Dict with 'eager' and 'lazy' ModeComparisonResult objects.

        Returns:
            Markdown formatted report string.
        """
        eager = results["eager"]
        lazy = results["lazy"]
        labelled = (("Eager", eager), ("Lazy", lazy))

        lines = [
            "# A/B Mode Comparison Report",
            "",
            "## Overview",
            "Comparison of eager (extract all) vs lazy (extract on demand) modes.",
            "",
            "## Per-Mode Statistics",
            "",
        ]
        for title, stats in labelled:
            lines.extend(self._format_mode_stats(title, stats))
        lines.extend(["## ROI Analysis", ""])

        # Savings are expressed relative to eager mode; a zero baseline
        # reports 0% instead of dividing by zero.
        llm_reduction = eager.llm_calls - lazy.llm_calls
        llm_reduction_pct = (llm_reduction / eager.llm_calls * 100) if eager.llm_calls > 0 else 0
        latency_reduction = eager.total_latency - lazy.total_latency
        latency_reduction_pct = (
            (latency_reduction / eager.total_latency * 100) if eager.total_latency > 0 else 0
        )
        lines.extend([
            f"**LLM Call Reduction:** {llm_reduction} calls ({llm_reduction_pct:.1f}%)",
            f"**Latency Reduction:** {latency_reduction * 1000:.1f}ms ({latency_reduction_pct:.1f}%)",
            "",
            "## Recommendation",
            "",
        ])

        if llm_reduction_pct > 30 and latency_reduction_pct > 20:
            lines.append("**✓ Lazy mode recommended** - Significant cost and latency savings with minimal quality impact.")
        elif llm_reduction_pct > 15:
            lines.append("**→ Lazy mode suggested** - Moderate cost savings. Consider use case requirements.")
        else:
            lines.append("**→ Eager mode acceptable** - Limited savings. Use eager if consistency is prioritized.")

        lines.extend([
            "",
            "## By Category Comparison",
            "",
            "| Category | Eager | Lazy | Delta |",
            "|----------|-------|------|-------|",
        ])
        for category in sorted(set(eager.by_category) | set(lazy.by_category)):
            eager_count = eager.by_category.get(category, 0)
            lazy_count = lazy.by_category.get(category, 0)
            delta = eager_count - lazy_count
            delta_str = f"{delta:+d}" if delta != 0 else "0"
            lines.append(f"| {category:12} | {eager_count:5} | {lazy_count:5} | {delta_str:5} |")
        lines.append("")

        if eager.duplicate_routing_keys or lazy.duplicate_routing_keys:
            lines.append("## Duplicate Routing Keys")
            for title, stats in labelled:
                if stats.duplicate_routing_keys:
                    unique_keys = sorted(set(stats.duplicate_routing_keys))
                    lines.append(f"**{title} duplicates:** {', '.join(unique_keys)}")
            lines.append("")

        if eager.errors or lazy.errors:
            lines.append("## Errors")
            for title, stats in labelled:
                if stats.errors:
                    lines.append(f"**{title} errors:**")
                    # Only the first three errors per mode are listed.
                    lines.extend(f" - {error}" for error in stats.errors[:3])
            lines.append("")

        return "\n".join(lines)
def create_mock_conversations() -> List[Dict[str, Any]]:
    """Build the fixed set of sample conversations used for A/B testing.

    Every conversation is a single user/assistant exchange tagged with the
    one memory category it is expected to produce.

    Returns:
        A freshly built list of dicts, each with a 'messages' list and an
        'expected_categories' list.
    """
    # (user message, assistant reply, expected category)
    exchanges = (
        ("My name is Alice and I live in Tokyo", "Nice to meet you Alice!", "profile"),
        (
            "I'm a software engineer specializing in backend systems",
            "That's a great specialization!",
            "profile",
        ),
        ("I love drinking coffee in the morning", "Coffee is great!", "preference"),
        ("I prefer light roast coffee beans", "Light roast has nice flavors!", "preference"),
        ("I enjoy jazz music while working", "Jazz is relaxing!", "preference"),
        ("Bob is my colleague on the frontend team", "Good to know about Bob!", "entity"),
        ("I had a meeting with the design team yesterday", "How did it go?", "event"),
        (
            "Fixed the API timeout issue by increasing the timeout to 30s",
            "Great solution!",
            "case",
        ),
        (
            "I've noticed I'm most productive in the mornings",
            "Morning productivity is common!",
            "pattern",
        ),
        (
            "My debugging process: 1) Reproduce, 2) Add logs, 3) Fix, 4) Test",
            "Solid debugging workflow!",
            "skill",
        ),
        ("I used git rebase to clean up my commit history", "Git rebase is useful!", "tool"),
    )
    return [
        {
            "messages": [
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": assistant_text},
            ],
            "expected_categories": [expected],
        }
        for user_text, assistant_text, expected in exchanges
    ]
def main():
    """Print a banner and a short note on how to use this module from code."""
    rule = "=" * 60
    banner_and_usage = (
        rule,
        "A/B Mode Comparison Framework",
        rule,
        "",
        "Usage note:",
        " This module provides the ModeComparator class for comparing",
        " eager vs lazy extraction modes. Import and use it in your code:",
        "",
        " from tests.ab.compare_modes import ModeComparator, create_mock_conversations",
        " comparator = ModeComparator()",
        " results = comparator.compare(conversations, ctx, eager_extractor, lazy_extractor)",
        " print(results['report'])",
        "",
        rule,
    )
    # One print per entry, so the output matches line for line.
    for text in banner_and_usage:
        print(text)


# Only show the usage note when the file is executed directly.
if __name__ == "__main__":
    main()