Compute weighted scor
... [OUTPUT TRUNCATED - 6747 chars omitted out of 56747 total] ...
predictions=predictions,
windows=windows,
config=cfg,
method="multi_agent_debate",
)
for ww, pp in zip(windows, predictions):
if ww.is_directional and pp.predicted_direction != "UNCERTAIN":
pp.is_correct = (pp.predicted_direction == ww.ground_truth)
else:
pp.is_correct = None
acc = result.directional_accuracy
br = result.brier_score
dpairs = result.directional_windows
cr = CalibrationResult(
weights=w_combo,
accuracy=acc,
brier=br,
num_directional=len(dpairs),
)
all_results.append(cr)
if acc > best_accuracy and len(dpairs) >= 30:
best_accuracy = acc
best_result = cr
if verbose and eval_count % 500 == 0:
print(f" Evaluated {eval_count}/{len(combinations)}... current best: acc={best_accuracy:.3f}")
if best_result is None:
best_result = max(all_results, key=lambda r: r.accuracy)
if verbose:
print(f"\nGrid search complete")
print(f" Optimal weights: L={best_result.weights[0]:.2f} E={best_result.weights[1]:.2f} "
f"C={best_result.weights[2]:.2f} P={best_result.weights[3]:.2f}")
print(f" Optimal accuracy: {best_result.accuracy:.1%}")
return best_result, all_results
(continued — Statistical tests)
# ═══════════════════════════════════════════════════════════
# Statistical Tests
# ═══════════════════════════════════════════════════════════
def mcnemar_test(
    debate_result: BacktestRunResult,
    baseline_result: BacktestRunResult,
) -> Dict[str, Any]:
    """Run McNemar's paired test on debate vs baseline predictions.

    The two runs are paired positionally (window i of one run with window i
    of the other). A pair is skipped when either side's window is not
    directional or either side's prediction is "UNCERTAIN".

    Args:
        debate_result: Run result exposing ``windows`` and ``predictions``.
        baseline_result: Run result exposing ``windows`` and ``predictions``.

    Returns:
        A dict with the 2x2 contingency ``table``, the continuity-corrected
        ``chi2`` statistic (rounded to 4 places), the ``p_value`` (rounded to
        4 places), and ``significant`` (unrounded p-value < 0.05).
    """
    both_correct = debate_only = baseline_only = both_wrong = 0
    debate_pairs = zip(debate_result.windows, debate_result.predictions)
    baseline_pairs = zip(baseline_result.windows, baseline_result.predictions)
    for (w1, p1), (w2, p2) in zip(debate_pairs, baseline_pairs):
        if not w1.is_directional or p1.predicted_direction == "UNCERTAIN":
            continue
        if not w2.is_directional or p2.predicted_direction == "UNCERTAIN":
            continue
        if p1.is_correct and p2.is_correct:
            both_correct += 1
        elif p1.is_correct:
            debate_only += 1
        elif p2.is_correct:
            baseline_only += 1
        else:
            both_wrong += 1

    # Only the discordant pairs carry information about which method is better.
    discordant = debate_only + baseline_only
    if discordant == 0:
        chi2 = 0.0
        p_value = 1.0
    else:
        chi2 = (abs(debate_only - baseline_only) - 1) ** 2 / discordant
        # Upper tail of chi-squared with 1 degree of freedom:
        # P(X > chi2) = P(|Z| > sqrt(chi2)) = erfc(sqrt(chi2 / 2)).
        # This is already the two-sided probability, so it must NOT be
        # doubled again (doubling produced p-values above 1.0).
        p_value = math.erfc(math.sqrt(chi2 / 2))

    return {
        "table": {
            "both_correct": both_correct,
            "debate_only": debate_only,
            "baseline_only": baseline_only,
            "both_wrong": both_wrong,
        },
        "chi2": round(chi2, 4),
        "p_value": round(p_value, 4),
        "significant": p_value < 0.05,
    }
def binomial_test(result: BacktestRunResult) -> Dict[str, Any]:
    """One-sided exact binomial test of accuracy against 50% (coin flip).

    Args:
        result: Run result whose ``directional_windows`` yields
            ``(window, prediction)`` pairs; a pair counts as a success when
            ``prediction.is_correct`` is truthy.

    Returns:
        A dict with ``n`` (pairs), ``k_correct`` (successes),
        ``observed_accuracy`` (0 when there are no pairs), ``p_value``
        (P(X >= k) for X ~ Binomial(n, 0.5), rounded to 6 places) and
        ``significant`` (unrounded p-value < 0.05).
    """
    pairs = result.directional_windows
    n = len(pairs)
    k = sum(1 for _window, prediction in pairs if prediction.is_correct)

    # Sum the upper tail in exact integer arithmetic and divide once.
    # Multiplying each coefficient by the float 0.5 ** n raises OverflowError
    # as soon as a coefficient exceeds the float range (n around 1030+).
    tail_count = sum(math.comb(n, i) for i in range(k, n + 1))
    p_value = tail_count / (1 << n)

    return {
        "n": n,
        "k_correct": k,
        "observed_accuracy": k / n if n > 0 else 0,
        "p_value": round(p_value, 6),
        "significant": p_value < 0.05,
    }
def bootstrap_ci(
    result: BacktestRunResult,
    num_iterations: int = 10000,
) -> Dict[str, Any]:
    """Bootstrap the directional accuracy and report its median and 95% CI.

    Each iteration resamples the directional pairs with replacement (same
    sample size as the original) and records the mean correctness. The
    generator is seeded with 42, so repeated calls give identical output.

    Args:
        result: Run result whose ``directional_windows`` yields
            ``(window, prediction)`` pairs.
        num_iterations: Number of bootstrap resamples to draw.

    Returns:
        ``{"median": ..., "ci_95": [lower, upper]}`` with values rounded to
        4 places; all zeros when there are no directional pairs.
    """
    pairs = result.directional_windows
    sample_size = len(pairs)
    if sample_size == 0:
        return {"median": 0, "ci_95": [0, 0]}

    hits = np.array([prediction.is_correct for _window, prediction in pairs], dtype=float)
    rng = np.random.default_rng(42)

    # One draw of `sample_size` indices per iteration; the draws stay
    # sequential so the seeded random stream is consumed in the same order.
    resampled = np.array([
        np.mean(hits[rng.integers(0, sample_size, sample_size)])
        for _ in range(num_iterations)
    ])

    median = float(np.median(resampled))
    lower = float(np.percentile(resampled, 2.5))
    upper = float(np.percentile(resampled, 97.5))
    return {
        "median": round(median, 4),
        "ci_95": [round(lower, 4), round(upper, 4)],
    }
def _chi2_cdf(x: float, df: int) -> float:
    """Approximate the chi-squared CDF at ``x`` with ``df`` degrees of freedom.

    Returns 0.0 for ``x <= 0``. For ``df == 1`` the value is derived from the
    standard normal CDF (a chi-squared variable with one degree of freedom is
    a squared standard normal). For any other ``df`` the Wilson-Hilferty
    cube-root normal approximation is used.
    """
    if x <= 0:
        return 0.0
    if df == 1:
        return 2 * _norm_cdf(math.sqrt(x)) - 1

    # Wilson-Hilferty: (x/df)^(1/3) is roughly normal with
    # mean 1 - 2/(9 df) and variance 2/(9 df).
    variance = 2 / (9 * df)
    z_score = ((x / df) ** (1 / 3) - (1 - variance)) / math.sqrt(variance)
    return _norm_cdf(z_score)
def _norm_cdf(x: float) -> float:
    """Return the standard normal CDF at ``x``, computed from ``math.erf``."""
    scaled = x / math.sqrt(2)
    return 0.5 * (1 + math.erf(scaled))
(continued — Main function & output)
# ═══════════════════════════════════════════════════════════
# Main Function
# ═══════════════════════════════════════════════════════════
def print_report(
    debate_result: BacktestRunResult,
    baseline_result: BacktestRunResult,
    calibration: CalibrationResult,
    stats: Dict[str, Any],
):
    """Write the formatted backtest validation report to stdout.

    Sections appear in this order: data overview, directional accuracy,
    Brier score, confidence calibration, judge weight calibration,
    statistical significance tests, and a closing synthesis.

    Args:
        debate_result: Run result for the multi-agent debate method.
        baseline_result: Run result for the single-agent baseline.
        calibration: Calibration result; its ``weights`` (indexed 0-3) and
            ``accuracy`` are printed.
        stats: Mapping that must contain the keys "mcnemar",
            "binomial_debate", "bootstrap_debate" and "bootstrap_baseline".
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print("\n" + heavy_rule)
    print("Multi-Agent Debate System — Backtest Validation Report")
    print(heavy_rule)

    # --- Data overview -------------------------------------------------
    distribution = GroundTruth.distribution(debate_result.windows)
    print("\nData Overview")
    print(f" Total windows: {len(debate_result.windows)}")
    print(f" UP windows: {distribution['UP']}")
    print(f" DOWN windows: {distribution['DOWN']}")
    print(f" FLAT windows: {distribution['FLAT']} (excluded from accuracy)")

    # --- Directional accuracy ------------------------------------------
    print("\nDirectional Accuracy (threshold +/-1%)")
    for label, run in (
        ("Multi-Agent Debate", debate_result),
        ("Single-Agent Baseline", baseline_result),
    ):
        print(f" {label}: {run.directional_accuracy:.1%} ({len(run.directional_windows)} directional windows)")
    accuracy_gap = debate_result.directional_accuracy - baseline_result.directional_accuracy
    print(f" Delta (Debate - Baseline): {accuracy_gap:+.1%}")

    # --- Brier score ---------------------------------------------------
    print("\nBrier Score (lower is better)")
    print(f" Multi-Agent Debate: {debate_result.brier_score:.4f}")
    print(f" Single-Agent Baseline: {baseline_result.brier_score:.4f}")
    print(" Random Baseline: 0.2500")
    brier_gain = baseline_result.brier_score - debate_result.brier_score
    print(f" Delta (Baseline - Debate): {brier_gain:+.4f}")

    # --- Confidence calibration ----------------------------------------
    print("\nConfidence Calibration (Debate System)")
    bins = debate_result.confidence_bins()
    for bin_key in sorted(bins):
        bucket = bins[bin_key]
        if bucket["count"] > 0:
            # 20-character bar: one '#' per 5 percentage points of accuracy.
            bar = "#" * int(bucket["accuracy"] * 20)
            print(f" {bin_key}: {bucket['accuracy']:.1%} ({bucket['correct']}/{bucket['count']}) {bar}")

    # --- Judge weight calibration --------------------------------------
    print("\nJudge Weight Calibration")
    weights = calibration.weights
    print(" Default weights: L=0.30 E=0.30 C=0.20 P=0.20")
    print(f" Optimal weights: L={weights[0]:.2f} E={weights[1]:.2f} C={weights[2]:.2f} P={weights[3]:.2f}")
    print(f" Optimal accuracy: {calibration.accuracy:.1%}")

    # --- Significance tests --------------------------------------------
    print("\nStatistical Significance Tests")
    mcnemar = stats["mcnemar"]
    if mcnemar["significant"]:
        mcnemar_verdict = "[SIGNIFICANT]"
    else:
        mcnemar_verdict = "[NOT SIGNIFICANT - need larger sample]"
    print(f" McNemar's test: chi2={mcnemar['chi2']:.3f}, p={mcnemar['p_value']:.4f} {mcnemar_verdict}")
    table = mcnemar["table"]
    cells = ", ".join(
        f"{cell}={table[cell]}"
        for cell in ("both_correct", "debate_only", "baseline_only", "both_wrong")
    )
    print(f" Contingency table: {cells}")

    binomial = stats["binomial_debate"]
    binomial_verdict = "[HIGHLY SIGNIFICANT]" if binomial["significant"] else ""
    print(f" Binomial test (debate vs random): p={binomial['p_value']:.6f} {binomial_verdict}")

    for who in ("debate", "baseline"):
        interval = stats[f"bootstrap_{who}"]
        low, high = interval["ci_95"][0], interval["ci_95"][1]
        print(f" Bootstrap CI ({who}): median={interval['median']:.1%}, 95% CI=[{low:.1%}, {high:.1%}]")

    # --- Synthesis -----------------------------------------------------
    print("\n" + light_rule)
    print("Synthesis")
    print(light_rule)
    if debate_result.directional_accuracy > baseline_result.directional_accuracy:
        marker, verb = "[OK]", "outperforms"
    else:
        marker, verb = "[--]", "does not outperform"
    print(f" {marker} Debate system {verb} single-agent baseline on directional accuracy ({accuracy_gap:+.1%})")
    if debate_result.brier_score < baseline_result.brier_score:
        print(" [OK] Debate system outperforms baseline on probability calibration")
    if binomial["significant"]:
        print(" [OK] Debate accuracy is significantly above random guessing")
    if not mcnemar["significant"]:
        print(" [WARN] Debate vs baseline difference is not statistically significant — recommend increasing sample to 500+")
    print("\n [WARN] All results are based on synthetic data. Do not represent actual market performance.")
    print("\n" + heavy_rule)
def export_results(
    debate_result: BacktestRunResult,
    baseline_result: BacktestRunResult,
    filename: str = "backtest_results.json",
):
    """Write a JSON summary of both runs to ``filename`` and announce it.

    The file holds a ``meta`` section (timestamp, synthetic-data marker and
    warning, window count) followed by ``debate`` and ``baseline`` sections,
    each with accuracy, Brier score and the number of directional windows.

    Args:
        debate_result: Run result for the multi-agent debate method.
        baseline_result: Run result for the single-agent baseline.
        filename: Destination path; an existing file is overwritten.
    """

    def summarize(run):
        # Per-run metrics block shared by the "debate" and "baseline" sections.
        return {
            "accuracy": run.directional_accuracy,
            "brier": run.brier_score,
            "num_directional": len(run.directional_windows),
        }

    meta = {
        "generated_at": datetime.now().isoformat(),
        "data_type": "synthetic",
        "warning": "Synthetic data — not for real investment decisions",
        "num_windows": len(debate_result.windows),
    }
    payload = {
        "meta": meta,
        "debate": summarize(debate_result),
        "baseline": summarize(baseline_result),
    }

    # ensure_ascii=False keeps the em dash in the warning readable in the file.
    with open(filename, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)
    print(f"Results exported to: {filename}")
# ═══════════════════════════════════════════════════════════
# Entry Point
# ═══════════════════════════════════════════════════════════
if __name__ == "__main__":
    # End-to-end demo run on synthetic data: generate history and windows,
    # backtest both methods, calibrate judge weights, run the significance
    # tests, then print and export the report.
    print("=" * 70)
    print("Multi-Agent Debate x Market Analysis — Backtest Validation Engine")
    print("=" * 70)

    config = BacktestConfig(
        num_windows=100,
        window_spacing_days=5,
        forward_look_days=20,
    )

    print("\n[1/7] Generating synthetic historical data...")
    history = generate_synthetic_history(num_days=700, seed=config.random_seed)
    print(f" Generated {len(history)} historical snapshots")

    print("\n[2/7] Generating backtest windows...")
    windows = generate_windows(history, config)
    print(f" Generated {len(windows)} backtest windows")
    gt_dist = GroundTruth.distribution(windows)
    print(f" Direction distribution: UP={gt_dist['UP']}, DOWN={gt_dist['DOWN']}, FLAT={gt_dist['FLAT']}")

    print("\n[3/7] Running multi-agent debate backtest...")
    runner = BacktestRunner(config)
    debate_result = runner.run(windows, method="multi_agent_debate")
    print(f" Complete: accuracy={debate_result.directional_accuracy:.1%}, Brier={debate_result.brier_score:.4f}")

    # Same runner and same windows, so the two results are directly comparable.
    print("\n[4/7] Running single-agent baseline...")
    baseline_result = runner.run(windows, method="single_agent_baseline")
    print(f" Complete: accuracy={baseline_result.directional_accuracy:.1%}, Brier={baseline_result.brier_score:.4f}")

    print("\n[5/7] Judge weight grid search calibration...")
    calibrator = JudgeCalibrator(config)
    best_calibration, all_calibrations = calibrator.calibrate(windows, verbose=True)

    print("\n[6/7] Running statistical tests...")
    stats = {
        "mcnemar": mcnemar_test(debate_result, baseline_result),
        "binomial_debate": binomial_test(debate_result),
        "binomial_baseline": binomial_test(baseline_result),
        "bootstrap_debate": bootstrap_ci(debate_result),
        "bootstrap_baseline": bootstrap_ci(baseline_result),
    }

    print("\n[7/7] Generating report...")
    print_report(debate_result, baseline_result, best_calibration, stats)
    export_results(debate_result, baseline_result)

    print("\nBacktest validation engine complete.")
    print("Disclaimer: All data in this article and code is synthetic/fictional. Not financial advice.")
Running the Backtest Engine
# Install dependencies
pip install numpy
# Run backtest (uses synthetic data)
python backtest_engine.py
To integrate the backtest engine with the real debate protocol engine from Article 2's debate_protocol.py, replace SimulatedDebateRunner.simulate_debate() with calls to the real LLM debate. The rest of the framework—window management, metric computation, statistical tests—remains unchanged.
Synthetic Backtest Results: Visual Interpretation of 100 Debates
Here is the typical output after running 100 backtests with synthetic data (consistent with our earlier theoretical discussion):
Overall Results
| Metric | Multi-Agent Debate | Single-Agent Baseline | Random |
|---|---|---|---|
| Directional Accuracy | 71.1% | 62.0% | 50.0% |
| Brier Score | 0.187 | 0.228 | 0.250 |
| Valid Predictions (directional windows) | 83 | 80 | — |
The multi-agent debate leads the single-agent baseline by approximately 9 percentage points on directional accuracy and by about 18% on Brier score. The debate system produced fewer "UNCERTAIN" judgments—meaning it is better than the single agent at making clear directional calls when a signal exists.
Quadrant Analysis
Breaking the 83 valid directional predictions (the 100 windows minus those excluded as FLAT or UNCERTAIN) into four quadrants reveals the system's strengths and weaknesses:
| Quadrant | Definition | Count | Interpretation |
|---|---|---|---|
| True Positives (TP) | Predicted UP, actual UP | 34 | System correctly identified upside opportunities |
| True Negatives (TN) | Predicted DOWN, actual DOWN | 25 | System correctly identified downside risks |
| False Positives (FP) | Predicted UP, actual DOWN | 16 | Bullish signal failed in a declining market |
| False Negatives (FN) | Predicted DOWN, actual UP | 8 | Bearish signal failed in a rising market (fewer errors—system leans bullish) |
A noteworthy finding: the system has fewer false negatives (FN=8) than false positives (FP=16). This means the debate system tends toward bullish predictions—when the market rises, it missed 8 opportunities; when the market falls, it incorrectly predicted up 16 times. This bullish bias may stem from the positive drift in the synthetic data (daily +0.03%), but it is also a hypothesis worth testing on real data.
Common Pitfalls
Pitfall 1: Look-Ahead Bias
Problem: When generating "historical" knowledge base snapshots, you accidentally include data beyond the snapshot date. For example, if the backtest window is anchored at 2024-03-15, but the knowledge base's "200-day returns" are calculated through 2024-06-01—the debate sees the future.
Solution: Our generate_synthetic_history() function strictly computes all indicators using data only up to the anchor date. In a real system, you must ensure the data pipeline's temporal cutoff logic is correct—each backtest's build_knowledge_base() must accept an as_of_date parameter.
Pitfall 2: Survivorship Bias
Problem: If historical data only includes indices that "survived"—ignoring those that delisted or performed extremely poorly—backtest results will systematically overstate accuracy.
Solution: This article uses entirely synthetic data and is therefore unaffected. In a real data system, you must ensure the knowledge base includes historical records of delisted indices and correctly incorporates them in backtests.
Pitfall 3: Grid Search Overfitting
Problem: When searching 4,000+ weight combinations for the "optimal" weights, you are very likely to find a combination that performed best across 100 windows purely by luck—not because it is genuinely better.
Solution: Split backtest windows into a training set (70%) and validation set (30%). Run grid search on the training set, evaluate the optimal weights on the validation set. If validation accuracy diverges significantly from training accuracy—you are overfitting.
# Train/validation split: the first 70% of windows (in their existing order)
# are used for calibration, the rest are held out. The cut point is derived
# from the actual window count instead of assuming exactly 100 windows.
split_at = len(windows) * 7 // 10
train_windows = windows[:split_at]
val_windows = windows[split_at:]

# Grid search on the training set only
best_cal, _ = calibrator.calibrate(train_windows)

# Evaluate the selected weights on the held-out validation set
# NOTE(review): cfg_val takes the defaults for every setting other than
# judge_weights — confirm they match the config used during calibration.
cfg_val = BacktestConfig(judge_weights=best_cal.weights)
val_runner = BacktestRunner(cfg_val)
val_result = val_runner.run(val_windows, method="multi_agent_debate")

# Print both numbers: a large gap between them is the overfitting signal.
print(f"Training accuracy: {best_cal.accuracy:.1%}")
print(f"Validation accuracy: {val_result.directional_accuracy:.1%}")
Pitfall 4: Treating Synthetic Backtest Results as Real Performance
Problem: Every accuracy figure in this article, including the headline 71.1%, comes from synthetic data—these figures illustrate the methodology, not actual market prediction capability. On real market data, accuracy may differ significantly.
Solution: Treat this framework as a validation tool—not a "prove the system works" machine, but a "rigorously test whether the system works" machine. Run the same backtest on real data with the same metrics. If real accuracy approaches 50%—face that result honestly. That is the value of backtesting.
Key Takeaways
- Backtesting is not a one-time validation—it is an ongoing engineering practice. Every time you modify the debate protocol, adjust prompts, or switch models, re-run the backtest. Without backtesting, your judgment of system quality is merely guesswork.
- The multi-agent debate outperforms the single-agent baseline on synthetic data. +9 percentage points on directional accuracy, 18% improvement on Brier score. This improvement comes from the adversarial process—not from more information (each agent in the debate only sees partial data).
- Evidence quality is the most important judge scoring dimension. The grid search bumped Evidence from 30% to 40% and dropped Clarity from 20% to 5%—indicating that "does the argument contain specific, quantifiable data" is far more important than "is the argument clearly expressed."
- 100 backtests are insufficient for statistically significant conclusions. McNemar's test yields p = 0.137—not significant at the 5% level. 500+ backtests are needed to distinguish a 4-percentage-point accuracy difference with high confidence.
- Synthetic backtesting validates methodology—not market prediction capability. 71.1% accuracy demonstrates the value of adversarial debate on synthetic data. On real market data, the same framework will produce different numbers—potentially higher, potentially lower. The key: you now have the tools to measure it.
Article 4 Teaser: Production Deployment
You now have a backtest-validated debate system. You know it outperforms a single-agent baseline on synthetic data—though not yet by a statistically significant margin. You know the calibrated judge weights. You know which confidence ranges are reliable and which are not.
But validation is only the first step. Next: put it into production.
In Article 4, we will tackle every engineering problem involved in real deployment:
- Scheduled execution: Every morning before market open, automatically fetch the prior day's data, build the knowledge base, and run the debate—no manual triggers needed.
- Persistent storage: Write each debate's complete transcript, judge scores, and prediction results to a database. Build a queryable history of analyses.
- Monitoring dashboard: Track accuracy trends in real-time, confidence distribution shifts, judge score drift—get alerted when system performance begins to degrade.
- Cost optimization: 9 LLM calls per run, every day, add up to a substantial API bill. Explore caching strategies, model tiering (mini for daily, full for critical days), and batching optimizations.
- Error recovery: When an agent's API call fails, how do you handle it? Is a partial debate still valid? What is the fallback strategy?
But before then—run this article's code. Generate 100 backtests with synthetic data first. Read the accuracy report. Ask yourself: if I change the judge's scoring dimensions, how does accuracy change? If I change the debate temperature settings? If I add more agents?
The backtest framework is not just for "validation"—it is your experimental platform for optimizing the system.
📖 Previous: Multi-Agent Debate × Market Analysis — The Debate Protocol (8-agent debate engine)
📖 Architecture: Multi-Agent Debate × Market Analysis — Architecture & Data Pipeline
📖 Debate Theory: Multi-Agent Debate L3: Scoring & Consensus Theory
📖 Next: Article 4 — Production Deployment (coming soon)
⚠️ Disclaimer: This article is a technical workflow demonstration, not financial advice. All market data, index names (ExampleIndex), prices, returns, and accuracy figures in this article are synthetic/fictional. Backtest results cannot and should not be used as the basis for actual investment decisions. The multi-agent debate system is an engineering technology demonstration—its output should under no circumstances be considered market prediction or trading advice. Financial markets carry inherent risk. Consult a licensed financial professional before making any investment decisions.
Frequently Asked Questions
Q: Why use synthetic data for backtesting instead of real market data? Is this even meaningful?
A: Synthetic data backtesting doesn't aim to "prove the system can predict markets" — it validates the methodology. It proves the backtesting framework itself is correct, the statistical tests work, and the weight calibration flow is sound. Synthetic data provides a controlled "known answer" environment: you know exactly how the data was generated, so you can judge whether the backtest correctly identified the signal. On real market data, you face an unknown data-generating process — you can't distinguish "the system found real signal" from "the system overfit noise." Validate the framework on synthetic data first, then evaluate the system on real data — two-layer validation, not one-shot.
Q: What does the ±1% threshold for directional accuracy mean? Why not just use up/down?
A: Real market moves are rarely "pure up" or "pure down." If a window's market only wiggled by 0.3%, claiming "direction was correct" would be dishonest — a coin flip could claim the same. The ±1% threshold restricts "directionality" to meaningful moves: gain ≥ +1% = UP, loss ≤ -1% = DOWN, everything in between = FLAT (excluded from accuracy calculation). This makes accuracy measurement stricter — you only evaluate the system on windows with clear directional signals. In this article's synthetic data, roughly 17% of windows are marked FLAT.
Q: Won't grid search for judge weight calibration overfit? How do you prevent that?
A: When searching 4000+ weight combinations, you're very likely to find a combination that performed best across 100 windows purely by luck — not because it's genuinely better. Prevention: split backtest windows into a training set (70%) and validation set (30%). Run grid search on the training set, evaluate optimal weights on the validation set. If validation accuracy diverges significantly from training accuracy — you're overfitting. The article provides train/validation split code that works directly within backtest_engine.py.
Q: McNemar test, binomial test, bootstrap confidence intervals — what question does each answer?
A: Three tests answer different levels of questions. McNemar test (paired): Is the prediction difference between the debate system and single-agent baseline on the same windows statistically significant? — answers "is debate genuinely better than baseline?" Binomial test: Is the debate system's accuracy significantly above random guessing (50%)? — answers "is the system extracting non-random information?" Bootstrap confidence intervals: What is the 95% confidence interval for the debate system's accuracy? — answers "how stable is the accuracy estimate?" The three tests are complementary; none alone gives a complete answer.
Q: Are 100 backtests enough? Can backtest results be used directly for investment decisions?
A: 100 backtests are insufficient for statistically significant conclusions — the article's McNemar test yields p=0.137, not reaching the 5% significance level. To distinguish a 4-percentage-point accuracy difference, you need 500+ backtests. More importantly: all accuracy figures in this article (71.1%) come from synthetic data — they demonstrate methodology, not actual market prediction capability. The backtest framework is a tool to "rigorously test whether the system works" — not a machine to "prove the system works." On real data, if accuracy approaches 50% — face that result honestly. That's the value of backtesting.