| Cross-examination |
bool:
"""Check if daily budget would be exceeded"""
today = datetime.now(timezone.utc).date()
if today != self._budget_reset_date:
self._daily_spend = 0.0
self._budget_reset_date = today
return (self._daily_spend + estimated_cost) <= self.daily_budget
def create_session(
    self,
    topic: str,
    mode: DebateMode = DebateMode.CONSENSUS,
    pro_model: str = None,
    con_model: str = None,
    judge_models: list = None,
    timeout_seconds: int = 300,
) -> DebateSession:
    """Build a new debate session object.

    Any model argument that is falsy (``None``, empty) falls back to
    ``self.default_model``; a falsy ``judge_models`` becomes a one-element
    list holding the default model. The session is only constructed here —
    this method does not touch the store.
    """
    # Short identifier: first 8 characters of a random UUID4 string.
    short_id = str(uuid.uuid4())[:8]

    chosen_pro = pro_model or self.default_model
    chosen_con = con_model or self.default_model
    chosen_judges = judge_models or [self.default_model]

    # Timezone-aware UTC timestamp in ISO-8601 form.
    created = datetime.now(timezone.utc).isoformat()

    return DebateSession(
        session_id=short_id,
        topic=topic,
        mode=mode,
        pro_model=chosen_pro,
        con_model=chosen_con,
        judge_models=chosen_judges,
        created_at=created,
        timeout_seconds=timeout_seconds,
    )
async def run_debate(
    self, session: DebateSession
) -> DebateSession:
    """
    Execute a full debate.

    Includes timeout protection, error recovery, and budget check.

    Flow:
      1. Reject the session (status FAILED, nothing executed) when the
         estimated cost would exceed the daily budget.
      2. Mark the session DEBATING and log DEBATE_STARTED.
      3. Run ``self._execute`` under ``session.timeout_seconds``. A timeout
         ends the run as TIMED_OUT. Any other exception is retried until
         ``session.retry_count`` reaches ``session.max_retries``, after
         which the session is FAILED.
      4. Finalize exactly once: stamp completion time and elapsed seconds,
         add the session's total cost to the daily spend, persist, and log
         DEBATE_COMPLETED or DEBATE_FAILED.

    Retries run in a loop rather than by recursion. Recursing from inside
    the ``except`` block made every nested call's ``finally`` run, so
    ``session.costs`` was charged to ``_daily_spend`` once per attempt and
    the final event was logged once per attempt.

    Args:
        session: The session to run; it is mutated in place.

    Returns:
        The same ``session`` object, updated with status, result/error,
        timing and retry information.
    """
    start_time = time.time()

    # Budget check
    est_cost = self._estimate_session_cost(session)
    if not self._check_budget(est_cost):
        session.status = SessionStatus.FAILED
        session.error = (
            f"Exceeded daily budget (${self.daily_budget}). "
            f"Est. cost ${est_cost:.4f} + spent ${self._daily_spend:.4f}"
        )
        await self.store.save_session(session)
        return session

    session.status = SessionStatus.DEBATING
    await self.store.save_session(session)
    await self.store.log_event(
        session.session_id, EventType.DEBATE_STARTED,
        data={"mode": session.mode.value, "topic": session.topic}
    )

    try:
        while True:
            try:
                # Each attempt gets its own full timeout window.
                result = await asyncio.wait_for(
                    self._execute(session),
                    timeout=session.timeout_seconds,
                )
            except asyncio.TimeoutError:
                # Timeouts are terminal; they are not retried.
                session.status = SessionStatus.TIMED_OUT
                session.error = f"Debate timed out ({session.timeout_seconds}s)"
            except Exception as e:
                if session.retry_count < session.max_retries:
                    session.retry_count += 1
                    session.error = str(e)
                    # A mode runner may have moved the session to JUDGING
                    # before failing; put it back before the next attempt.
                    session.status = SessionStatus.DEBATING
                    await self.store.save_session(session)
                    await self.store.log_event(
                        session.session_id, EventType.RETRY,
                        data={"retry": session.retry_count,
                              "error": str(e)}
                    )
                    continue
                session.status = SessionStatus.FAILED
                session.error = str(e)
            else:
                session.result = result
                session.status = SessionStatus.COMPLETED
            break
    finally:
        # Runs once per session, including when the task is cancelled.
        session.completed_at = datetime.now(timezone.utc).isoformat()
        session.elapsed_seconds = round(time.time() - start_time, 2)
        total_cost = self.tracker.total_cost(session.costs)
        self._daily_spend += total_cost
        await self.store.save_session(session)
        final_event = (
            EventType.DEBATE_COMPLETED
            if session.status == SessionStatus.COMPLETED
            else EventType.DEBATE_FAILED
        )
        await self.store.log_event(
            session.session_id, final_event,
            data={
                "status": session.status.value,
                "elapsed": session.elapsed_seconds,
                "cost": total_cost,
                "retries": session.retry_count,
            }
        )
    return session
async def _execute(self, session: DebateSession) -> Dict:
    """
    Core execution — pick the runner for the session's mode and await it.

    In production, this calls actual L1-L3 functions.
    A mode without a registered runner is handled by the consensus runner.
    """
    runners_by_mode = {
        DebateMode.SIMPLE: self._run_simple,
        DebateMode.STRUCTURED: self._run_structured,
        DebateMode.CONSENSUS: self._run_consensus,
    }
    try:
        selected = runners_by_mode[session.mode]
    except KeyError:
        # Unknown mode: use the consensus runner as the fallback.
        selected = self._run_consensus
    outcome = await selected(session)
    return outcome
async def _run_simple(self, session: DebateSession) -> Dict:
    """L1: Free-form debate (placeholder implementation).

    Logs a ROUND_START event for round 1 and returns a canned result dict.
    """
    # Production code:
    # result = run_debate(
    #     topic=session.topic, rounds=3,
    #     pro_model=session.pro_model,
    #     con_model=session.con_model
    # )
    await self.store.log_event(
        session.session_id,
        EventType.ROUND_START,
        round_number=1,
    )
    placeholder = {
        "mode": "simple",
        "topic": session.topic,
        "result": "L1 simple debate result placeholder",
        "rounds_completed": 3,
    }
    return placeholder
async def _run_structured(self, session: DebateSession) -> Dict:
    """L2: Structured debate + single judge (placeholder implementation).

    Moves the session to JUDGING, persists it, and returns a canned result
    dict with an empty trace table.
    """
    # Production code:
    # result = run_structured_debate(
    #     topic=session.topic,
    #     pro_model=session.pro_model,
    #     con_model=session.con_model,
    #     judge_model=session.judge_models[0]
    # )
    session.status = SessionStatus.JUDGING
    await self.store.save_session(session)

    placeholder = {
        "mode": "structured",
        "topic": session.topic,
        "result": "L2 structured debate result placeholder",
        "trace_table": [],
    }
    return placeholder
async def _run_consensus(self, session: DebateSession) -> Dict:
    """L3: Multi-judge consensus debate (placeholder implementation).

    Moves the session to JUDGING, persists it, and returns a canned result
    dict with fixed alpha/kappa values.
    """
    # Production code:
    # pro_args = [...]
    # con_args = [...]
    # panel = MultiJudgePanel([
    #     JudgeProfile(name="Technical Judge",
    #                  domain=ExpertiseDomain.TECHNICAL),
    #     JudgeProfile(name="Business Judge",
    #                  domain=ExpertiseDomain.BUSINESS),
    #     JudgeProfile(name="Risk Judge",
    #                  domain=ExpertiseDomain.RISK),
    #     JudgeProfile(name="General Judge",
    #                  domain=ExpertiseDomain.GENERAL),
    # ])
    # result: PanelResult = panel.evaluate(
    #     topic=session.topic,
    #     pro_args=pro_args, con_args=con_args,
    #     pro_cross_text=..., con_cross_text=...,
    #     pro_closing=..., con_closing=...
    # )
    # return {
    #     "mode": "consensus",
    #     "alpha": result.alpha,
    #     "kappa": result.kappa,
    #     "weighted_pro": result.weighted_result["pro"],
    #     "weighted_con": result.weighted_result["con"],
    #     "irreconcilable": result.divergence["irreconcilable"],
    #     "recommendation": result.divergence["recommendation"],
    # }
    session.status = SessionStatus.JUDGING
    await self.store.save_session(session)

    placeholder = {
        "mode": "consensus",
        "topic": session.topic,
        "result": "L3 consensus debate result placeholder",
        "alpha": 0.78,
        "kappa": 0.72,
        "irreconcilable": False,
    }
    return placeholder
def _estimate_session_cost(self, session: DebateSession) -> float:
    """Estimate the cost of one debate session from its mode and pro model.

    Uses a fixed token budget per mode (60,000 for unknown modes) and the
    (input, output) price pair looked up in ``CostTracker.PRICING`` for
    ``session.pro_model``, defaulting to ``(5.0, 15.0)``. Prices are applied
    per million tokens. The result is rounded to 4 decimal places.
    """
    token_budget_by_mode = {
        DebateMode.SIMPLE: 8_000,
        DebateMode.STRUCTURED: 25_000,
        DebateMode.CONSENSUS: 60_000,
    }
    tokens = token_budget_by_mode.get(session.mode, 60_000)

    # Only the pro model's pricing is consulted for the estimate.
    in_price, out_price = CostTracker.PRICING.get(
        session.pro_model, (5.0, 15.0)
    )

    # Rough estimate: 60% input, 40% output
    input_cost = (tokens * 0.6 / 1_000_000) * in_price
    output_cost = (tokens * 0.4 / 1_000_000) * out_price
    return round(input_cost + output_cost, 4)
# ── REST-ish API Methods ──
async def api_create_debate(
    self, topic: str, mode: str = "consensus"
) -> Dict:
    """Create a debate session, persist it, and start it in the background.

    ``mode`` is converted with ``DebateMode(mode)`` (expected to raise for
    an unrecognised value — confirm against the enum definition). The
    running task is tracked in ``self.active_debates`` under the session id
    and removed again when the task finishes.

    Returns:
        An acknowledgement dict with the session id, mode, topic, creation
        time and the URL path to poll.
    """
    session = self.create_session(topic=topic, mode=DebateMode(mode))
    await self.store.save_session(session)

    # Execute in background
    background_task = asyncio.create_task(self.run_debate(session))
    self.active_debates[session.session_id] = background_task

    def _untrack(_finished_task):
        # Drop the bookkeeping entry; tolerate it already being gone.
        self.active_debates.pop(session.session_id, None)

    background_task.add_done_callback(_untrack)

    response = {
        "session_id": session.session_id,
        "status": "accepted",
        "mode": session.mode.value,
        "topic": session.topic,
        "created_at": session.created_at,
        "poll_url": f"/debates/{session.session_id}",
    }
    return response
async def api_get_result(self, session_id: str) -> Dict:
    """Look up a debate by id and return its stored data.

    Returns ``{"error": "Session not found"}`` when the store yields a
    falsy value for ``session_id``.
    """
    stored = await self.store.get_session(session_id)
    return stored if stored else {"error": "Session not found"}
async def api_estimate_cost(
    self, topic: str, mode: str = "consensus"
) -> Dict:
    """Report the estimated cost of a debate without running it.

    A throwaway session is built only to feed the estimator; nothing is
    persisted or executed. The remaining budget is computed from the
    current ``_daily_spend`` as-is.
    """
    probe_session = self.create_session(
        topic=topic, mode=DebateMode(mode)
    )
    estimated = self._estimate_session_cost(probe_session)
    remaining = round(self.daily_budget - self._daily_spend, 4)
    return {
        "topic": topic,
        "mode": mode,
        "estimated_cost_usd": estimated,
        "daily_budget_remaining": remaining,
    }
async def api_get_metrics(self) -> Dict:
    """Get monitoring metrics.

    Aggregates up to 200 session rows returned by the store into status
    counts, completion rate, average duration and p95 duration, and adds
    the current number of active debates and budget figures.

    The p95 uses the nearest-rank definition: the value at 1-based rank
    ``ceil(0.95 * n)`` of the sorted durations. For fewer than 20 samples
    this is the maximum. The previous index ``int(n * 0.95)`` was one
    position too high (for n=20 it returned the maximum).

    Returns:
        A dict with the same keys whether or not any sessions exist, so
        callers can index fields such as ``completion_rate_pct`` without
        guarding the empty case. With no sessions all counts, rates and
        durations are zero.
    """
    sessions = await self.store.list_sessions(limit=200)
    total = len(sessions)

    # Single pass over the rows for the three reported statuses.
    status_counts = {"completed": 0, "failed": 0, "timed_out": 0}
    for row in sessions:
        status = row["status"]
        if status in status_counts:
            status_counts[status] += 1

    # Only positive, non-null durations take part in timing statistics.
    times = [
        row["elapsed_seconds"] for row in sessions
        if row["elapsed_seconds"] and row["elapsed_seconds"] > 0
    ]
    if times:
        avg_time = sum(times) / len(times)
        sorted_times = sorted(times)
        # ceil(0.95 * n) in integer arithmetic to avoid float rounding.
        rank = (95 * len(sorted_times) + 99) // 100
        p95_time = sorted_times[rank - 1]
    else:
        avg_time = 0
        p95_time = 0

    completion_rate = (
        round(status_counts["completed"] / total * 100, 1) if total else 0.0
    )

    return {
        "total_sessions": total,
        "completed": status_counts["completed"],
        "failed": status_counts["failed"],
        "timed_out": status_counts["timed_out"],
        "completion_rate_pct": completion_rate,
        "avg_duration_seconds": round(avg_time, 1),
        "p95_duration_seconds": round(p95_time, 1),
        "active_debates": len(self.active_debates),
        "daily_spend_usd": round(self._daily_spend, 4),
        "budget_remaining": round(
            self.daily_budget - self._daily_spend, 4
        ),
    }
async def api_health(self) -> Dict:
    """Return a lightweight health snapshot.

    Always reports ``"healthy"`` together with the number of tracked
    active debates and the daily spend rounded to 4 decimal places.
    """
    running = len(self.active_debates)
    spend = round(self._daily_spend, 4)
    snapshot = {
        "status": "healthy",
        "active_debates": running,
        "daily_spend": spend,
    }
    return snapshot
# ══════════════════════════════════════════════
# 6. Quick Start Demo
# ══════════════════════════════════════════════
async def quick_start_demo():
    """Walk through the orchestrator API end to end.

    Starts an orchestrator, prints a cost estimate, launches one consensus
    debate, polls its status up to 10 times (3 seconds apart) until it
    reaches a terminal state, then prints metrics and a health check.
    """
    topic = "Should a startup adopt microservices from day one?"
    terminal_states = ("completed", "failed", "timed_out")

    orch = DebateOrchestrator(
        db_path="debate_sessions.db",
        default_model="gpt-4o",
        daily_budget_usd=10.0,
    )
    await orch.start()
    print("✅ Orchestrator started\n")

    # ── 1. Estimate cost ──
    est = await orch.api_estimate_cost(topic, mode="consensus")
    print(f"💰 Cost estimate: ${est['estimated_cost_usd']}")
    print(f" Daily budget remaining: ${est['daily_budget_remaining']}\n")

    # ── 2. Create and launch debate ──
    debate = await orch.api_create_debate(topic=topic, mode="consensus")
    print(f"🚀 Debate launched: {debate['session_id']} ({debate['mode']})")

    # ── 3. Poll for results ──
    for attempt in range(1, 11):
        await asyncio.sleep(3)
        result = await orch.api_get_result(debate["session_id"])
        status = result.get("status", "unknown")
        print(f" [{attempt}] Status: {status}")
        if status not in terminal_states:
            continue
        print(f" Elapsed: {result.get('elapsed_seconds', 0)}s")
        failure = result.get("error")
        if failure:
            print(f" Error: {failure}")
        break

    # ── 4. View metrics ──
    metrics = await orch.api_get_metrics()
    print("\n📊 System Metrics:")
    print(f" Total debates: {metrics['total_sessions']}")
    print(f" Success rate: {metrics['completion_rate_pct']}%")
    print(f" Avg duration: {metrics['avg_duration_seconds']}s")
    print(f" Daily spend: ${metrics['daily_spend_usd']}")

    # ── 5. Health check ──
    health = await orch.api_health()
    print(f"Heartbeat: {health['status']}")
if __name__ == "__main__":
    # Running the module directly only prints usage guidance; the demo
    # itself is not started here.
    rule = "=" * 60
    usage_lines = (
        rule,
        "Multi-Agent Debate System — Production Orchestrator",
        rule,
        "",
        "To run the quick-start demo (requires valid LLM API credentials):",
        " asyncio.run(quick_start_demo())",
        "",
        "To deploy as a web service, wrap DebateOrchestrator methods",
        "in FastAPI/Flask routes. Example:",
        "",
        " from fastapi import FastAPI",
        " app = FastAPI()",
        " orch = DebateOrchestrator()",
        "",
        " @app.post('/debates')",
        " async def create(topic: str, mode: str = 'consensus'):",
        " return await orch.api_create_debate(topic, mode)",
        "",
        " @app.get('/debates/{session_id}')",
        " async def get_result(session_id: str):",
        " return await orch.api_get_result(session_id)",
        rule,
    )
    for usage_line in usage_lines:
        print(usage_line)
Code Structure Breakdown
| Component | Function | Key Methods |
|---|---|---|
| DebateSession | Debate session data model — full lifecycle state | Fields: session_id, topic, mode, status, costs, result, error |
| SessionStore | SQLite persistence + audit log | init() / save_session() / log_event() |
| CostTracker | Multi-model pricing table + token counting + cost estimation | record_call() / total_cost() |
| with_retry() | Exponential backoff retry with error-type differentiation | Distinguishes transient, content filter, and auth errors |
| DebateOrchestrator | Core orchestrator — session lifecycle + L1-L3 integration + REST API | run_debate() / api_create_debate() / api_get_metrics() |
💡 Key difference from prototype to production: Notice the three methods — _run_simple(), _run_structured(), _run_consensus() — they currently return placeholder data. For production deployment, simply uncomment the imports and calls to wire in the full L1-L3 logic. The orchestrator layer (timeout, retry, logging, state management) is fully decoupled from the debate logic (L1-L3).
Deployment Patterns
Pattern 1: Single-Machine (Starter)
All agents and judges use the same model (e.g., GPT-4o), running on a single server. Simplest — suitable for internal team decision-support tools.
- Pros: Zero ops complexity, cost-controllable (one API key), predictable latency.
- Cons: Model blind spots are amplified — if that model has a bias in a domain, all agents and judges will exhibit the same bias.
- Fits: Internal tools, non-critical decisions, < 50 debates/day.
Pattern 2: Multi-Model (Recommended)
Different roles use different model providers:
| Role | Recommended Model | Reason |
|---|---|---|
| Pro Agent | Claude 3.5 Sonnet | Excels at building structured arguments with clear logic |
| Con Agent | GPT-4o | Excels at identifying flaws and raising counterexamples |
| Technical Judge | Claude 3.5 Sonnet | More precise on technical detail evaluation |
| Business Judge | GPT-4o | Stronger on business reasoning and data analysis |
| Risk Judge | Gemini 2.0 | Provides a different risk perspective, reducing homogeneous judgment |
Multi-model deployment's core value isn't "pick the best model for everything" — it's using model diversity to reduce systematic bias — the same principle as L3's differentiated multi-judge design.
Pattern 3: Human-in-the-Loop (Hybrid)
For critical decisions (budget > $100k, legal/compliance implications, affecting many users), the debate system shouldn't auto-output the final conclusion. It should:
- Complete L3-level debate and consensus evaluation.
- If Alpha ≥ 0.80: auto-generate a decision recommendation, marked as "high confidence."
- If Alpha < 0.67 or irreconcilable divergence triggered: pause the pipeline, push the most divergent arguments and judge commentary to a human decision-maker.
- The human decision-maker makes the final judgment based on the AI-provided structured divergence summary — but what they see isn't the raw debate transcript; it's a divergence heatmap already curated by the AI judge panel.
⚠️ Human-in-the-loop trap: Don't treat the human decision-maker as the "final judge" — this creates the illusion of "I decide in the end anyway, the AI analysis is just reference," leading to insufficient review. The right approach: the human reviews only what the AI couldn't reach consensus on, not what the AI already agreed on.
Key Insight: The Debate System Is an Information-Processing Pipeline
If you take away one core understanding from this article, let it be:
A production-grade debate system is not code — it's an information-processing pipeline where every stage must be observable, fault-tolerant, and cost-controlled.
Specifically:
- Observable: Every debate round, every LLM call, every judge score has a timestamp and audit record. You can trace exactly how any decision was made. When someone asks "why did the AI reach this conclusion," you don't need to say "the model said so" — you can show them the complete debate transcript and judge scorecards.
- Fault-tolerant: LLM calls will fail, time out, and return malformed content. Every stage of the pipeline has independent error handling — not "the entire debate failed," but "one argument in this round was degraded."
- Cost-controlled: Not every question needs L3-level consensus debate. Debate mode tiering + daily budget cap + model sharding ensure you get high-quality decisions without going bankrupt.
When you get these three right, the debate system transforms from "an interesting AI experiment" into "decision infrastructure an organization can depend on."
Series Retrospective
This is the fourth and final article in the Multi-Agent Debate series. A look back at the journey:
| Article | Title | Core Contribution | Output |
|---|---|---|---|
| L1 | Why Debate Beats a Single Answer | Revealed single-model cognitive biases (confirmation bias, anchoring, overconfidence), proved value of adversarial collaboration | debate.py — dual-agent free-form debate |
| L2 | Structured Debate Protocol | Designed 3-round debate protocol (Opening → Cross-Exam → Closing), introduced multi-dimensional scoring and argument trace table | debate_protocol.py — structured debate + judge agent |
| L3 | Debate Scoring & Consensus | Multi-judge expert panel, score calibration, weighted voting, Krippendorff Alpha + Fleiss Kappa consensus metrics | debate_consensus.py — multi-judge consensus system |
| L4 | Production Deployment (this article) | Wrapped L1-L3 into a deployable production service: async orchestration, session store, error recovery, cost control, monitoring | debate_orchestrator.py — production orchestrator |
From L1 to L4: An Arc of Thinking
Looking back, this series follows a natural progression:
- L1 asked "why": Why do I need debate? What's wrong with a single model? — Establishing the problem's necessity.
- L2 asked "how": What structure does good debate need? How do you score fairly? — Designing the solution.
- L3 asked "what if the judge is wrong too": How do you ensure the judge's judge is reliable? — Self-questioning the solution.
- L4 asked "how do we actually use it": How do you go from script to service? How do you control cost and risk? — Turning the solution into infrastructure.
This "why → how → self-question → land" arc applies not just to debate systems — it applies to any journey from AI prototype to AI product.
Open Questions
Even after four articles, we still have important unsolved problems — they're beyond this series' scope but worth pondering in your own practice:
- Automatic topic discovery: Currently, topics are human-provided. A truly autonomous debate system should automatically identify "contentious issues worth debating" from data streams. This requires combining anomaly detection and controversy mining.
- Cross-debate knowledge accumulation: Each debate is siloed. But "microservices operational cost" comes up repeatedly across debates — the system should accumulate knowledge across sessions, forming a "controversy knowledge graph."
- Debate strategy evolution: Currently, Pro and Con have fixed prompts. But if Pro always loses on the same argument (e.g., "operational cost"), the system should automatically adjust Pro's strategy on that point.
- Real-time debate intervention: In streaming mode, a human observer could inject new evidence or questions mid-debate. This requires designing an elegant "human intervention protocol."
Key Takeaways
- Debate systems are productionizable: With an async orchestrator, session store, error recovery, and cost control, L1-L3's debate capabilities can be packaged into a reliable production service for daily team use.
- Observability is the foundation of trust: When you can precisely trace "why the system reached this conclusion" — not "the model said" but "Judges A, B, and C evaluated which arguments how" — the debate system transforms from a black box into a trusted decision tool.
- Budget control is not optional: In production, LLM costs are real and ongoing. Three-layer cost control — topic tiering, daily budget caps, and model sharding — lets you improve decision quality without losing control of costs.
- Deployment pattern determines system quality: Single-model deployment is simple but introduces systematic bias; multi-model deployment improves robustness through diversity; human-in-the-loop preserves human final judgment for critical decisions.
- A debate system is an information-processing pipeline: Carve this into the first page of your project docs — it reminds your team that you're building not just another LLM app, but a complex information-processing system where every stage must be monitored, fault-tolerant, and cost-managed.
Frequently Asked Questions
Why does a debate system need production orchestration?
A research notebook proves the concept; a production orchestrator proves reliability. Without an orchestrator handling retries, sessions, timeouts, and logging, every debate failure becomes a manual investigation. Production orchestration turns "it usually works" into "when it breaks, we know exactly where and why."
When should a team use L1, L2, or L3 debate mode?
Use L1 for quick adversarial checks — lightweight, fast, good for sanity-testing ideas. Use L2 when you need a structured protocol with explicit scores — team decision-making, technical evaluations. Use L3 for high-stakes decisions where even the scoring itself needs validation — investment analysis, architecture decisions with long-term consequences. Most teams should start with L2; graduate to L3 when the cost of a wrong decision justifies the extra compute.
How should cost control be handled in a multi-agent debate service?
Three layers: (1) Topic tiering — classify each debate topic as low/medium/high stakes and assign model quality accordingly. (2) Daily budget caps — hard limits enforced per debate: once the estimated cost of a new debate plus the day's spend would exceed the budget, the orchestrator rejects that debate (and should raise an alert) rather than shutting the whole service down. (3) Model sharding — use cheaper models for rote tasks (summarization, extraction) and expensive models only for the highest-leverage reasoning steps. With these three layers, a team of 10 running 5 debates daily can stay under $50/month.
Why are audit logs important for debate systems?
Audit logs are the difference between "we debated and reached a conclusion" and "we can prove why." When a decision is questioned months later — and important decisions always are — logs let you trace every argument, every score, every judge's reasoning. They're also essential for debugging: without logs, a debate that produced a strange result is just a mystery. With logs, it's a teachable moment.
Next Steps
📎 Series note: This is the final article (4 of 4) in the Multi-Agent Debate series. Recommended reading order: L1: Adversarial Collaboration Intro → L2: Structured Debate Protocol → L3: Debate Scoring & Consensus → This article (L4).
🏁 Series complete. Return to AI Agent Exploration for more articles.
|