| Appeal to Authority |
Using an authority's name in place of actual reasoning |
"Google uses microservices, so you
... [OUTPUT TRUNCATED - 4062 chars omitted out of 54062
... [OUTPUT TRUNCATED - 13 chars omitted out of 50013 total] ...
self.arguments = [
Argument(
id=item["id"],
claim=item["claim"],
reasoning=item["reasoning"],
evidence=item["evidence"]
)
for item in data
]
return self.arguments
except (json.JSONDecodeError, KeyError) as e:
# JSON parse failed — fallback to free text
print(f"Warning: {self.name} JSON parse failed ({e}), "
f"using free text fallback.")
self.arguments = [
Argument(
id=f"{prefix}-1",
claim="Opening statement (JSON parse failed, "
"see raw response)",
reasoning=reply,
evidence=""
)
]
return self.arguments
def cross_examine(
    self, opponent_args: list[Argument]
) -> list[CrossExamResponse]:
    """
    Round 2: Cross-Examination.
    Respond to each of the opponent's opening arguments point by point.

    Args:
        opponent_args: The opponent's opening arguments. Each item must
            provide ``to_text()`` (used to build the prompt) and ``id``
            (used by the fallback path).

    Returns:
        One CrossExamResponse per item in the model's JSON reply. If the
        reply cannot be turned into objects carrying the four required
        keys, a placeholder "challenge" response holding the raw reply is
        returned for every opponent argument instead.
    """
    opponent_args_text = "\n\n".join(
        arg.to_text() for arg in opponent_args
    )
    prompt = (
        f"Below are your opponent's opening arguments. "
        f"Respond to each one.\n\n"
        f"{opponent_args_text}\n\n"
        f"For each argument, output a JSON object:\n"
        f'{{"target_arg_id": "opponent argument ID", '
        f'"response_type": "refute|challenge|concede|partial", '
        f'"reasoning": "your reasoning", '
        f'"follow_up_question": "a sharp follow-up question"}}\n\n'
        f"response_type meanings:\n"
        f"- refute: you believe the argument has factual/logical errors\n"
        f"- challenge: you believe evidence is insufficient or "
        f"conditions aren't met\n"
        f"- concede: you accept the argument as valid\n"
        f"- partial: you accept the core but dispute degree or scope\n\n"
        f"Requirements:\n"
        f"1. Must respond to ALL opponent arguments — do not skip any\n"
        f"2. Do NOT introduce new arguments (this is cross-examination, "
        f"only respond to existing arguments)\n"
        f"3. Output pure JSON array only, no other text"
    )
    # NOTE(review): _call_llm is defined elsewhere in the class; presumably
    # it returns the model's reply text — confirm it never returns None.
    reply = self._call_llm(prompt, temperature=0.5, max_tokens=1500)
    try:
        # Strip Markdown code fences the model may wrap around the JSON.
        cleaned = re.sub(r'```(?:json)?\s*', '', reply).strip()
        data = json.loads(cleaned)
        # The prompt shows a single object as the template, so the model
        # may answer with one bare object rather than an array.
        if isinstance(data, dict):
            data = [data]
        return [
            CrossExamResponse(
                target_arg_id=item["target_arg_id"],
                response_type=item["response_type"],
                reasoning=item["reasoning"],
                follow_up_question=item["follow_up_question"]
            )
            for item in data
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers a non-string reply and JSON whose top level or
        # items are not objects (e.g. a list of strings); without it those
        # shapes would abort the whole debate instead of falling back.
        print(f"Warning: {self.name} cross-exam JSON parse failed "
              f"({e}), using free text.")
        return [
            CrossExamResponse(
                target_arg_id=arg.id,
                response_type="challenge",
                reasoning=f"JSON parse failed. Raw response:\n{reply}",
                follow_up_question="Please clarify the above."
            )
            for arg in opponent_args
        ]
def closing_statement(self) -> str:
    """
    Round 3: Closing Statement.

    Builds a prompt that recaps this agent's own opening arguments and asks
    for three sections (concessions made, arguments not effectively
    rebutted, final position), then returns whatever ``_call_llm`` yields.
    """
    recap = "\n".join(argument.to_text() for argument in self.arguments)
    sections = [
        f"Review of your opening arguments:\n{recap}\n\n",
        "Deliver your closing statement. Structure as follows:\n\n",
        "## Concessions Made\n",
        "List the opponent arguments or partial arguments you accepted ",
        "during cross-examination.\n\n",
        "## Arguments Not Effectively Rebutted\n",
        "Restate the core arguments from your opening that the opponent ",
        "failed to effectively challenge.\n\n",
        "## Final Position\n",
        "Based on the above, what is your current overall position on ",
        "the topic? If it has shifted (strengthened, weakened, ",
        "partially adjusted), explain why.\n\n",
        "Requirements: total length under 200 words. Concise and sharp.",
    ]
    prompt = "".join(sections)
    return self._call_llm(prompt, temperature=0.5, max_tokens=600)
# ──────────────────────────────────────────────
# 4. Structured Judge Agent (multi-dimension scoring + fallacy detection)
# ──────────────────────────────────────────────
class StructuredJudge:
    """
    Judge Agent — multi-dimensional scoring, fallacy detection,
    argument tracing.

    Key differences from L1's JudgeAgent:
    - Independently scores each argument (logic/evidence/response/honesty)
    - Built-in logical fallacy detection checklist
    - Generates argument trace table
    - Outputs JSON-structured conclusions instead of free text
    """

    # Request settings for the judge call. Override on a subclass or an
    # instance to swap the model without editing evaluate().
    MODEL = "gpt-4o"
    TEMPERATURE = 0.2  # Very low temp for consistency & reproducibility
    MAX_TOKENS = 3000

    # (fallacy name, question to ask) pairs rendered into the system prompt.
    FALLACY_CHECKLIST = [
        ("Straw Man",
         "Is it distorting the opponent's argument — attacking something "
         "they didn't say?"),
        ("Appeal to Authority",
         "Is it using \"big company X uses it\" in place of actual reasoning?"),
        ("Slippery Slope",
         "Is it assuming one action triggers an uncontrollable "
         "chain reaction?"),
        ("False Dilemma",
         "Is it reducing a complex issue to an either-or choice?"),
        ("Anecdotal Evidence",
         "Is it using isolated cases instead of systematic evidence?"),
        ("Circular Reasoning",
         "Is the conclusion already contained in the premise?"),
        ("Ad Hominem",
         "Is it attacking the opponent rather than their argument?"),
    ]

    def evaluate(
        self,
        topic: str,
        pro_args: list[Argument],
        con_args: list[Argument],
        pro_cross: list[CrossExamResponse],
        con_cross: list[CrossExamResponse],
        pro_closing: str,
        con_closing: str
    ) -> dict:
        """
        Comprehensive evaluation of the entire debate.
        Outputs structured conclusions.

        Args:
            topic: The proposition under debate.
            pro_args / con_args: Opening arguments; each must provide
                ``to_text()``.
            pro_cross / con_cross: Cross-examination responses; each must
                provide ``to_text()``.
            pro_closing / con_closing: Closing statement texts.

        Returns:
            The judge's reply parsed into a dict. When the reply is not a
            usable JSON object, a fallback dict with "error",
            "raw_response", and empty "scores" / "argument_trace_table" /
            "overall_assessment" entries is returned instead.
        """
        # Build the full evaluation request
        pro_args_text = "\n\n".join(a.to_text() for a in pro_args)
        con_args_text = "\n\n".join(a.to_text() for a in con_args)
        pro_cross_text = "\n\n".join(r.to_text() for r in pro_cross)
        con_cross_text = "\n\n".join(r.to_text() for r in con_cross)
        evaluation_prompt = (
            f"## Topic\n{topic}\n\n"
            f"## Pro Opening Arguments\n{pro_args_text}\n\n"
            f"## Con Opening Arguments\n{con_args_text}\n\n"
            f"## Pro Cross-Examination of Con\n{pro_cross_text}\n\n"
            f"## Con Cross-Examination of Pro\n{con_cross_text}\n\n"
            f"## Pro Closing Statement\n{pro_closing}\n\n"
            f"## Con Closing Statement\n{con_closing}\n\n"
        )
        fallacy_rules = "\n".join(
            f" - {name}: {desc}"
            for name, desc in self.FALLACY_CHECKLIST
        )
        system_prompt = (
            "You are a strictly impartial debate judge. Your task is to "
            "evaluate the entire debate according to the standardized "
            "process below.\n\n"
            "### Scoring Rules\n"
            "For each opening argument (Pro's PRO-1, PRO-2... and Con's "
            "CON-1, CON-2...) score on these four dimensions "
            "(1-10, must be integer):\n"
            "1. logic_score: Is the reasoning chain self-consistent? "
            "1=full of logical gaps, 10=flawless\n"
            "2. evidence_score: Is evidence specific and verifiable? "
            "1=all vague generalities, 10=each piece independently verifiable\n"
            "3. responsiveness_score: How well did they respond to "
            "cross-examination? 1=evaded all questions, "
            "10=point-by-point direct response\n"
            "4. honesty_score: Did they concede when warranted? Any "
            "exaggeration? 1=full of sophistry and distortion, "
            "10=honest and fair\n\n"
            "### Fallacy Detection\n"
            "For each argument, check for the following logical fallacies. "
            "If detected, list them in the fallacies array:\n"
            f"{fallacy_rules}\n\n"
            "### Output Format\n"
            "Output strictly as the following JSON format, "
            "no other text:\n"
            '{\n'
            ' "scores": [\n'
            ' {\n'
            ' "argument_id": "PRO-1",\n'
            ' "logic_score": 8,\n'
            ' "evidence_score": 7,\n'
            ' "responsiveness_score": 6,\n'
            ' "honesty_score": 8,\n'
            ' "fallacies": ["empty list if none detected"],\n'
            ' "notes": "Brief comment on this argument"\n'
            ' }\n'
            ' ],\n'
            ' "argument_trace_table": [\n'
            ' {\n'
            ' "argument_id": "PRO-1",\n'
            ' "claim": "Summary of core claim",\n'
            ' "standing": "UPHELD|PARTIALLY_UPHELD|REFUTED|'
            'UNCERTAIN",\n'
            ' "reason": "Brief explanation"\n'
            ' }\n'
            ' ],\n'
            ' "overall_assessment": {\n'
            ' "pro_total_score": 0.0,\n'
            ' "con_total_score": 0.0,\n'
            ' "key_insight": "The single most important finding from '
            'this debate (1-2 sentences)",\n'
            ' "unresolved_questions": ["Unresolved points of contention"],\n'
            ' "recommendation": "Based on the debate results, what '
            'specific advice for the decision-maker?"\n'
            ' }\n'
            '}'
        )
        # `client` is a module-level object defined elsewhere in the file.
        response = client.chat.completions.create(
            model=self.MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": (
                    f"Please evaluate the following debate.\n\n"
                    f"{evaluation_prompt}"
                )}
            ],
            temperature=self.TEMPERATURE,
            max_tokens=self.MAX_TOKENS
        )
        reply = response.choices[0].message.content
        return self._parse_evaluation(reply)

    @staticmethod
    def _parse_evaluation(reply) -> dict:
        """
        Turn the judge's raw reply into a result dict.

        Accepts a JSON object on its own, wrapped in Markdown code fences,
        or embedded in surrounding prose. Anything else (a non-string
        reply, invalid JSON, or JSON whose top level is not an object)
        yields the fallback dict, so callers can always use ``.get``.
        """
        text = reply if isinstance(reply, str) else ""
        cleaned = re.sub(r'```(?:json)?\s*', '', text).strip()

        # Try the whole reply first, then the outermost {...} span in case
        # the model added commentary around the JSON.
        candidates = [cleaned]
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if 0 <= start < end and cleaned[start:end + 1] != cleaned:
            candidates.append(cleaned[start:end + 1])

        failure = "empty or non-text response"
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as e:
                failure = e
                continue
            if isinstance(parsed, dict):
                return parsed
            failure = f"expected a JSON object, got {type(parsed).__name__}"

        print(f"Warning: Judge JSON parse failed ({failure}), "
              f"returning raw text.")
        return {
            "error": "JSON parse failed",
            "raw_response": reply,
            "scores": [],
            "argument_trace_table": [],
            "overall_assessment": {
                "pro_total_score": 0,
                "con_total_score": 0,
                "key_insight": "Evaluation failed — see raw_response",
                "unresolved_questions": [],
                "recommendation": ""
            }
        }
# ──────────────────────────────────────────────
# 5. Debate Engine — orchestrates the 3-round protocol
# ──────────────────────────────────────────────
def _build_debater_prompt(topic: str, position: str) -> str:
    """Return the debater system prompt for `position` ("FOR" or "AGAINST")."""
    return (
        f"You are a logically rigorous debater. "
        f"Your position is [{position}] the following proposition:\n"
        f"\"{topic}\"\n\n"
        f"Core rules:\n"
        f"1. All arguments must be specific, verifiable — use data "
        f"and facts\n"
        f"2. Each argument must contain a clear causal reasoning chain\n"
        f"3. Honesty is the highest principle — when faced with "
        f"unrefutable challenges, concede rather than sophize\n"
        f"4. Strictly follow the format and constraints of each round"
    )


def run_structured_debate(topic: str) -> dict:
    """
    Run a complete 3-round structured debate.

    Creates a Pro and a Con agent, runs opening statements,
    cross-examination and closing statements, then asks StructuredJudge to
    evaluate the exchange. Progress is printed as each round completes.

    Args:
        topic: The proposition to debate.

    Returns:
        dict: {"topic": ..., "rounds": {"opening", "cross_examination",
        "closing"}, "evaluation": <judge result>} — all round records,
        scores, and final conclusions.
    """
    # ── Create Pro Agent ──
    pro_agent = StructuredDebateAgent(
        name="Pro",
        stance="For",
        system_prompt=_build_debater_prompt(topic, "FOR")
    )
    # ── Create Con Agent ──
    con_agent = StructuredDebateAgent(
        name="Con",
        stance="Against",
        system_prompt=_build_debater_prompt(topic, "AGAINST")
    )
    result = {"topic": topic, "rounds": {}}
    print(f"\n{'=' * 60}")
    print(f"Structured Debate: {topic}")
    print(f"{'=' * 60}")

    # ── R1: Opening Statements ──
    print(f"\n{'─' * 60}")
    print(f"Round 1: Opening Statements")
    print(f"{'─' * 60}")
    pro_args = pro_agent.opening_statement(topic)
    print(f"\nPro — {len(pro_args)} arguments")
    for arg in pro_args:
        print(f" {arg.id}: {arg.claim[:80]}...")
    con_args = con_agent.opening_statement(topic)
    print(f"\nCon — {len(con_args)} arguments")
    for arg in con_args:
        print(f" {arg.id}: {arg.claim[:80]}...")
    result["rounds"]["opening"] = {
        "pro_arguments": [
            {"id": a.id, "claim": a.claim,
             "reasoning": a.reasoning, "evidence": a.evidence}
            for a in pro_args
        ],
        "con_arguments": [
            {"id": a.id, "claim": a.claim,
             "reasoning": a.reasoning, "evidence": a.evidence}
            for a in con_args
        ]
    }

    # ── R2: Cross-Examination ──
    print(f"\n{'─' * 60}")
    print(f"Round 2: Cross-Examination")
    print(f"{'─' * 60}")
    pro_cross = pro_agent.cross_examine(con_args)
    print(f"\nPro cross-examining Con — {len(pro_cross)} responses")
    for r in pro_cross:
        print(f" [{r.response_type}] -> {r.target_arg_id}")
    con_cross = con_agent.cross_examine(pro_args)
    print(f"\nCon cross-examining Pro — {len(con_cross)} responses")
    for r in con_cross:
        print(f" [{r.response_type}] -> {r.target_arg_id}")
    result["rounds"]["cross_examination"] = {
        "pro_cross": [
            {"target": r.target_arg_id, "type": r.response_type,
             "reasoning": r.reasoning,
             "follow_up": r.follow_up_question}
            for r in pro_cross
        ],
        "con_cross": [
            {"target": r.target_arg_id, "type": r.response_type,
             "reasoning": r.reasoning,
             "follow_up": r.follow_up_question}
            for r in con_cross
        ]
    }

    # ── R3: Closing Statements ──
    print(f"\n{'─' * 60}")
    print(f"Round 3: Closing Statements")
    print(f"{'─' * 60}")
    pro_closing = pro_agent.closing_statement()
    print(f"\nPro closing:\n{pro_closing[:200]}...")
    con_closing = con_agent.closing_statement()
    print(f"\nCon closing:\n{con_closing[:200]}...")
    result["rounds"]["closing"] = {
        "pro_closing": pro_closing,
        "con_closing": con_closing
    }

    # ── Judge Evaluation ──
    print(f"\n{'=' * 60}")
    print(f"Judge Evaluation")
    print(f"{'=' * 60}")
    judge = StructuredJudge()
    evaluation = judge.evaluate(
        topic=topic,
        pro_args=pro_args,
        con_args=con_args,
        pro_cross=pro_cross,
        con_cross=con_cross,
        pro_closing=pro_closing,
        con_closing=con_closing
    )
    result["evaluation"] = evaluation

    # The evaluation is model-generated JSON, so every field is read
    # defensively: a malformed entry must not raise here and throw away
    # the debate record that has already been collected.
    summary = evaluation if isinstance(evaluation, dict) else {}

    # Print score summary
    oa = summary.get("overall_assessment")
    if isinstance(oa, dict):
        print(f"\nPro total score: {oa.get('pro_total_score', 'N/A')}")
        print(f"Con total score: {oa.get('con_total_score', 'N/A')}")
        print(f"\nKey insight: {oa.get('key_insight', 'N/A')}")

    # Print argument trace table
    if "argument_trace_table" in summary:
        print(f"\nArgument Trace Table:")
        for entry in summary["argument_trace_table"] or []:
            if not isinstance(entry, dict):
                continue
            claim = str(entry.get('claim') or '')
            print(f" {entry.get('argument_id', '?')}: "
                  f"{entry.get('standing', 'UNKNOWN')} — "
                  f"{claim[:60]}...")
    return result
# ──────────────────────────────────────────────
# 6. Helper: format validation
# ──────────────────────────────────────────────
def validate_opening_args(
    args: list[Argument], expected_prefix: str
) -> list[str]:
    """
    Check each opening argument against the minimum format requirements.

    For every argument, in order: the ID must start with `expected_prefix`,
    the claim must be at least 10 characters, the reasoning at least 20,
    and the evidence at least 5.

    Returns the collected warning messages; an empty list means every
    argument passed.
    """
    issues = []
    for candidate in args:
        label = candidate.id
        if not label.startswith(expected_prefix):
            issues.append(f"{label}: ID prefix should be {expected_prefix}")
        # (text to measure, minimum length, warning when it falls short)
        length_floors = (
            (candidate.claim, 10,
             f"{label}: claim too short (min 10 chars)"),
            (candidate.reasoning, 20,
             f"{label}: reasoning too short (min 20 chars)"),
            (candidate.evidence, 5,
             f"{label}: missing evidence"),
        )
        issues.extend(
            message
            for text, floor, message in length_floors
            if len(text) < floor
        )
    return issues
def print_briefing(result: dict):
    """
    Print a briefing for the human decision-maker.

    Reads result["topic"] and, when present,
    result["evaluation"]["overall_assessment"]; missing insight or
    recommendation fields are shown as "N/A", and the unresolved-questions
    section is omitted when the list is empty or absent.
    """
    assessment = result.get("evaluation", {}).get("overall_assessment", {})
    rule = "=" * 60

    def briefing_lines():
        # Yielded lazily so each value is looked up right before it is
        # printed, in the same order as the output.
        yield f"\n{rule}"
        yield "Decision Briefing"
        yield rule
        yield f"\nTopic: {result['topic']}"
        yield f"\nKey Insight:\n {assessment.get('key_insight', 'N/A')}"
        yield f"\nRecommendation:\n {assessment.get('recommendation', 'N/A')}"
        open_questions = assessment.get('unresolved_questions', [])
        if open_questions:
            yield "\nUnresolved Questions:"
            for question in open_questions:
                yield f" - {question}"

    for line in briefing_lines():
        print(line)
# ──────────────────────────────────────────────
# 7. Run the example
# ──────────────────────────────────────────────
if __name__ == "__main__":
    # Example run: debate one topic, show the briefing, persist the record.
    debate_topic = (
        "Should a small startup (under 10 people) "
        "adopt microservices architecture from day one?"
    )
    output_path = "/tmp/structured_debate_result.json"

    result = run_structured_debate(topic=debate_topic)

    # Print decision briefing
    print_briefing(result)

    # Save results
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(result, out_file, ensure_ascii=False, indent=2)
    print(f"\nFull debate record saved to {output_path}")
Code Structure Breakdown
Compared to L1's debate.py (~180 lines, 3 classes), the L2 code is heavier — but that weight comes from structure and auditability, not pointless complexity:
| Component | L1 Equivalent | New Capability in L2 |
|---|---|---|
| StructuredDebateAgent | DebateAgent | Round-aware: opening_statement() / cross_examine() / closing_statement() as separate methods; arguments structured as Argument objects; JSON output for machine readability |
| StructuredJudge | JudgeAgent | Multi-dimensional scoring (logic/evidence/responsiveness/honesty + weighted); built-in 7-type fallacy detection; argument trace table; JSON structured output |
| RoundType | (none) | Enumeration of 3 round types; engine schedules by round |
| Argument / CrossExamResponse / ScoringResult | (free text) | Structured dataclasses, strongly typed I/O for debate |
| validate_opening_args() | (none) | Format validation function for argument quality floor |
💡 Running tip: Replace your-api-key and api.example.com with your actual API key and endpoint. Each 3-round structured debate run triggers about 7–8 LLM calls (3 rounds × 2 sides = 6, plus 1 judge evaluation, plus any format-correction retries), so budget your API quota accordingly.
Extracting Actionable Information from Results
Code runs, you get a JSON result. But how do you read it? Here are three levels of reading.
Level 1: Total Scores
overall_assessment.pro_total_score and con_total_score give you a quantitative comparison of debate quality. But don't just look at who's higher — a gap under 1 point means the sides are evenly matched; a gap over 3 points indicates a significant difference.
Level 2: Argument Trace Table
This is the most practical part. argument_trace_table tells you the final status of each opening argument:
- UPHELD: This argument survived cross-examination. It's information you can rely on.
- PARTIALLY_UPHELD: The core direction is correct, but the argument is limited in conditions, degree, or scope — note these limitations before using it.
- REFUTED: This argument was exposed as fundamentally flawed during cross-examination. Don't use it as a decision basis.
- UNCERTAIN: Neither side reached a clear conclusion on this. More data or further analysis is needed.
Level 3: Unresolved Questions
unresolved_questions lists the issues left hanging by the debate. These are information gaps you must verify yourself before making a decision. AI debate can't do everything for you — but it can precisely locate what you still need to do.
⚠️ Don't blindly trust the scores: The judge is also an LLM and may have its own biases. Scores and trace tables are decision aids, not final verdicts. For truly critical decisions, you should read the debate transcript yourself and apply your own judgment. The AI debate system's role is to improve the organization and coverage of information — not to replace human judgment.
Protocol Limitations (An Honest Assessment)
No protocol is perfect. Here are the known limitations of this 3-round framework:
- Sensitive to model capability: When both agents use the same model, they share the same knowledge boundary and reasoning patterns. Two GPT-4o instances debating still can't see what GPT-4o doesn't know. The solution is to use different models for each agent (e.g., GPT-4o vs Claude), but that's not yet implemented in this article's code — saved for a future article.
- JSON parsing is fragile: LLM JSON output occasionally has errors (extra comma, missing quote). We've added graceful fallback to free text, but in production you may want more robust parsing (Schema-constrained generation or multiple retries).
- Cross-examination may be "surface-level": One side raises a challenge and the other responds — but the judge may not assess whether the response is truly effective. The judge can only evaluate the surface quality of the response (directness, logical consistency), not factual accuracy.
- No external verification: The entire debate happens inside the LLM's "mind." If both sides cite a non-existent study, the judge can't detect it. Future articles will introduce RAG and tool calling to address this.
Key Takeaways
- Structure = reliability: Free-form debate easily falls into topic drift, false consensus, and shallow depth. The 3-round protocol (Opening → Cross-Exam → Closing) solves these with structural constraints.
- Cross-examination is the core of debate: Round 2's cross-examination is the most critical phase — it forces both sides to drill deep into each other's reasoning chains, exposing logical flaws and evidence gaps.
- Judges need a ruler, not a feeling: A multi-dimensional scoring rubric (logic/evidence/responsiveness/honesty) is more reliable and reproducible than a vague "who won."
- The argument trace table is the decision-maker's map: It compresses a lengthy debate into "which arguments stood and which were refuted" — the crucial bridge from debate to decision.
- AI debate is a decision aid, not a decision-maker: The judge's scores and trace tables are input for the human decision-maker — not a replacement.
📎 Series note: This is article 2 of the Multi-Agent Debate series. The previous article, L1: Why Debate Beats a Single Answer, introduced cognitive biases and adversarial collaboration fundamentals. Reading the articles in order is recommended.
📖 Next: Debate Scoring & Consensus — score calibration, multi-judge systems, weighted voting, consensus metrics
Frequently Asked Questions
Q: Why is 3 rounds better than more or fewer rounds?
A: 1 round equals no debate — each side speaks once with no interaction. Beyond 5 rounds, Agents start repeating arguments (studies show novel arguments drop below 15% from round 4 onward). 3 rounds is the optimal balance: enough adversarial engagement and response space without infinite looping. Each round has a clear goal — present, probe, conclude.
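Q: How should judge scoring dimensions be designed?
A: One weighted design gives the four dimensions a hierarchy: Evidence Quality (40% weight, most important — does the argument have data backing) > Logical Coherence (30%, is the reasoning self-consistent) > Rebuttal Effectiveness (20%, does it precisely address the opponent) > Clarity (10%, is expression clear). Note that the code in this article uses a different rubric — logic, evidence, responsiveness, and honesty, each scored 1-10 with no weights in the prompt — so adapt the weights to the dimensions you actually score. Core principle: scoring dimensions must correlate with predictive accuracy, otherwise they're meaningless.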
Q: Why is cross-examination more effective than open discussion?
A: In open discussion, Agents tend to "talk past each other" — each restating their own arguments without genuinely responding. Cross-examination forces each Agent to confront the other's strongest points and respond point by point. This suppresses confirmation bias — the Agent must consider why the opponent might be right, not just why it is right.
Q: What is the Argument Trace Table for?
A: The Argument Trace Table records who proposed what argument in each round, how the opponent responded, and whose view was ultimately adopted. It makes the debate process auditable — you can trace the judge's reasoning and identify logical gaps or scoring biases. In production, the trace table also forms the basis of the analysis report users ultimately see.
Q: Does debate quality vary significantly with different LLM models?
A: Yes. Testing shows 5-8 percentage point accuracy differences between GPT-4 and Claude in structured debate. Stronger models aren't just "smarter" — they're better at understanding protocol rules, following structured output formats, and detecting subtle logical flaws in opponent arguments. For critical decision-making scenarios, use the best model within your capabilities.
|