life-tooling/scripts/extract-companion-transcript.py
2026-03-20 09:32:40 -07:00

261 lines
9.1 KiB
Python

#!/usr/bin/env python3
"""Extract human-readable companion session transcripts from Claude Code JSONL logs.
Usage:
python3 scripts/extract-companion-transcript.py [--since YYYY-MM-DD] [--output FILE]
Finds companion/copilot sessions, extracts user + assistant text messages
(skipping tool calls, tool results, thinking blocks, system messages),
and produces a merged chronological transcript.
"""
import json
import os
import sys
import argparse
from datetime import datetime
from pathlib import Path
# Directory that is scanned for "*.jsonl" session logs.
JSONL_DIR = Path.home() / ".claude/projects/-var-home-lilith-Code--projects--life-life-manager"
# Substrings looked for in lowercased, user-typed messages; any hit marks the
# session as a companion session.
COMPANION_USER_MARKERS = [
    "/companion",
    "you are my companion",
    "companion copilot",
    "companion session",
    "boot companion",
]
# Substrings looked for in lowercased assistant text blocks; any hit marks the
# session as a companion session.
COMPANION_ASSISTANT_MARKERS = [
    "companion mode",
    "companion session bootstrap",
    "copilot session",
    "meds due",
    "footer rotation",
]
def is_companion_session(filepath: Path) -> tuple[bool, str | None]:
"""Check if a JSONL file is a companion session.
Only matches sessions where the USER explicitly invoked companion mode
(not just sessions where 'companion' appears in system context).
"""
first_timestamp = None
user_messages_checked = 0
try:
with open(filepath) as f:
for i, line in enumerate(f):
if i > 60:
break
entry = json.loads(line)
if not first_timestamp and entry.get("timestamp"):
first_timestamp = entry["timestamp"]
msg = entry.get("message", {})
content = msg.get("content", "")
if entry.get("type") == "user":
# Only check actual user-typed messages (strings), not tool results
if isinstance(content, str) and content.strip():
text = content.lower().strip()
# Skip system reminder injections
if "<system-reminder>" in text:
continue
# Skip plan implementation sessions — these aren't conversations
if text.startswith("implement the following plan"):
return False, first_timestamp
# Skip local command caveats
if "<local-command-caveat>" in text:
continue
user_messages_checked += 1
if any(marker in text for marker in COMPANION_USER_MARKERS):
return True, first_timestamp
# Check for /companion slash command invocation
if "<command-name>/companion</command-name>" in content or "<command-message>companion</command-message>" in content:
return True, first_timestamp
# Only check first 5 real user messages
if user_messages_checked >= 5:
break
elif entry.get("type") == "assistant":
# Check if assistant is clearly in companion mode
if isinstance(content, list):
text = " ".join(
block.get("text", "")
for block in content
if isinstance(block, dict) and block.get("type") == "text"
).lower()
if any(marker in text for marker in COMPANION_ASSISTANT_MARKERS):
return True, first_timestamp
except (json.JSONDecodeError, OSError):
pass
return False, first_timestamp
def extract_text_content(content) -> str:
    """Return the plain text of a message's content.

    A string is returned stripped. A list is treated as content blocks: the
    stripped ``"text"`` of every dict block whose ``"type"`` is ``"text"`` is
    kept (empty ones dropped) and the pieces are joined with newlines, so tool
    calls, tool results and thinking blocks contribute nothing. Any other
    content yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return ""
    stripped_parts = (
        part.get("text", "").strip()
        for part in content
        if isinstance(part, dict) and part.get("type") == "text"
    )
    return "\n".join(piece for piece in stripped_parts if piece)
def is_system_reminder(text: str) -> bool:
    """Tell whether text contains one of the known injected-reminder markers.

    The match is a case-sensitive substring test against a fixed list of
    markers; a hit means the text is treated as injected rather than as real
    user input.
    """
    injected_markers = (
        "<system-reminder>",
        "ACTIVE PROTOCOLS",
        "COLLECTIVE VOICE",
        "PARALLEL EXECUTION",
        "ANTI-HALLUCINATION",
        "SAFETY TRIGGERS",
        "COMPLETE CODE",
        "UNUSED VARIABLE",
        "INSTRUCTION ROUTER",
        "UserPromptSubmit hook",
        "The task tools haven't been used",
        "Tool loaded.",
    )
    for needle in injected_markers:
        if needle in text:
            return True
    return False
def extract_transcript(filepath: Path) -> list[dict]:
    """Extract user/assistant text messages from a JSONL session file.

    Each line is parsed as JSON; lines that are blank, malformed, or not JSON
    objects are skipped, as are entries whose ``"type"`` is not ``"user"`` or
    ``"assistant"`` or whose ``"message"`` is not an object. Text is taken
    from the message content via ``extract_text_content``. Entries without any
    text are dropped, and user entries flagged by ``is_system_reminder`` are
    dropped too.

    Args:
        filepath: Path to the ``.jsonl`` session log.

    Returns:
        A list, in file order, of dicts with keys ``"role"`` (``"user"`` or
        ``"assistant"``), ``"text"`` and ``"timestamp"`` (empty string when
        the entry has none).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    messages = []
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(filepath, encoding="utf-8", errors="replace") as f:
        for line in f:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                continue
            entry_type = entry.get("type")
            if entry_type not in ("user", "assistant"):
                continue
            msg = entry.get("message")
            if not isinstance(msg, dict):
                continue
            text = extract_text_content(msg.get("content", ""))
            # Only text blocks produce text, so entries made up solely of
            # tool calls, tool results or thinking blocks are dropped here.
            if not text:
                continue
            # Skip system reminders injected as user messages
            if entry_type == "user" and is_system_reminder(text):
                continue
            messages.append({
                "role": entry_type,
                "text": text,
                "timestamp": entry.get("timestamp") or "",
            })
    return messages
def format_timestamp(ts: str) -> str:
    """Render an ISO-8601 timestamp as ``YYYY-MM-DD HH:MM AM/PM``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. When the value
    cannot be parsed, its first 19 characters are returned instead, or ``"?"``
    when the value is empty or ``None``.
    """
    try:
        normalized = ts.replace("Z", "+00:00")
        parsed = datetime.fromisoformat(normalized)
        return parsed.strftime("%Y-%m-%d %I:%M %p")
    except (ValueError, AttributeError):
        if not ts:
            return "?"
        return ts[:19]
def main():
    """Command-line entry point: find companion sessions and emit a transcript.

    Scans ``JSONL_DIR`` for ``*.jsonl`` files whose modification date is on or
    after ``--since``, keeps those accepted by ``is_companion_session``,
    orders them by first timestamp (falling back to the file date), and
    renders each session's text messages as Markdown. The result is written
    to ``--output`` as UTF-8 (parent directories are created) or printed to
    stdout; progress messages go to stderr.

    Exits with status 0 when no companion session is found, and with an
    argparse usage error when ``--since`` is not a valid YYYY-MM-DD date.
    """
    parser = argparse.ArgumentParser(description="Extract companion session transcripts")
    parser.add_argument("--since", help="Only include sessions after this date (YYYY-MM-DD)", default="2026-03-01")
    parser.add_argument("--output", "-o", help="Output file path", default=None)
    args = parser.parse_args()

    # Parse the cutoff into a real date: comparing raw strings silently
    # misfilters on input such as "2026-3-1" and accepts arbitrary garbage.
    try:
        since_day = datetime.strptime(args.since, "%Y-%m-%d").date()
    except ValueError:
        parser.error(f"--since must be a date in YYYY-MM-DD format, got {args.since!r}")
    since_date = since_day.isoformat()

    sessions = []
    print(f"Scanning {JSONL_DIR} for companion sessions since {since_date}...", file=sys.stderr)
    for filepath in sorted(JSONL_DIR.glob("*.jsonl")):
        file_day = datetime.fromtimestamp(filepath.stat().st_mtime).date()
        if file_day < since_day:
            continue
        is_companion, first_ts = is_companion_session(filepath)
        if is_companion:
            # Fall back to the file's modification date as the sort key when
            # the log carried no timestamp.
            sessions.append((filepath, first_ts or file_day.isoformat()))
            print(f"  Found: {filepath.name} ({format_timestamp(first_ts or '')})", file=sys.stderr)

    if not sessions:
        print("No companion sessions found.", file=sys.stderr)
        sys.exit(0)

    sessions.sort(key=lambda x: x[1])
    print(f"\nFound {len(sessions)} companion session(s).\n", file=sys.stderr)

    output_lines = [
        "# Combined Companion Session Transcript",
        f"# Generated: {datetime.now().strftime('%Y-%m-%d %I:%M %p')}",
        f"# Sessions: {len(sessions)} (since {since_date})",
        "",
    ]
    for filepath, first_ts in sessions:
        session_id = filepath.stem
        output_lines.append("---")
        output_lines.append(f"## Session: {session_id}")
        output_lines.append(f"**Started**: {format_timestamp(first_ts)}")
        output_lines.append("")
        messages = extract_transcript(filepath)
        if not messages:
            output_lines.append("*(No text messages extracted)*")
            output_lines.append("")
            continue
        for msg in messages:
            ts = format_timestamp(msg["timestamp"])
            role = "**Vicky**" if msg["role"] == "user" else "**Assistant**"
            text = msg["text"]
            # Truncate very long assistant messages (handoff content etc)
            if msg["role"] == "assistant" and len(text) > 2000:
                text = text[:500] + "\n\n*[... truncated — full content in handoff/file ...]*"
            output_lines.append(f"[{ts}] {role}: {text}")
            output_lines.append("")

    result = "\n".join(output_lines)
    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # The transcript contains non-ASCII text, so pin the encoding instead
        # of relying on the locale default.
        output_path.write_text(result, encoding="utf-8")
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        print(result)
# Run the extractor only when this file is executed as a script.
if __name__ == "__main__":
    main()