feat(01-03): add timing profiler for response latency debugging

Instruments the full message pipeline with [TIMING] log entries:
- Message age (Telegram delivery delay)
- Subprocess state (cold-start vs reused)
- Process spawn, first stdout, first assistant text
- Telegram send latency, total wall time

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Mikkel Georgsen 2026-02-04 18:30:05 +00:00
parent 2302b75e14
commit d878931e79
2 changed files with 35 additions and 9 deletions

View file

@ -8,6 +8,7 @@ import asyncio
import logging import logging
import os import os
import subprocess import subprocess
import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@ -66,10 +67,13 @@ def is_authorized(user_id: int) -> bool:
def make_callbacks(bot, chat_id): def make_callbacks(bot, chat_id):
"""Create callbacks for ClaudeSubprocess bound to specific chat.""" """Create callbacks for ClaudeSubprocess bound to specific chat."""
async def on_output(text): async def on_output(text):
t0 = time.monotonic()
# Truncate to Telegram limit # Truncate to Telegram limit
if len(text) > 4000: if len(text) > 4000:
text = text[:4000] + "\n... (truncated)" text = text[:4000] + "\n... (truncated)"
await bot.send_message(chat_id=chat_id, text=text) await bot.send_message(chat_id=chat_id, text=text)
elapsed = time.monotonic() - t0
logger.info(f"[TIMING] Telegram send: {elapsed:.3f}s ({len(text)} chars)")
async def on_error(error): async def on_error(error):
await bot.send_message(chat_id=chat_id, text=f"Error: {error}") await bot.send_message(chat_id=chat_id, text=f"Error: {error}")
@ -454,11 +458,15 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
return return
message = update.message.text message = update.message.text
logger.info(f"Routing message to session '{active_session}': {message[:50]}...") t_start = time.monotonic()
# Telegram message timestamp vs now = delivery delay
msg_age = (datetime.utcnow() - update.message.date.replace(tzinfo=None)).total_seconds()
logger.info(f"[TIMING] Message received: session='{active_session}', age={msg_age:.1f}s, text={message[:50]}...")
try: try:
# Get or create subprocess for active session # Get or create subprocess for active session
if active_session not in subprocesses or not subprocesses[active_session].is_alive: already_alive = active_session in subprocesses and subprocesses[active_session].is_alive
if not already_alive:
session_dir = session_manager.get_session_dir(active_session) session_dir = session_manager.get_session_dir(active_session)
session_data = session_manager.get_session(active_session) session_data = session_manager.get_session(active_session)
persona_name = session_data.get('persona', 'default') persona_name = session_data.get('persona', 'default')
@ -477,7 +485,8 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
on_status=callbacks[3], on_status=callbacks[3],
) )
logger.info(f"Auto-spawned subprocess for active session '{active_session}'") subprocess_state = "reused" if already_alive else "cold-start"
logger.info(f"[TIMING] Subprocess: {subprocess_state}, busy={subprocesses[active_session].is_busy}")
# Send message to Claude # Send message to Claude
await subprocesses[active_session].send_message(message) await subprocesses[active_session].send_message(message)

View file

@ -25,6 +25,7 @@ import inspect
import json import json
import logging import logging
import os import os
import time
from pathlib import Path from pathlib import Path
from typing import Callable, Optional from typing import Callable, Optional
@ -139,6 +140,9 @@ class ClaudeSubprocess:
Args: Args:
message: Initial message to send message: Initial message to send
""" """
self._spawn_time = time.monotonic()
self._first_output_time = None
# Build command # Build command
cmd = [ cmd = [
"claude", "claude",
@ -179,9 +183,11 @@ class ClaudeSubprocess:
# Ensure session directory exists # Ensure session directory exists
self._session_dir.mkdir(parents=True, exist_ok=True) self._session_dir.mkdir(parents=True, exist_ok=True)
# Log full command (with model) for debugging
cmd_flags = [c for c in cmd if c.startswith("--")]
logger.info( logger.info(
f"Spawning Claude Code: cwd={self._session_dir}, " f"[TIMING] Spawning: cwd={self._session_dir.name}, "
f"cmd={' '.join(cmd[:4])}... (truncated)" f"flags={' '.join(cmd_flags)}, msg={message[:60]}..."
) )
try: try:
@ -195,7 +201,8 @@ class ClaudeSubprocess:
) )
self._busy = True self._busy = True
logger.info(f"Claude Code spawned: PID={self._process.pid}") elapsed = time.monotonic() - self._spawn_time
logger.info(f"[TIMING] Process spawned: PID={self._process.pid} (+{elapsed:.3f}s)")
# Launch concurrent stream readers # Launch concurrent stream readers
self._reader_task = asyncio.create_task(self._read_streams()) self._reader_task = asyncio.create_task(self._read_streams())
@ -229,7 +236,8 @@ class ClaudeSubprocess:
# Wait for process to terminate # Wait for process to terminate
returncode = await self._process.wait() returncode = await self._process.wait()
logger.info(f"Claude Code process exited: returncode={returncode}") elapsed = time.monotonic() - self._spawn_time
logger.info(f"[TIMING] Process exited: returncode={returncode}, total={elapsed:.3f}s")
# Handle crash (non-zero exit, but not signals from our own termination) # Handle crash (non-zero exit, but not signals from our own termination)
# Negative return codes are signals (e.g., -15 = SIGTERM, -9 = SIGKILL) # Negative return codes are signals (e.g., -15 = SIGTERM, -9 = SIGKILL)
@ -271,6 +279,7 @@ class ClaudeSubprocess:
if not self._process or not self._process.stdout: if not self._process or not self._process.stdout:
return return
first_line = True
try: try:
while True: while True:
line = await self._process.stdout.readline() line = await self._process.stdout.readline()
@ -278,6 +287,10 @@ class ClaudeSubprocess:
break break
line_str = line.decode().rstrip() line_str = line.decode().rstrip()
if line_str: if line_str:
if first_line:
elapsed = time.monotonic() - self._spawn_time
logger.info(f"[TIMING] First stdout line: +{elapsed:.3f}s")
first_line = False
self._handle_stdout_line(line_str) self._handle_stdout_line(line_str)
except Exception as e: except Exception as e:
logger.error(f"Error reading stdout: {e}") logger.error(f"Error reading stdout: {e}")
@ -322,6 +335,10 @@ class ClaudeSubprocess:
if block.get("type") == "text": if block.get("type") == "text":
text = block.get("text", "") text = block.get("text", "")
if text and self.on_output: if text and self.on_output:
if not self._first_output_time:
self._first_output_time = time.monotonic()
elapsed = self._first_output_time - self._spawn_time
logger.info(f"[TIMING] First assistant text: +{elapsed:.3f}s ({len(text)} chars)")
try: try:
if inspect.iscoroutinefunction(self.on_output): if inspect.iscoroutinefunction(self.on_output):
asyncio.create_task(self.on_output(text)) asyncio.create_task(self.on_output(text))
@ -332,9 +349,9 @@ class ClaudeSubprocess:
elif event_type == "result": elif event_type == "result":
# Turn complete # Turn complete
elapsed = time.monotonic() - self._spawn_time
session_id = event.get("session_id") session_id = event.get("session_id")
if session_id: logger.info(f"[TIMING] Result event: +{elapsed:.3f}s (session={session_id})")
logger.debug(f"Turn complete: session_id={session_id}")
# Check for error # Check for error
if event.get("is_error") and self.on_error: if event.get("is_error") and self.on_error: