From 88cd339a547f4fbf68825d4d157ba5583efb95b9 Mon Sep 17 00:00:00 2001 From: Mikkel Georgsen Date: Wed, 4 Feb 2026 23:20:04 +0000 Subject: [PATCH] docs(03): create phase plan for lifecycle management Phase 03: Lifecycle Management - 2 plans in 2 waves - Plan 01 (wave 1): Idle timer module + session metadata + PID tracking - Plan 02 (wave 2): Suspend/resume wiring, /timeout, /sessions, startup cleanup, graceful shutdown - Ready for execution Co-Authored-By: Claude Opus 4.5 --- .planning/ROADMAP.md | 9 +- .../03-lifecycle-management/03-01-PLAN.md | 133 ++++++++ .../03-lifecycle-management/03-02-PLAN.md | 311 ++++++++++++++++++ 3 files changed, 449 insertions(+), 4 deletions(-) create mode 100644 .planning/phases/03-lifecycle-management/03-01-PLAN.md create mode 100644 .planning/phases/03-lifecycle-management/03-02-PLAN.md diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 95da1e6..c7313ae 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -63,10 +63,11 @@ Plans: 3. User can change idle timeout via `/timeout ` command 4. User can list all sessions with last activity timestamp via `/sessions` command 5. Bot restart leaves no zombie processes (systemd KillMode handles cleanup) -**Plans**: TBD +**Plans:** 2 plans Plans: -- [ ] TBD +- [ ] 03-01-PLAN.md -- Idle timer module + session metadata extensions + PID tracking +- [ ] 03-02-PLAN.md -- Suspend/resume wiring, /timeout, /sessions, startup cleanup, graceful shutdown ### Phase 4: Output Modes **Goal**: Users control response verbosity and format based on context @@ -84,11 +85,11 @@ Plans: ## Progress **Execution Order:** -Phases execute in numeric order: 1 → 2 → 3 → 4 +Phases execute in numeric order: 1 -> 2 -> 3 -> 4 | Phase | Plans Complete | Status | Completed | |-------|----------------|--------|-----------| | 1. Session & Process Foundation | 3/3 | Complete | 2026-02-04 | | 2. Telegram Integration | 2/2 | Complete | 2026-02-04 | -| 3. Lifecycle Management | 0/TBD | Not started | - | +| 3. 
Lifecycle Management | 0/2 | In progress | - | | 4. Output Modes | 0/TBD | Not started | - | diff --git a/.planning/phases/03-lifecycle-management/03-01-PLAN.md b/.planning/phases/03-lifecycle-management/03-01-PLAN.md new file mode 100644 index 0000000..f5bf215 --- /dev/null +++ b/.planning/phases/03-lifecycle-management/03-01-PLAN.md @@ -0,0 +1,133 @@ +--- +phase: 03-lifecycle-management +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - telegram/idle_timer.py + - telegram/session_manager.py + - telegram/claude_subprocess.py +autonomous: true + +must_haves: + truths: + - "Per-session idle timer fires callback after configurable timeout seconds" + - "Timer resets on activity (cancel + restart)" + - "Session metadata includes idle_timeout field (default 600s)" + - "ClaudeSubprocess exposes its PID for metadata tracking" + artifacts: + - path: "telegram/idle_timer.py" + provides: "SessionIdleTimer class with asyncio-based per-session idle timers" + min_lines: 60 + - path: "telegram/session_manager.py" + provides: "Session metadata with idle_timeout field, PID tracking" + contains: "idle_timeout" + - path: "telegram/claude_subprocess.py" + provides: "PID property for external access" + contains: "def pid" + key_links: + - from: "telegram/idle_timer.py" + to: "asyncio.create_task" + via: "Background sleep task with cancellation" + pattern: "asyncio\\.create_task.*_wait_for_timeout" + - from: "telegram/session_manager.py" + to: "metadata.json" + via: "idle_timeout stored in session metadata" + pattern: "idle_timeout" +--- + + +Create the idle timer module and extend session metadata for lifecycle management. + +Purpose: Foundation components needed before wiring suspend/resume into the bot. The idle timer provides per-session timeout detection, and metadata extensions store timeout configuration and subprocess PIDs. 
+Output: New `idle_timer.py` module, updated `session_manager.py` and `claude_subprocess.py` + + + +@/home/mikkel/.claude/get-shit-done/workflows/execute-plan.md +@/home/mikkel/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/03-lifecycle-management/03-CONTEXT.md +@.planning/phases/03-lifecycle-management/03-RESEARCH.md +@telegram/idle_timer.py (will be created) +@telegram/session_manager.py +@telegram/claude_subprocess.py + + + + + + Task 1: Create SessionIdleTimer module + telegram/idle_timer.py + +Create `telegram/idle_timer.py` with a `SessionIdleTimer` class that manages per-session idle timeouts using asyncio. + +Class design: +- `__init__(self, session_name: str, timeout_seconds: int, on_timeout: Callable[[str], Awaitable[None]])` -- stores config, initializes _timer_task to None, _last_activity to now (UTC) +- `reset(self)` -- updates _last_activity to now, cancels existing _timer_task if running, creates new asyncio.create_task(_wait_for_timeout()) +- `async _wait_for_timeout(self)` -- awaits asyncio.sleep(self.timeout_seconds), then calls `await self.on_timeout(self.session_name)`. Catches asyncio.CancelledError silently (timer was reset). +- `cancel(self)` -- cancels _timer_task if running (used on shutdown/archive) +- `@property seconds_since_activity` -- returns float seconds since _last_activity +- `@property last_activity` -- returns the datetime of last activity (for /sessions display) + +Use `datetime.now(timezone.utc)` for timestamps. Import typing for Callable, Optional, Awaitable. + +Add module docstring explaining this is the idle timeout manager for session lifecycle. Log timer start/cancel/fire events at DEBUG level, timeout firing at INFO level. + + +`python3 -c "from idle_timer import SessionIdleTimer; print('import OK')"` run from telegram/ directory succeeds. 
+ + SessionIdleTimer class exists with reset(), cancel(), _wait_for_timeout(), seconds_since_activity, and last_activity. Imports cleanly. + + + + Task 2: Extend session metadata and subprocess PID tracking + telegram/session_manager.py, telegram/claude_subprocess.py + +**session_manager.py changes:** + +1. In `create_session()`, add `"idle_timeout": 600` (10 minutes default) to the initial metadata dict (alongside existing fields like name, created, last_active, persona, pid, status). + +2. Add a helper method `get_session_timeout(self, name: str) -> int` that reads metadata and returns `metadata.get('idle_timeout', 600)`. This provides a clean interface for the bot to query timeout values. + +3. No changes to list_sessions() -- it already returns full metadata which will now include idle_timeout. + +**claude_subprocess.py changes:** + +1. Add a `@property pid(self) -> Optional[int]` that returns `self._process.pid if self._process and self._process.returncode is None else None`. This lets the bot store the PID in session metadata for orphan cleanup on restart. + +2. In `start()`, after successful subprocess spawn, store the PID in a `self._pid` attribute as well (for access even after process terminates, useful for logging). Keep the property returning live PID only. + +These are minimal, targeted changes. Do NOT refactor existing code. Do NOT change the terminate() method or any existing logic. + + +`python3 -c "from session_manager import SessionManager; sm = SessionManager(); print('SM OK')"` and `python3 -c "from claude_subprocess import ClaudeSubprocess; print('CS OK')"` both succeed from telegram/ directory. + + Session metadata includes idle_timeout (default 600s). SessionManager has get_session_timeout() method. ClaudeSubprocess has pid property returning live process PID. 
+ + + + + +- `cd ~/homelab/telegram && python3 -c "from idle_timer import SessionIdleTimer; from session_manager import SessionManager; from claude_subprocess import ClaudeSubprocess; print('All imports OK')"` +- SessionIdleTimer has reset(), cancel(), seconds_since_activity, last_activity +- SessionManager.get_session_timeout() returns int +- ClaudeSubprocess.pid returns Optional[int] + + + +- idle_timer.py exists with SessionIdleTimer class implementing asyncio-based per-session idle timeout +- session_manager.py creates sessions with idle_timeout=600 in metadata and has get_session_timeout() helper +- claude_subprocess.py exposes pid property for PID tracking +- All three modules import without errors + + + +After completion, create `.planning/phases/03-lifecycle-management/03-01-SUMMARY.md` + diff --git a/.planning/phases/03-lifecycle-management/03-02-PLAN.md b/.planning/phases/03-lifecycle-management/03-02-PLAN.md new file mode 100644 index 0000000..fa400d0 --- /dev/null +++ b/.planning/phases/03-lifecycle-management/03-02-PLAN.md @@ -0,0 +1,311 @@ +--- +phase: 03-lifecycle-management +plan: 02 +type: execute +wave: 2 +depends_on: ["03-01"] +files_modified: + - telegram/bot.py +autonomous: true + +must_haves: + truths: + - "Session suspends automatically after idle timeout (subprocess terminated, status set to suspended)" + - "User message to suspended session resumes it with --continue and shows 'Resuming session...' 
status" + - "Resume failure sends error to user and does not auto-create fresh session" + - "Race between timeout-fire and user-message is prevented by asyncio.Lock" + - "Bot startup kills orphaned subprocess PIDs and sets all sessions to suspended" + - "Bot shutdown terminates all subprocesses gracefully (SIGTERM + 5s timeout + SIGKILL)" + - "/timeout sets per-session idle timeout (1-120 range)" + - "/sessions lists all sessions with status indicator, persona, and last active time" + artifacts: + - path: "telegram/bot.py" + provides: "Suspend/resume wiring, idle timers, /timeout, /sessions, startup cleanup, graceful shutdown" + contains: "idle_timers" + key_links: + - from: "telegram/bot.py" + to: "telegram/idle_timer.py" + via: "import and instantiate SessionIdleTimer per session" + pattern: "from idle_timer import SessionIdleTimer" + - from: "telegram/bot.py on_complete callback" + to: "idle_timer.reset()" + via: "Timer starts after Claude finishes processing" + pattern: "idle_timers.*reset" + - from: "telegram/bot.py handle_message" + to: "resume logic" + via: "Detect suspended session, spawn with --continue, send status" + pattern: "Resuming session" + - from: "telegram/bot.py suspend_session" + to: "ClaudeSubprocess.terminate()" + via: "Idle timer fires, terminates subprocess" + pattern: "await.*terminate" +--- + + +Wire suspend/resume lifecycle, idle timers, new commands, and cleanup into the bot. + +Purpose: This is the core integration plan that makes sessions automatically suspend after idle timeout, resume transparently on user message, and provides /timeout + /sessions commands. Also adds startup orphan cleanup and graceful shutdown signal handling. 
+Output: Updated `bot.py` with full lifecycle management + + + +@/home/mikkel/.claude/get-shit-done/workflows/execute-plan.md +@/home/mikkel/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/03-lifecycle-management/03-CONTEXT.md +@.planning/phases/03-lifecycle-management/03-RESEARCH.md +@.planning/phases/03-lifecycle-management/03-01-SUMMARY.md +@telegram/bot.py +@telegram/idle_timer.py +@telegram/session_manager.py +@telegram/claude_subprocess.py + + + + + + Task 1: Suspend/resume wiring with race locks, startup cleanup, and graceful shutdown + telegram/bot.py + +This is the core lifecycle wiring in bot.py. Make these changes: + +**New imports and globals:** +- `import signal, os` (for shutdown handlers and PID checks) +- `from idle_timer import SessionIdleTimer` +- Add global dict: `idle_timers: dict[str, SessionIdleTimer] = {}` +- Add global dict: `subprocess_locks: dict[str, asyncio.Lock] = {}` (one lock per session, prevents races between timeout-fire and user-message) + +**Helper: get_subprocess_lock(session_name)** +- Returns existing lock or creates new one for session. Pattern: `subprocess_locks.setdefault(session_name, asyncio.Lock())` + +**Suspend function: `async def suspend_session(session_name: str)`** +- This is the idle timer's on_timeout callback. +- Acquire the session's subprocess lock. +- Check if subprocess exists and is_alive. If not alive, just update metadata and return. +- Check `subprocesses[session_name].is_busy` -- if busy, DON'T suspend (Claude is mid-processing). Instead, reset the idle timer to try again later. Log this. Return. +- Store the subprocess PID for logging. +- Call `await subprocesses[session_name].terminate()` (existing method with SIGTERM + timeout + SIGKILL). +- Remove from `subprocesses` dict. 
+- Flush and remove batcher if exists: `if session_name in batchers: await batchers[session_name].flush_immediately(); del batchers[session_name]` +- Update session metadata: `session_manager.update_session(session_name, status='suspended', pid=None)` +- Cancel and remove idle timer: `if session_name in idle_timers: idle_timers[session_name].cancel(); del idle_timers[session_name]` +- Log: `logger.info(f"Session '{session_name}' suspended after idle timeout")` +- DECISION (from CONTEXT.md): Silent suspension -- do NOT send any Telegram message. + +**Modify make_callbacks() -- add on_complete idle timer integration:** +- The `on_complete` callback already exists. Wrap it: after existing logic (stop typing), add idle timer reset: + ```python + # Reset idle timer (only start counting AFTER Claude finishes) + if session_name in idle_timers: + idle_timers[session_name].reset() + ``` +- This ensures timer only starts when Claude is truly idle, never during processing. + +**Modify handle_message() -- add resume logic:** +- After checking for active session, BEFORE the subprocess check, add: + ```python + # Acquire lock to prevent race with suspend_session + lock = get_subprocess_lock(active_session) + async with lock: + ``` + Wrap the subprocess get-or-create and message send in this lock. +- Inside the lock, when subprocess is not alive: + 1. Check if session has `.claude/` dir (has history). If yes, this is a resume. + 2. If resuming: send status message to user: `"Resuming session..."` (include idle duration if >1 min from metadata last_active). Example: `"Resuming session (idle for 15 min)..."` + 3. Spawn subprocess normally (the existing ClaudeSubprocess constructor + start() already handles --continue when .claude/ exists). + 4. 
Store PID in metadata: `session_manager.update_session(active_session, status='active', last_active=now_iso, pid=subprocesses[active_session].pid)` +- After sending message (outside lock), create/reset idle timer for the session: + ```python + timeout_secs = session_manager.get_session_timeout(active_session) + if active_session not in idle_timers: + idle_timers[active_session] = SessionIdleTimer(active_session, timeout_secs, on_timeout=suspend_session) + # Don't reset here -- timer resets in on_complete when Claude finishes + ``` +- IMPORTANT: Also reset the idle timer when user sends a message (user activity should reset timer too, per CONTEXT.md): + ```python + if active_session in idle_timers: + idle_timers[active_session].reset() + ``` + Put this BEFORE sending to subprocess (so timer is reset even if message queues). + +**Similarly update handle_photo() and handle_document():** +- Add the same lock acquisition, resume detection, and idle timer reset as handle_message(). +- Keep the existing photo/document save and notification logic. + +**Modify new_session() -- initialize idle timer after creation:** +- After subprocess creation, add: + ```python + timeout_secs = session_manager.get_session_timeout(name) + idle_timers[name] = SessionIdleTimer(name, timeout_secs, on_timeout=suspend_session) + ``` +- Store PID in metadata: after subprocess is created/started, `session_manager.update_session(name, pid=subprocesses[name].pid)` (only after start()). + Note: The existing code creates ClaudeSubprocess but does NOT call start() -- start happens lazily on first send_message. So PID tracking happens in handle_message when subprocess auto-starts. + +**Modify switch_session_cmd():** +- Per CONTEXT.md LOCKED decision: switching sessions leaves previous subprocess running (it suspends on its own timer). Do NOT cancel old session's idle timer. +- When auto-spawning subprocess for new session, set up idle timer as above. 
+ +**Modify archive_session_cmd():** +- Cancel idle timer if exists: `if name in idle_timers: idle_timers[name].cancel(); del idle_timers[name]` +- Remove subprocess lock if exists: `subprocess_locks.pop(name, None)` + +**Modify model_cmd():** +- After terminating subprocess for model change, cancel idle timer: `if active_session in idle_timers: idle_timers[active_session].cancel(); del idle_timers[active_session]` + +**Startup cleanup function: `async def cleanup_orphaned_subprocesses()`** +- Called once at bot startup (before polling starts). +- Iterate all sessions via `session_manager.list_sessions()`. +- For each session with a non-None `pid`: + 1. Check if PID process exists: `os.kill(pid, 0)` wrapped in try/except ProcessLookupError. + 2. If process exists, verify it's a claude process: read `/proc/{pid}/cmdline`, check if "claude" is in it. If not claude, skip killing. + 3. If it IS a claude process: `os.kill(pid, signal.SIGTERM)`, sleep 2s, then try `os.kill(pid, signal.SIGKILL)` (catch ProcessLookupError if already dead). + 4. Update metadata: `session_manager.update_session(session['name'], pid=None, status='suspended')` +- For sessions with status != 'suspended' and no pid, also set status to 'suspended'. +- Log summary: "Cleaned up N orphaned subprocesses" + +**Graceful shutdown:** +- python-telegram-bot's `Application.run_polling()` handles signal installation internally. 
Instead of overriding signal handlers (which conflicts with the library), use the `post_shutdown` callback: + ```python + async def post_shutdown(application): + """Clean up subprocesses and timers on bot shutdown.""" + logger.info("Bot shutting down, cleaning up...") + + # Cancel all idle timers + for name, timer in idle_timers.items(): + timer.cancel() + + # Terminate all subprocesses + for name, proc in list(subprocesses.items()): + if proc.is_alive: + logger.info(f"Terminating subprocess for '{name}'") + await proc.terminate() + + logger.info("Cleanup complete") + ``` +- Register in main(): `app.post_shutdown = post_shutdown` +- Also add a `post_init` callback for startup cleanup: + ```python + async def post_init(application): + """Run startup cleanup.""" + await cleanup_orphaned_subprocesses() + ``` + Register: `app = Application.builder().token(TOKEN).post_init(post_init).build()` + +**Update help text:** +- Add `/timeout <minutes>` and `/sessions` to the help_command text under "Claude Sessions" section. + + +`python3 -c "import bot"` from telegram/ directory should not error (syntax check). Look for: idle_timers dict, subprocess_locks dict, suspend_session function, cleanup_orphaned_subprocesses function, post_shutdown callback. + + +- suspend_session() terminates subprocess on idle timeout, updates metadata to suspended, silent (no Telegram notification) +- handle_message() detects suspended session, sends "Resuming session..." status, spawns with --continue +- Race lock prevents concurrent suspend + resume on same session +- Startup cleanup kills orphaned PIDs verified via /proc/cmdline +- Graceful shutdown terminates all subprocesses and cancels all timers +- handle_photo/handle_document also support resume from suspended state + + + + + Task 2: /timeout and /sessions commands + telegram/bot.py + +Add two new command handlers to bot.py: + +**/timeout command: `async def timeout_cmd(update, context)`** +- Auth check (same pattern as other commands). 
+- If no active session: reply "No active session. Use /new to start one." +- If no args: show current timeout. + ```python + timeout_secs = session_manager.get_session_timeout(active_session) + minutes = timeout_secs // 60 + await update.message.reply_text(f"Idle timeout: {minutes} minutes\n\nUsage: /timeout <minutes> (1-120)") + ``` +- If args: parse first arg as int. + - Validate range 1-120. If out of range: `"Timeout must be between 1 and 120 minutes"` + - If not a valid int: `"Invalid number. Usage: /timeout <minutes>"` + - Convert to seconds: `timeout_seconds = minutes * 60` + - Update session metadata: `session_manager.update_session(active_session, idle_timeout=timeout_seconds)` + - If idle timer exists for this session, update its timeout_seconds attribute and reset: `idle_timers[active_session].timeout_seconds = timeout_seconds; idle_timers[active_session].reset()` + - Reply: `f"Idle timeout set to {minutes} minutes for session '{active_session}'."` + +**/sessions command: `async def sessions_cmd(update, context)`** +- Auth check. +- Get all sessions: `session_manager.list_sessions()` (already sorted by last_active desc). +- If empty: reply "No sessions. Use /new to create one." +- Build formatted list. For each session: + - Status indicator: active subprocess running -> "LIVE", status == "active" (in metadata) -> "ACTIVE", status == "suspended" -> "IDLE", else -> status + - LIVE must reflect the real subprocess state rather than metadata: `name in subprocesses and subprocesses[name].is_alive` -> "LIVE" + - Format last_active as relative time (e.g., "2m ago", "1h ago", "3d ago") using a small helper function: + ```python + def format_relative_time(iso_str): + dt = datetime.fromisoformat(iso_str) + delta = datetime.now(timezone.utc) - dt + secs = delta.total_seconds() + if secs < 60: return "just now" + if secs < 3600: return f"{int(secs/60)}m ago" + if secs < 86400: return f"{int(secs/3600)}h ago" + return f"{int(secs/86400)}d ago" + ``` + - Mark current active session with arrow prefix. 
+ - Format line: `"{marker}{status_emoji} {name} ({persona}) - {relative_time}"` + - Status emojis: LIVE -> green circle, IDLE/suspended -> white circle +- Join lines, reply with parse_mode='Markdown'. Use backticks around session names for monospace. + +**Register handlers in main():** +- `app.add_handler(CommandHandler("timeout", timeout_cmd))` -- after the model handler +- `app.add_handler(CommandHandler("sessions", sessions_cmd))` -- after the session handler + +**Update help text in help_command():** +- Under "Claude Sessions" section, add: + - `/sessions` - List all sessions with status + - `/timeout <minutes>` - Set idle timeout (1-120) + + +`python3 -c "import bot; print('OK')"` succeeds. Grep for "timeout_cmd" and "sessions_cmd" in bot.py to confirm both exist. Grep for "CommandHandler.*timeout" and "CommandHandler.*sessions" to confirm registration. + + +- /timeout shows current timeout when called without args, sets timeout (1-120 min range) when called with arg +- /sessions lists all sessions sorted by last active, showing live/idle status, persona, relative time +- Both commands registered as handlers in main() +- Help text updated with new commands + + + + + + +1. `cd ~/homelab/telegram && python3 -c "import bot; print('All OK')"` -- no import errors +2. Grep for key integration points: + - `grep -n "suspend_session" telegram/bot.py` -- suspend function exists + - `grep -n "idle_timers" telegram/bot.py` -- idle timer dict used + - `grep -n "subprocess_locks" telegram/bot.py` -- race locks exist + - `grep -n "cleanup_orphaned" telegram/bot.py` -- startup cleanup exists + - `grep -n "post_shutdown" telegram/bot.py` -- graceful shutdown exists + - `grep -n "Resuming session" telegram/bot.py` -- resume status message exists + - `grep -n "timeout_cmd\|sessions_cmd" telegram/bot.py` -- new commands exist +3. 
Restart bot service: `systemctl --user restart telegram-bot.service && sleep 2 && systemctl --user status telegram-bot.service` -- should show active + + + +- Session auto-suspends after idle timeout (subprocess terminated, metadata status=suspended, no Telegram notification) +- Message to suspended session shows "Resuming session..." then Claude responds with full history +- If resume fails, error message sent (no auto-fresh-start) +- asyncio.Lock prevents race between timeout-fire and incoming message +- Bot startup kills orphaned subprocess PIDs (verified via /proc/cmdline) +- Bot shutdown terminates all subprocesses gracefully +- /timeout <minutes> sets per-session idle timeout (1-120 minute range), shows current value without args +- /sessions lists all sessions with LIVE/IDLE status, persona, and relative last-active time +- Help text includes new commands +- Bot service restarts cleanly + + + +After completion, create `.planning/phases/03-lifecycle-management/03-02-SUMMARY.md` +