Implement idle timeout reaper and liveness visibility for hung runs

Add lastOutputAt tracking on heartbeat runs, updated from adapter stdout/stderr
with batched DB writes (30s debounce). The orphan reaper now checks running
processes for idle timeout: warns at 10 min, kills (SIGTERM/SIGKILL) at 15 min.

- DB: add last_output_at column to heartbeat_runs (migration 0046)
- Server: idle warning/kill logic in reapOrphanedRuns, lastOutputAt flush in onLog
- Server: clear idle_warning on activity report (clearDetachedRunWarning)
- API: expose errorCode and lastOutputAt on active/recent run endpoints
- UI: orange idle warning badge on ActiveAgentsPanel and LiveRunWidget
- UI: show last output relative time in LiveRunWidget
- UI: add idle_timeout status color

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Devin Foley 2026-03-26 23:05:47 -07:00
parent af5b980362
commit 1b70091dcd
12 changed files with 12040 additions and 8 deletions

View file

@ -0,0 +1 @@
ALTER TABLE "heartbeat_runs" ADD COLUMN "last_output_at" timestamp with time zone;

File diff suppressed because it is too large Load diff

View file

@ -323,6 +323,13 @@
"when": 1774530504348,
"tag": "0045_workable_shockwave",
"breakpoints": true
},
{
"idx": 46,
"version": "7",
"when": 1774589600897,
"tag": "0046_lethal_starfox",
"breakpoints": true
}
]
}

View file

@ -33,6 +33,7 @@ export const heartbeatRuns = pgTable(
externalRunId: text("external_run_id"),
processPid: integer("process_pid"),
processStartedAt: timestamp("process_started_at", { withTimezone: true }),
lastOutputAt: timestamp("last_output_at", { withTimezone: true }),
retryOfRunId: uuid("retry_of_run_id").references((): AnyPgColumn => heartbeatRuns.id, {
onDelete: "set null",
}),

View file

@ -35,6 +35,7 @@ export interface HeartbeatRun {
externalRunId: string | null;
processPid: number | null;
processStartedAt: Date | null;
lastOutputAt: Date | null;
retryOfRunId: string | null;
processLossRetryCount: number;
contextSnapshot: Record<string, unknown> | null;

View file

@ -73,6 +73,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
includeIssue?: boolean;
runErrorCode?: string | null;
runError?: string | null;
lastOutputAt?: Date | null;
startedAt?: Date | null;
}) {
const companyId = randomUUID();
const agentId = randomUUID();

View file

@ -2083,6 +2083,8 @@ export function agentRoutes(db: Db) {
agentId: heartbeatRuns.agentId,
agentName: agentsTable.name,
adapterType: agentsTable.adapterType,
errorCode: heartbeatRuns.errorCode,
lastOutputAt: heartbeatRuns.lastOutputAt,
issueId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'issueId'`.as("issueId"),
};
@ -2250,6 +2252,8 @@ export function agentRoutes(db: Db) {
agentId: heartbeatRuns.agentId,
agentName: agentsTable.name,
adapterType: agentsTable.adapterType,
errorCode: heartbeatRuns.errorCode,
lastOutputAt: heartbeatRuns.lastOutputAt,
})
.from(heartbeatRuns)
.innerJoin(agentsTable, eq(heartbeatRuns.agentId, agentsTable.id))

View file

@ -63,6 +63,10 @@ const HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT = 1;
const HEARTBEAT_MAX_CONCURRENT_RUNS_MAX = 10;
const DEFERRED_WAKE_CONTEXT_KEY = "_paperclipWakeContext";
const DETACHED_PROCESS_ERROR_CODE = "process_detached";
const IDLE_WARNING_ERROR_CODE = "idle_warning";
const IDLE_WARNING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
const IDLE_KILL_THRESHOLD_MS = 15 * 60 * 1000; // 15 minutes
const IDLE_KILL_GRACE_SEC = 10; // seconds between SIGTERM and SIGKILL
const startLocksByAgent = new Map<string, Promise<void>>();
const REPO_ONLY_CWD_SENTINEL = "/__paperclip_repo_only__";
const MANAGED_WORKSPACE_GIT_CLONE_TIMEOUT_MS = 10 * 60 * 1000;
@ -1484,23 +1488,30 @@ export function heartbeatService(db: Db) {
}
async function clearDetachedRunWarning(runId: string) {
// Clear both detached process warnings and idle warnings when activity is reported
const updated = await db
.update(heartbeatRuns)
.set({
error: null,
errorCode: null,
lastOutputAt: new Date(),
updatedAt: new Date(),
})
.where(and(eq(heartbeatRuns.id, runId), eq(heartbeatRuns.status, "running"), eq(heartbeatRuns.errorCode, DETACHED_PROCESS_ERROR_CODE)))
.where(and(
eq(heartbeatRuns.id, runId),
eq(heartbeatRuns.status, "running"),
inArray(heartbeatRuns.errorCode, [DETACHED_PROCESS_ERROR_CODE, IDLE_WARNING_ERROR_CODE]),
))
.returning()
.then((rows) => rows[0] ?? null);
if (!updated) return null;
const wasIdle = updated.errorCode === null; // errorCode was cleared
await appendRunEvent(updated, await nextRunEventSeq(updated.id), {
eventType: "lifecycle",
stream: "system",
level: "info",
message: "Detached child process reported activity; cleared detached warning",
message: "Activity reported; cleared run warning",
});
return updated;
}
@ -1826,7 +1837,107 @@ export function heartbeatService(db: Db) {
if (reaped.length > 0) {
logger.warn({ reapedCount: reaped.length, runIds: reaped }, "reaped orphaned heartbeat runs");
}
return { reaped: reaped.length, runIds: reaped };
// ── Idle-timeout pass: check all running runs for stalled output ──
const idleWarned: string[] = [];
const idleKilled: string[] = [];
const allRunningRuns = await db
.select({
run: heartbeatRuns,
adapterType: agents.adapterType,
})
.from(heartbeatRuns)
.innerJoin(agents, eq(heartbeatRuns.agentId, agents.id))
.where(eq(heartbeatRuns.status, "running"));
for (const { run, adapterType } of allRunningRuns) {
const tracksLocalChild = isTrackedLocalChildProcessAdapter(adapterType);
if (!tracksLocalChild) continue;
// Determine the most recent activity timestamp
const lastActivity = run.lastOutputAt
? new Date(run.lastOutputAt).getTime()
: run.startedAt
? new Date(run.startedAt).getTime()
: new Date(run.createdAt).getTime();
const idleMs = now.getTime() - lastActivity;
if (idleMs >= IDLE_KILL_THRESHOLD_MS) {
// Kill: SIGTERM then SIGKILL
const tracked = runningProcesses.get(run.id);
const pid = tracked?.child?.pid ?? run.processPid;
const killMessage = `Idle timeout: no output for ${Math.round(idleMs / 60_000)} minutes — killing process (pid ${pid ?? "unknown"})`;
logger.warn({ runId: run.id, pid, idleMs }, killMessage);
if (tracked?.child) {
tracked.child.kill("SIGTERM");
setTimeout(() => {
try { if (!tracked.child.killed) tracked.child.kill("SIGKILL"); } catch {}
}, IDLE_KILL_GRACE_SEC * 1000);
} else if (pid && isProcessAlive(pid)) {
try { process.kill(pid, "SIGTERM"); } catch {}
setTimeout(() => {
try { if (isProcessAlive(pid)) process.kill(pid, "SIGKILL"); } catch {}
}, IDLE_KILL_GRACE_SEC * 1000);
}
// Mark the run as failed
let killedRun = await setRunStatus(run.id, "failed", {
error: killMessage,
errorCode: "idle_timeout",
finishedAt: now,
});
await setWakeupStatus(run.wakeupRequestId, "failed", {
finishedAt: now,
error: killMessage,
});
if (!killedRun) killedRun = await getRun(run.id);
if (killedRun) {
await appendRunEvent(killedRun, await nextRunEventSeq(killedRun.id), {
eventType: "lifecycle",
stream: "system",
level: "error",
message: killMessage,
payload: { pid, idleMs },
});
await releaseIssueExecutionAndPromote(killedRun);
}
await finalizeAgentStatus(run.agentId, "failed");
await startNextQueuedRunForAgent(run.agentId);
runningProcesses.delete(run.id);
idleKilled.push(run.id);
} else if (idleMs >= IDLE_WARNING_THRESHOLD_MS && run.errorCode !== IDLE_WARNING_ERROR_CODE) {
// Warning: approaching idle timeout
const warnMessage = `Idle warning: no output for ${Math.round(idleMs / 60_000)} minutes — will be killed at ${IDLE_KILL_THRESHOLD_MS / 60_000} min`;
logger.warn({ runId: run.id, idleMs }, warnMessage);
const warnedRun = await setRunStatus(run.id, "running", {
error: warnMessage,
errorCode: IDLE_WARNING_ERROR_CODE,
});
if (warnedRun) {
await appendRunEvent(warnedRun, await nextRunEventSeq(warnedRun.id), {
eventType: "lifecycle",
stream: "system",
level: "warn",
message: warnMessage,
payload: { idleMs },
});
}
idleWarned.push(run.id);
}
}
if (idleWarned.length > 0) {
logger.info({ count: idleWarned.length, runIds: idleWarned }, "idle-warned runs");
}
if (idleKilled.length > 0) {
logger.warn({ count: idleKilled.length, runIds: idleKilled }, "idle-killed runs");
}
return { reaped: reaped.length, runIds: reaped, idleWarned: idleWarned.length, idleKilled: idleKilled.length };
}
async function resumeQueuedRuns() {
@ -2397,12 +2508,30 @@ export function heartbeatService(db: Db) {
.where(eq(heartbeatRuns.id, runId));
const currentUserRedactionOptions = await getCurrentUserRedactionOptions();
let lastOutputAtFlushPending = false;
const onLog = async (stream: "stdout" | "stderr", chunk: string) => {
const sanitizedChunk = redactCurrentUserText(chunk, currentUserRedactionOptions);
if (stream === "stdout") stdoutExcerpt = appendExcerpt(stdoutExcerpt, sanitizedChunk);
if (stream === "stderr") stderrExcerpt = appendExcerpt(stderrExcerpt, sanitizedChunk);
const ts = new Date().toISOString();
// Batch lastOutputAt writes — flush at most once per 30 seconds to avoid DB churn
if (!lastOutputAtFlushPending) {
lastOutputAtFlushPending = true;
setTimeout(() => {
lastOutputAtFlushPending = false;
db.update(heartbeatRuns)
.set({ lastOutputAt: new Date(), updatedAt: new Date() })
.where(eq(heartbeatRuns.id, runId))
.then(() => {})
.catch((err) => logger.warn({ err, runId }, "failed to flush lastOutputAt"));
}, 30_000);
// Also flush immediately on first output
await db.update(heartbeatRuns)
.set({ lastOutputAt: new Date(ts), updatedAt: new Date() })
.where(eq(heartbeatRuns.id, runId));
}
if (handle) {
await runLogStore.append(handle, {
stream,

View file

@ -23,6 +23,8 @@ export interface LiveRunForIssue {
agentId: string;
agentName: string;
adapterType: string;
errorCode?: string | null;
lastOutputAt?: string | null;
issueId?: string | null;
}

View file

@ -92,9 +92,11 @@ function AgentRunCard({
return (
<div className={cn(
"flex h-[320px] flex-col overflow-hidden rounded-xl border shadow-sm",
isActive
? "border-cyan-500/25 bg-cyan-500/[0.04] shadow-[0_16px_40px_rgba(6,182,212,0.08)]"
: "border-border bg-background/70",
run.errorCode === "idle_warning"
? "border-orange-500/25 bg-orange-500/[0.04] shadow-[0_16px_40px_rgba(249,115,22,0.08)]"
: isActive
? "border-cyan-500/25 bg-cyan-500/[0.04] shadow-[0_16px_40px_rgba(6,182,212,0.08)]"
: "border-border bg-background/70",
)}>
<div className="border-b border-border/60 px-3 py-3">
<div className="flex items-start justify-between gap-2">
@ -112,6 +114,11 @@ function AgentRunCard({
</div>
<div className="mt-2 flex items-center gap-2 text-[11px] text-muted-foreground">
<span>{isActive ? "Live now" : run.finishedAt ? `Finished ${relativeTime(run.finishedAt)}` : `Started ${relativeTime(run.createdAt)}`}</span>
{run.errorCode === "idle_warning" && (
<span className="rounded-full border border-orange-500/20 bg-orange-500/[0.06] px-1.5 py-0.5 text-[10px] font-medium text-orange-700 dark:text-orange-300">
Idle
</span>
)}
</div>
</div>

View file

@ -4,7 +4,7 @@ import { useQuery, useQueryClient } from "@tanstack/react-query";
import { heartbeatsApi, type LiveRunForIssue } from "../api/heartbeats";
import { queryKeys } from "../lib/queryKeys";
import { formatDateTime } from "../lib/utils";
import { ExternalLink, Square } from "lucide-react";
import { AlertTriangle, ExternalLink, Square } from "lucide-react";
import { Identity } from "./Identity";
import { StatusBadge } from "./StatusBadge";
import { RunTranscriptView } from "./transcript/RunTranscriptView";
@ -24,6 +24,15 @@ function isRunActive(status: string): boolean {
return status === "queued" || status === "running";
}
function formatRelativeTime(iso: string): string {
const ms = Date.now() - new Date(iso).getTime();
if (ms < 60_000) return "just now";
const min = Math.floor(ms / 60_000);
if (min < 60) return `${min}m ago`;
const hr = Math.floor(min / 60);
return `${hr}h ${min % 60}m ago`;
}
export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
const queryClient = useQueryClient();
const [cancellingRunIds, setCancellingRunIds] = useState(new Set<string>());
@ -116,7 +125,18 @@ export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
{run.id.slice(0, 8)}
</Link>
<StatusBadge status={run.status} />
<span>{formatDateTime(run.startedAt ?? run.createdAt)}</span>
{run.errorCode === "idle_warning" && (
<span className="inline-flex items-center gap-1 rounded-full border border-orange-500/20 bg-orange-500/[0.06] px-2 py-0.5 text-[11px] font-medium text-orange-700 dark:text-orange-300">
<AlertTriangle className="h-3 w-3" />
Idle
</span>
)}
{run.lastOutputAt && (
<span title={`Last output: ${formatDateTime(run.lastOutputAt)}`}>
Last output {formatRelativeTime(run.lastOutputAt)}
</span>
)}
{!run.lastOutputAt && <span>{formatDateTime(run.startedAt ?? run.createdAt)}</span>}
</div>
</div>

View file

@ -55,6 +55,7 @@ export const statusBadge: Record<string, string> = {
// Run statuses
failed: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
timed_out: "bg-orange-100 text-orange-700 dark:bg-orange-900/50 dark:text-orange-300",
idle_timeout: "bg-orange-100 text-orange-700 dark:bg-orange-900/50 dark:text-orange-300",
succeeded: "bg-green-100 text-green-700 dark:bg-green-900/50 dark:text-green-300",
error: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
terminated: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",