Implement idle timeout reaper and liveness visibility for hung runs
Add lastOutputAt tracking on heartbeat runs, updated from adapter stdout/stderr with batched DB writes (30s debounce). The orphan reaper now checks running processes for idle timeout: warns at 10 min, kills (SIGTERM/SIGKILL) at 15 min. - DB: add last_output_at column to heartbeat_runs (migration 0046) - Server: idle warning/kill logic in reapOrphanedRuns, lastOutputAt flush in onLog - Server: clear idle_warning on activity report (clearDetachedRunWarning) - API: expose errorCode and lastOutputAt on active/recent run endpoints - UI: orange idle warning badge on ActiveAgentsPanel and LiveRunWidget - UI: show last output relative time in LiveRunWidget - UI: add idle_timeout status color Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
parent
af5b980362
commit
1b70091dcd
12 changed files with 12040 additions and 8 deletions
1
packages/db/src/migrations/0046_lethal_starfox.sql
Normal file
1
packages/db/src/migrations/0046_lethal_starfox.sql
Normal file
|
|
@ -0,0 +1 @@
|
|||
ALTER TABLE "heartbeat_runs" ADD COLUMN "last_output_at" timestamp with time zone;
|
||||
11857
packages/db/src/migrations/meta/0046_snapshot.json
Normal file
11857
packages/db/src/migrations/meta/0046_snapshot.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -323,6 +323,13 @@
|
|||
"when": 1774530504348,
|
||||
"tag": "0045_workable_shockwave",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 46,
|
||||
"version": "7",
|
||||
"when": 1774589600897,
|
||||
"tag": "0046_lethal_starfox",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -33,6 +33,7 @@ export const heartbeatRuns = pgTable(
|
|||
externalRunId: text("external_run_id"),
|
||||
processPid: integer("process_pid"),
|
||||
processStartedAt: timestamp("process_started_at", { withTimezone: true }),
|
||||
lastOutputAt: timestamp("last_output_at", { withTimezone: true }),
|
||||
retryOfRunId: uuid("retry_of_run_id").references((): AnyPgColumn => heartbeatRuns.id, {
|
||||
onDelete: "set null",
|
||||
}),
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ export interface HeartbeatRun {
|
|||
externalRunId: string | null;
|
||||
processPid: number | null;
|
||||
processStartedAt: Date | null;
|
||||
lastOutputAt: Date | null;
|
||||
retryOfRunId: string | null;
|
||||
processLossRetryCount: number;
|
||||
contextSnapshot: Record<string, unknown> | null;
|
||||
|
|
|
|||
|
|
@ -73,6 +73,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
|||
includeIssue?: boolean;
|
||||
runErrorCode?: string | null;
|
||||
runError?: string | null;
|
||||
lastOutputAt?: Date | null;
|
||||
startedAt?: Date | null;
|
||||
}) {
|
||||
const companyId = randomUUID();
|
||||
const agentId = randomUUID();
|
||||
|
|
|
|||
|
|
@ -2083,6 +2083,8 @@ export function agentRoutes(db: Db) {
|
|||
agentId: heartbeatRuns.agentId,
|
||||
agentName: agentsTable.name,
|
||||
adapterType: agentsTable.adapterType,
|
||||
errorCode: heartbeatRuns.errorCode,
|
||||
lastOutputAt: heartbeatRuns.lastOutputAt,
|
||||
issueId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'issueId'`.as("issueId"),
|
||||
};
|
||||
|
||||
|
|
@ -2250,6 +2252,8 @@ export function agentRoutes(db: Db) {
|
|||
agentId: heartbeatRuns.agentId,
|
||||
agentName: agentsTable.name,
|
||||
adapterType: agentsTable.adapterType,
|
||||
errorCode: heartbeatRuns.errorCode,
|
||||
lastOutputAt: heartbeatRuns.lastOutputAt,
|
||||
})
|
||||
.from(heartbeatRuns)
|
||||
.innerJoin(agentsTable, eq(heartbeatRuns.agentId, agentsTable.id))
|
||||
|
|
|
|||
|
|
@ -63,6 +63,10 @@ const HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT = 1;
|
|||
const HEARTBEAT_MAX_CONCURRENT_RUNS_MAX = 10;
|
||||
const DEFERRED_WAKE_CONTEXT_KEY = "_paperclipWakeContext";
|
||||
const DETACHED_PROCESS_ERROR_CODE = "process_detached";
|
||||
const IDLE_WARNING_ERROR_CODE = "idle_warning";
|
||||
const IDLE_WARNING_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
|
||||
const IDLE_KILL_THRESHOLD_MS = 15 * 60 * 1000; // 15 minutes
|
||||
const IDLE_KILL_GRACE_SEC = 10; // seconds between SIGTERM and SIGKILL
|
||||
const startLocksByAgent = new Map<string, Promise<void>>();
|
||||
const REPO_ONLY_CWD_SENTINEL = "/__paperclip_repo_only__";
|
||||
const MANAGED_WORKSPACE_GIT_CLONE_TIMEOUT_MS = 10 * 60 * 1000;
|
||||
|
|
@ -1484,23 +1488,30 @@ export function heartbeatService(db: Db) {
|
|||
}
|
||||
|
||||
async function clearDetachedRunWarning(runId: string) {
|
||||
// Clear both detached process warnings and idle warnings when activity is reported
|
||||
const updated = await db
|
||||
.update(heartbeatRuns)
|
||||
.set({
|
||||
error: null,
|
||||
errorCode: null,
|
||||
lastOutputAt: new Date(),
|
||||
updatedAt: new Date(),
|
||||
})
|
||||
.where(and(eq(heartbeatRuns.id, runId), eq(heartbeatRuns.status, "running"), eq(heartbeatRuns.errorCode, DETACHED_PROCESS_ERROR_CODE)))
|
||||
.where(and(
|
||||
eq(heartbeatRuns.id, runId),
|
||||
eq(heartbeatRuns.status, "running"),
|
||||
inArray(heartbeatRuns.errorCode, [DETACHED_PROCESS_ERROR_CODE, IDLE_WARNING_ERROR_CODE]),
|
||||
))
|
||||
.returning()
|
||||
.then((rows) => rows[0] ?? null);
|
||||
if (!updated) return null;
|
||||
|
||||
const wasIdle = updated.errorCode === null; // errorCode was cleared
|
||||
await appendRunEvent(updated, await nextRunEventSeq(updated.id), {
|
||||
eventType: "lifecycle",
|
||||
stream: "system",
|
||||
level: "info",
|
||||
message: "Detached child process reported activity; cleared detached warning",
|
||||
message: "Activity reported; cleared run warning",
|
||||
});
|
||||
return updated;
|
||||
}
|
||||
|
|
@ -1826,7 +1837,107 @@ export function heartbeatService(db: Db) {
|
|||
if (reaped.length > 0) {
|
||||
logger.warn({ reapedCount: reaped.length, runIds: reaped }, "reaped orphaned heartbeat runs");
|
||||
}
|
||||
return { reaped: reaped.length, runIds: reaped };
|
||||
|
||||
// ── Idle-timeout pass: check all running runs for stalled output ──
|
||||
const idleWarned: string[] = [];
|
||||
const idleKilled: string[] = [];
|
||||
|
||||
const allRunningRuns = await db
|
||||
.select({
|
||||
run: heartbeatRuns,
|
||||
adapterType: agents.adapterType,
|
||||
})
|
||||
.from(heartbeatRuns)
|
||||
.innerJoin(agents, eq(heartbeatRuns.agentId, agents.id))
|
||||
.where(eq(heartbeatRuns.status, "running"));
|
||||
|
||||
for (const { run, adapterType } of allRunningRuns) {
|
||||
const tracksLocalChild = isTrackedLocalChildProcessAdapter(adapterType);
|
||||
if (!tracksLocalChild) continue;
|
||||
|
||||
// Determine the most recent activity timestamp
|
||||
const lastActivity = run.lastOutputAt
|
||||
? new Date(run.lastOutputAt).getTime()
|
||||
: run.startedAt
|
||||
? new Date(run.startedAt).getTime()
|
||||
: new Date(run.createdAt).getTime();
|
||||
const idleMs = now.getTime() - lastActivity;
|
||||
|
||||
if (idleMs >= IDLE_KILL_THRESHOLD_MS) {
|
||||
// Kill: SIGTERM then SIGKILL
|
||||
const tracked = runningProcesses.get(run.id);
|
||||
const pid = tracked?.child?.pid ?? run.processPid;
|
||||
const killMessage = `Idle timeout: no output for ${Math.round(idleMs / 60_000)} minutes — killing process (pid ${pid ?? "unknown"})`;
|
||||
logger.warn({ runId: run.id, pid, idleMs }, killMessage);
|
||||
|
||||
if (tracked?.child) {
|
||||
tracked.child.kill("SIGTERM");
|
||||
setTimeout(() => {
|
||||
try { if (!tracked.child.killed) tracked.child.kill("SIGKILL"); } catch {}
|
||||
}, IDLE_KILL_GRACE_SEC * 1000);
|
||||
} else if (pid && isProcessAlive(pid)) {
|
||||
try { process.kill(pid, "SIGTERM"); } catch {}
|
||||
setTimeout(() => {
|
||||
try { if (isProcessAlive(pid)) process.kill(pid, "SIGKILL"); } catch {}
|
||||
}, IDLE_KILL_GRACE_SEC * 1000);
|
||||
}
|
||||
|
||||
// Mark the run as failed
|
||||
let killedRun = await setRunStatus(run.id, "failed", {
|
||||
error: killMessage,
|
||||
errorCode: "idle_timeout",
|
||||
finishedAt: now,
|
||||
});
|
||||
await setWakeupStatus(run.wakeupRequestId, "failed", {
|
||||
finishedAt: now,
|
||||
error: killMessage,
|
||||
});
|
||||
if (!killedRun) killedRun = await getRun(run.id);
|
||||
if (killedRun) {
|
||||
await appendRunEvent(killedRun, await nextRunEventSeq(killedRun.id), {
|
||||
eventType: "lifecycle",
|
||||
stream: "system",
|
||||
level: "error",
|
||||
message: killMessage,
|
||||
payload: { pid, idleMs },
|
||||
});
|
||||
await releaseIssueExecutionAndPromote(killedRun);
|
||||
}
|
||||
|
||||
await finalizeAgentStatus(run.agentId, "failed");
|
||||
await startNextQueuedRunForAgent(run.agentId);
|
||||
runningProcesses.delete(run.id);
|
||||
idleKilled.push(run.id);
|
||||
} else if (idleMs >= IDLE_WARNING_THRESHOLD_MS && run.errorCode !== IDLE_WARNING_ERROR_CODE) {
|
||||
// Warning: approaching idle timeout
|
||||
const warnMessage = `Idle warning: no output for ${Math.round(idleMs / 60_000)} minutes — will be killed at ${IDLE_KILL_THRESHOLD_MS / 60_000} min`;
|
||||
logger.warn({ runId: run.id, idleMs }, warnMessage);
|
||||
|
||||
const warnedRun = await setRunStatus(run.id, "running", {
|
||||
error: warnMessage,
|
||||
errorCode: IDLE_WARNING_ERROR_CODE,
|
||||
});
|
||||
if (warnedRun) {
|
||||
await appendRunEvent(warnedRun, await nextRunEventSeq(warnedRun.id), {
|
||||
eventType: "lifecycle",
|
||||
stream: "system",
|
||||
level: "warn",
|
||||
message: warnMessage,
|
||||
payload: { idleMs },
|
||||
});
|
||||
}
|
||||
idleWarned.push(run.id);
|
||||
}
|
||||
}
|
||||
|
||||
if (idleWarned.length > 0) {
|
||||
logger.info({ count: idleWarned.length, runIds: idleWarned }, "idle-warned runs");
|
||||
}
|
||||
if (idleKilled.length > 0) {
|
||||
logger.warn({ count: idleKilled.length, runIds: idleKilled }, "idle-killed runs");
|
||||
}
|
||||
|
||||
return { reaped: reaped.length, runIds: reaped, idleWarned: idleWarned.length, idleKilled: idleKilled.length };
|
||||
}
|
||||
|
||||
async function resumeQueuedRuns() {
|
||||
|
|
@ -2397,12 +2508,30 @@ export function heartbeatService(db: Db) {
|
|||
.where(eq(heartbeatRuns.id, runId));
|
||||
|
||||
const currentUserRedactionOptions = await getCurrentUserRedactionOptions();
|
||||
let lastOutputAtFlushPending = false;
|
||||
const onLog = async (stream: "stdout" | "stderr", chunk: string) => {
|
||||
const sanitizedChunk = redactCurrentUserText(chunk, currentUserRedactionOptions);
|
||||
if (stream === "stdout") stdoutExcerpt = appendExcerpt(stdoutExcerpt, sanitizedChunk);
|
||||
if (stream === "stderr") stderrExcerpt = appendExcerpt(stderrExcerpt, sanitizedChunk);
|
||||
const ts = new Date().toISOString();
|
||||
|
||||
// Batch lastOutputAt writes — flush at most once per 30 seconds to avoid DB churn
|
||||
if (!lastOutputAtFlushPending) {
|
||||
lastOutputAtFlushPending = true;
|
||||
setTimeout(() => {
|
||||
lastOutputAtFlushPending = false;
|
||||
db.update(heartbeatRuns)
|
||||
.set({ lastOutputAt: new Date(), updatedAt: new Date() })
|
||||
.where(eq(heartbeatRuns.id, runId))
|
||||
.then(() => {})
|
||||
.catch((err) => logger.warn({ err, runId }, "failed to flush lastOutputAt"));
|
||||
}, 30_000);
|
||||
// Also flush immediately on first output
|
||||
await db.update(heartbeatRuns)
|
||||
.set({ lastOutputAt: new Date(ts), updatedAt: new Date() })
|
||||
.where(eq(heartbeatRuns.id, runId));
|
||||
}
|
||||
|
||||
if (handle) {
|
||||
await runLogStore.append(handle, {
|
||||
stream,
|
||||
|
|
|
|||
|
|
@ -23,6 +23,8 @@ export interface LiveRunForIssue {
|
|||
agentId: string;
|
||||
agentName: string;
|
||||
adapterType: string;
|
||||
errorCode?: string | null;
|
||||
lastOutputAt?: string | null;
|
||||
issueId?: string | null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -92,9 +92,11 @@ function AgentRunCard({
|
|||
return (
|
||||
<div className={cn(
|
||||
"flex h-[320px] flex-col overflow-hidden rounded-xl border shadow-sm",
|
||||
isActive
|
||||
? "border-cyan-500/25 bg-cyan-500/[0.04] shadow-[0_16px_40px_rgba(6,182,212,0.08)]"
|
||||
: "border-border bg-background/70",
|
||||
run.errorCode === "idle_warning"
|
||||
? "border-orange-500/25 bg-orange-500/[0.04] shadow-[0_16px_40px_rgba(249,115,22,0.08)]"
|
||||
: isActive
|
||||
? "border-cyan-500/25 bg-cyan-500/[0.04] shadow-[0_16px_40px_rgba(6,182,212,0.08)]"
|
||||
: "border-border bg-background/70",
|
||||
)}>
|
||||
<div className="border-b border-border/60 px-3 py-3">
|
||||
<div className="flex items-start justify-between gap-2">
|
||||
|
|
@ -112,6 +114,11 @@ function AgentRunCard({
|
|||
</div>
|
||||
<div className="mt-2 flex items-center gap-2 text-[11px] text-muted-foreground">
|
||||
<span>{isActive ? "Live now" : run.finishedAt ? `Finished ${relativeTime(run.finishedAt)}` : `Started ${relativeTime(run.createdAt)}`}</span>
|
||||
{run.errorCode === "idle_warning" && (
|
||||
<span className="rounded-full border border-orange-500/20 bg-orange-500/[0.06] px-1.5 py-0.5 text-[10px] font-medium text-orange-700 dark:text-orange-300">
|
||||
Idle
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import { useQuery, useQueryClient } from "@tanstack/react-query";
|
|||
import { heartbeatsApi, type LiveRunForIssue } from "../api/heartbeats";
|
||||
import { queryKeys } from "../lib/queryKeys";
|
||||
import { formatDateTime } from "../lib/utils";
|
||||
import { ExternalLink, Square } from "lucide-react";
|
||||
import { AlertTriangle, ExternalLink, Square } from "lucide-react";
|
||||
import { Identity } from "./Identity";
|
||||
import { StatusBadge } from "./StatusBadge";
|
||||
import { RunTranscriptView } from "./transcript/RunTranscriptView";
|
||||
|
|
@ -24,6 +24,15 @@ function isRunActive(status: string): boolean {
|
|||
return status === "queued" || status === "running";
|
||||
}
|
||||
|
||||
function formatRelativeTime(iso: string): string {
|
||||
const ms = Date.now() - new Date(iso).getTime();
|
||||
if (ms < 60_000) return "just now";
|
||||
const min = Math.floor(ms / 60_000);
|
||||
if (min < 60) return `${min}m ago`;
|
||||
const hr = Math.floor(min / 60);
|
||||
return `${hr}h ${min % 60}m ago`;
|
||||
}
|
||||
|
||||
export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
|
||||
const queryClient = useQueryClient();
|
||||
const [cancellingRunIds, setCancellingRunIds] = useState(new Set<string>());
|
||||
|
|
@ -116,7 +125,18 @@ export function LiveRunWidget({ issueId, companyId }: LiveRunWidgetProps) {
|
|||
{run.id.slice(0, 8)}
|
||||
</Link>
|
||||
<StatusBadge status={run.status} />
|
||||
<span>{formatDateTime(run.startedAt ?? run.createdAt)}</span>
|
||||
{run.errorCode === "idle_warning" && (
|
||||
<span className="inline-flex items-center gap-1 rounded-full border border-orange-500/20 bg-orange-500/[0.06] px-2 py-0.5 text-[11px] font-medium text-orange-700 dark:text-orange-300">
|
||||
<AlertTriangle className="h-3 w-3" />
|
||||
Idle
|
||||
</span>
|
||||
)}
|
||||
{run.lastOutputAt && (
|
||||
<span title={`Last output: ${formatDateTime(run.lastOutputAt)}`}>
|
||||
Last output {formatRelativeTime(run.lastOutputAt)}
|
||||
</span>
|
||||
)}
|
||||
{!run.lastOutputAt && <span>{formatDateTime(run.startedAt ?? run.createdAt)}</span>}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ export const statusBadge: Record<string, string> = {
|
|||
// Run statuses
|
||||
failed: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
|
||||
timed_out: "bg-orange-100 text-orange-700 dark:bg-orange-900/50 dark:text-orange-300",
|
||||
idle_timeout: "bg-orange-100 text-orange-700 dark:bg-orange-900/50 dark:text-orange-300",
|
||||
succeeded: "bg-green-100 text-green-700 dark:bg-green-900/50 dark:text-green-300",
|
||||
error: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
|
||||
terminated: "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue