/**
 * scripts/paperclip-commit-metrics.ts
 *
 * Counts public (optionally also private) GitHub commits that match a
 * commit-search query, the repositories and contributors behind them, and the
 * added/deleted line totals of those commits. Search windows, commit metadata
 * and per-commit stats are cached in a JSON file so that reruns can resume.
 *
 * Wiring (from the accompanying package.json change):
 *   "metrics:paperclip-commits": "tsx scripts/paperclip-commit-metrics.ts"
 *
 * Run it with `npm run metrics:paperclip-commits -- [options]` or
 * `npx tsx scripts/paperclip-commit-metrics.ts [options]`. There is no
 * shebang line on purpose: `#!/usr/bin/env npx tsx` hands "npx tsx" to `env`
 * as a single argument on Linux and therefore cannot be executed there.
 */

import { execFile } from "node:child_process";
import { promises as fs } from "node:fs";
import path from "node:path";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// NOTE(review): the trailing space inside the quoted phrase suggests that an
// address such as `<noreply@paperclip.ing>` was lost from this literal when the
// patch was extracted — confirm against the real file before relying on it.
const DEFAULT_QUERY = "\"Co-Authored-By: Paperclip \"";
const DEFAULT_CACHE_FILE = path.resolve("data/paperclip-commit-metrics-cache.json");
const DEFAULT_SEARCH_START = "2008-01-01T00:00:00Z";
/** Windows reporting more hits than this are split; kept below GitHub's 1,000-results-per-search cap. */
const SEARCH_WINDOW_LIMIT = 900;
/** Windows this short (in ms) are never split any further. */
const MIN_WINDOW_MS = 60_000;
const SEARCH_PAGE_SIZE = 100;
const DEFAULT_STATS_FETCH_LIMIT = 250;
const DEFAULT_STATS_CONCURRENCY = 4;
const DEFAULT_SEARCH_FIELD = "committer-date";
const PAPERCLIP_EMAIL = "noreply@paperclip.ing";
const PAPERCLIP_NAME = "paperclip";
const CACHE_VERSION = 1;
/** Matches one `Co-authored-by: Name <email>` trailer per line, case-insensitively. */
const CO_AUTHOR_PATTERN = /^co-authored-by:\s*(.+?)\s*<([^>]+)>\s*$/gim;
/** HTTP statuses treated as "this commit's stats cannot be retrieved" rather than as a fatal error. */
const UNRETRIEVABLE_COMMIT_STATUSES: ReadonlySet<number> = new Set([404, 409, 422, 451]);
const TOKEN_HELP = "Unable to resolve a GitHub token. Set GITHUB_TOKEN/GH_TOKEN or run `gh auth login`.";

interface CliOptions {
  cacheFile: string;
  end: Date;
  includePrivate: boolean;
  json: boolean;
  query: string;
  refreshSearch: boolean;
  refreshStats: boolean;
  searchField: "author-date" | "committer-date";
  start: Date;
  statsConcurrency: number;
  statsFetchLimit: number;
  skipStats: boolean;
}

/** The subset of a commit-search result item that this script reads. */
interface SearchCommitItem {
  author: {
    login?: string;
  } | null;
  commit: {
    author: {
      date: string;
      email: string | null;
      name: string | null;
    } | null;
    committer?: {
      date: string;
    } | null;
    message: string;
  };
  html_url: string;
  repository: {
    full_name: string;
    html_url: string;
  };
  sha: string;
}

interface CommitStats {
  additions: number;
  deletions: number;
  total: number;
}

interface CachedCommit {
  authorEmail: string | null;
  authorLogin: string | null;
  authorName: string | null;
  committedAt: string | null;
  contributors: ContributorRecord[];
  htmlUrl: string;
  repositoryFullName: string;
  repositoryUrl: string;
  sha: string;
}

interface CachedCommitStats extends CommitStats {
  fetchedAt: string;
}

interface ContributorRecord {
  displayName: string;
  email: string | null;
  /** Deduplication key: `login:…`, `email:…` or `name:…`, always lower-cased. */
  key: string;
  login: string | null;
}

interface WindowCacheEntry {
  completedAt: string;
  key: string;
  shas: string[];
  totalCount: number;
}

interface CacheFile {
  commits: Record<string, CachedCommit>;
  queryKey: string;
  searchField: CliOptions["searchField"];
  stats: Record<string, CachedCommitStats>;
  updatedAt: string | null;
  version: number;
  windows: Record<string, WindowCacheEntry>;
}

interface SearchResponse {
  incomplete_results: boolean;
  items: SearchCommitItem[];
  total_count: number;
}

interface SearchWindowResult {
  shas: Set<string>;
  totalCount: number;
}

interface Summary {
  cacheFile: string;
  contributors: {
    count: number;
    sample: ContributorRecord[];
  };
  detectedQuery: string;
  lineStats: {
    additions: number;
    complete: boolean;
    coveredCommits: number;
    deletions: number;
    missingCommits: number;
    totalChanges: number;
  };
  range: {
    end: string;
    searchField: CliOptions["searchField"];
    start: string;
  };
  repos: {
    count: number;
    sample: string[];
  };
  statsFetch: {
    fetchedThisRun: number;
    skipped: boolean;
  };
  totals: {
    commits: number;
  };
}

/**
 * Entry point: parse flags, search, optionally fetch line stats, persist the
 * cache and print the summary (JSON with `--json`, text otherwise).
 */
async function main(): Promise<void> {
  const options = parseArgs(process.argv.slice(2));
  const cache = await loadCache(options.cacheFile, options);
  const client = new GitHubClient(await resolveGitHubToken());

  let sortedShas: string[] = [];
  let fetchedThisRun = 0;
  try {
    const { shas } = await searchWindow(client, cache, options, options.start, options.end);
    sortedShas = [...shas].sort();
    if (!options.skipStats) {
      fetchedThisRun = await enrichCommitStats(client, cache, options, sortedShas);
    }
  } finally {
    // Save even when a request failed: completed search windows and stats that
    // were already fetched are expensive to obtain again.
    cache.updatedAt = new Date().toISOString();
    await saveCache(options.cacheFile, cache);
  }

  const summary = buildSummary(cache, options, sortedShas, fetchedThisRun);
  if (options.json) {
    console.log(JSON.stringify(summary, null, 2));
    return;
  }

  printSummary(summary);
}

/**
 * Turns command-line arguments into {@link CliOptions}.
 *
 * `--help` prints the usage text and exits the process with status 0.
 *
 * @param argv Arguments after the script name.
 * @throws Error on an unknown flag, a missing or malformed value, or when
 *   `--start` is not earlier than `--end`.
 */
function parseArgs(argv: string[]): CliOptions {
  const options: CliOptions = {
    cacheFile: DEFAULT_CACHE_FILE,
    end: new Date(),
    includePrivate: false,
    json: false,
    query: DEFAULT_QUERY,
    refreshSearch: false,
    refreshStats: false,
    searchField: DEFAULT_SEARCH_FIELD,
    start: new Date(DEFAULT_SEARCH_START),
    statsConcurrency: DEFAULT_STATS_CONCURRENCY,
    statsFetchLimit: DEFAULT_STATS_FETCH_LIMIT,
    skipStats: false,
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    if (arg === undefined) {
      break;
    }
    switch (arg) {
      case "--cache-file":
        options.cacheFile = requireValue(argv, ++index, arg);
        break;
      case "--end":
        options.end = parseDateArg(requireValue(argv, ++index, arg), arg);
        break;
      case "--include-private":
        options.includePrivate = true;
        break;
      case "--json":
        options.json = true;
        break;
      case "--query":
        options.query = requireValue(argv, ++index, arg);
        break;
      case "--refresh-search":
        options.refreshSearch = true;
        break;
      case "--refresh-stats":
        options.refreshStats = true;
        break;
      case "--search-field": {
        const value = requireValue(argv, ++index, arg);
        if (value !== "author-date" && value !== "committer-date") {
          throw new Error(`Invalid --search-field value: ${value}`);
        }
        options.searchField = value;
        break;
      }
      case "--skip-stats":
        options.skipStats = true;
        break;
      case "--start":
        options.start = parseDateArg(requireValue(argv, ++index, arg), arg);
        break;
      case "--stats-concurrency":
        options.statsConcurrency = parsePositiveInt(requireValue(argv, ++index, arg), arg);
        break;
      case "--stats-fetch-limit":
        options.statsFetchLimit = parseNonNegativeInt(requireValue(argv, ++index, arg), arg);
        break;
      case "--help":
        printHelp();
        process.exit(0);
        break;
      default:
        throw new Error(`Unknown argument: ${arg}`);
    }
  }

  if (options.start >= options.end) {
    throw new Error("--start must be earlier than --end");
  }

  return options;
}

/**
 * Returns the argument at `index`, which is expected to be the value of `flag`.
 *
 * @throws Error when the argument is absent or empty.
 */
function requireValue(argv: string[], index: number, flag: string): string {
  const value = argv[index];
  if (!value) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/**
 * Parses a date with the `Date` constructor.
 *
 * @throws Error when the value does not produce a valid date.
 */
function parseDateArg(value: string, flag: string): Date {
  const parsed = new Date(value);
  if (Number.isNaN(parsed.getTime())) {
    throw new Error(`Invalid date for ${flag}: ${value}`);
  }
  return parsed;
}

/**
 * Parses a string made of decimal digits only. `Number.parseInt` alone would
 * accept inputs such as "4abc" or "1.5" by silently ignoring the tail.
 *
 * @returns The integer, or `null` when the input is not a plain safe integer.
 */
function parseDecimalInt(value: string): number | null {
  const digits = value.trim();
  if (!/^\d+$/.test(digits)) {
    return null;
  }
  const parsed = Number.parseInt(digits, 10);
  return Number.isSafeInteger(parsed) ? parsed : null;
}

/** Parses an integer greater than zero; throws an Error naming `flag` otherwise. */
function parsePositiveInt(value: string, flag: string): number {
  const parsed = parseDecimalInt(value);
  if (parsed === null || parsed <= 0) {
    throw new Error(`Invalid positive integer for ${flag}: ${value}`);
  }
  return parsed;
}

/** Parses an integer greater than or equal to zero; throws an Error naming `flag` otherwise. */
function parseNonNegativeInt(value: string, flag: string): number {
  const parsed = parseDecimalInt(value);
  if (parsed === null) {
    throw new Error(`Invalid non-negative integer for ${flag}: ${value}`);
  }
  return parsed;
}

/** Prints the usage text to stdout. */
function printHelp(): void {
  console.log(`Usage: tsx scripts/paperclip-commit-metrics.ts [options]

Options:
  --start <date>              ISO date/time lower bound (default: ${DEFAULT_SEARCH_START})
  --end <date>                ISO date/time upper bound (default: now)
  --query <text>              Commit search string (default: ${DEFAULT_QUERY})
  --search-field <field>      author-date | committer-date (default: ${DEFAULT_SEARCH_FIELD})
  --include-private           Include repos visible to the current token
  --cache-file <path>         Cache path (default: ${DEFAULT_CACHE_FILE})
  --skip-stats                Skip additions/deletions enrichment
  --stats-fetch-limit <n>     Max uncached commit stats to fetch this run (default: ${DEFAULT_STATS_FETCH_LIMIT})
  --stats-concurrency <n>     Parallel commit stat requests (default: ${DEFAULT_STATS_CONCURRENCY})
  --refresh-search            Ignore cached search windows
  --refresh-stats             Re-fetch cached commit stats
  --json                      Print JSON summary
  --help                      Show this help
`);
}

/**
 * Resolves the API token: `GITHUB_TOKEN`, then `GH_TOKEN`, then the output of
 * `gh auth token`. Empty environment values are treated as unset.
 *
 * @throws Error with setup instructions when no token can be obtained.
 */
async function resolveGitHubToken(): Promise<string> {
  // `||` rather than `??`: an empty GITHUB_TOKEN must not mask GH_TOKEN.
  const envToken = process.env.GITHUB_TOKEN?.trim() || process.env.GH_TOKEN?.trim();
  if (envToken) {
    return envToken;
  }

  let stdout: string;
  try {
    ({ stdout } = await execFileAsync("gh", ["auth", "token"]));
  } catch (error) {
    // Covers both a missing `gh` binary and a `gh` that is not logged in.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`${TOKEN_HELP} (\`gh auth token\` failed: ${reason})`);
  }

  const token = stdout.trim();
  if (!token) {
    throw new Error(TOKEN_HELP);
  }
  return token;
}

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Shallow structural check of a parsed cache file: verifies the top-level
 * fields this script reads, not the shape of every nested entry.
 */
function isCacheFile(value: unknown): value is CacheFile {
  return (
    isRecord(value) &&
    typeof value.version === "number" &&
    typeof value.queryKey === "string" &&
    typeof value.searchField === "string" &&
    isRecord(value.commits) &&
    isRecord(value.stats) &&
    isRecord(value.windows)
  );
}

/**
 * Reads the cache file. A missing file, or a cache written for another
 * version, query, visibility or search field, yields a fresh empty cache.
 *
 * @throws Error when the file exists but is not valid JSON; other read errors
 *   are rethrown unchanged.
 */
async function loadCache(cacheFile: string, options: CliOptions): Promise<CacheFile> {
  let raw: string;
  try {
    raw = await fs.readFile(cacheFile, "utf8");
  } catch (error) {
    if (error instanceof Error && (error as NodeJS.ErrnoException).code === "ENOENT") {
      return createEmptyCache(options);
    }
    throw error;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Cache file ${cacheFile} is not valid JSON (${reason}). Delete it or pass another --cache-file.`);
  }

  const reusable =
    isCacheFile(parsed) &&
    parsed.version === CACHE_VERSION &&
    parsed.queryKey === buildQueryKey(options) &&
    parsed.searchField === options.searchField;
  return reusable ? parsed : createEmptyCache(options);
}

/** Creates an empty cache bound to the current query settings. */
function createEmptyCache(options: CliOptions): CacheFile {
  return {
    commits: {},
    queryKey: buildQueryKey(options),
    searchField: options.searchField,
    stats: {},
    updatedAt: null,
    version: CACHE_VERSION,
    windows: {},
  };
}

/** Serializes the settings that decide whether a cache can be reused. */
function buildQueryKey(options: CliOptions): string {
  const visibility = options.includePrivate ? "all" : "public";
  return JSON.stringify({
    query: options.query,
    searchField: options.searchField,
    visibility,
  });
}

/** Writes the cache as indented JSON, creating the parent directory if needed. */
async function saveCache(cacheFile: string, cache: CacheFile): Promise<void> {
  await fs.mkdir(path.dirname(cacheFile), { recursive: true });
  await fs.writeFile(cacheFile, JSON.stringify(cache, null, 2), "utf8");
}

/** @throws Error when GitHub flagged the response as incomplete. */
function assertCompleteResults(response: SearchResponse, windowKey: string): void {
  if (response.incomplete_results) {
    throw new Error(`GitHub returned incomplete search results for window ${windowKey}`);
  }
}

/**
 * Collects the SHAs of all commits matching the query inside `[start, end]`.
 *
 * A window that reports more than SEARCH_WINDOW_LIMIT hits is halved and both
 * halves are searched recursively; otherwise every result page is fetched.
 * Finished windows are stored in `cache.windows` and reused on later calls
 * unless `options.refreshSearch` is set. Commit metadata is written to
 * `cache.commits` as a side effect.
 *
 * @throws Error when GitHub reports incomplete results, or when a window of
 *   at most MIN_WINDOW_MS still exceeds the limit.
 */
async function searchWindow(
  client: GitHubClient,
  cache: CacheFile,
  options: CliOptions,
  start: Date,
  end: Date,
): Promise<SearchWindowResult> {
  const windowKey = makeWindowKey(start, end);
  if (!options.refreshSearch) {
    const cached = cache.windows[windowKey];
    if (cached) {
      return { shas: new Set(cached.shas), totalCount: cached.totalCount };
    }
  }

  const firstPage = await searchPage(client, options, start, end, 1, SEARCH_PAGE_SIZE);
  assertCompleteResults(firstPage, windowKey);

  if (firstPage.total_count > SEARCH_WINDOW_LIMIT) {
    const durationMs = end.getTime() - start.getTime();
    if (durationMs <= MIN_WINDOW_MS) {
      throw new Error(
        `Search window ${windowKey} still has ${firstPage.total_count} results after splitting to ${durationMs}ms.`,
      );
    }

    const midpoint = new Date(start.getTime() + Math.floor(durationMs / 2));
    const left = await searchWindow(client, cache, options, start, midpoint);
    // Queries are sent with second precision, so the halves may share one
    // second; the Set below removes commits reported by both.
    const right = await searchWindow(client, cache, options, new Date(midpoint.getTime() + 1), end);
    const merged = new Set<string>([...left.shas, ...right.shas]);

    cache.windows[windowKey] = {
      completedAt: new Date().toISOString(),
      key: windowKey,
      shas: [...merged],
      totalCount: merged.size,
    };

    return { shas: merged, totalCount: merged.size };
  }

  const pageCount = Math.ceil(firstPage.total_count / SEARCH_PAGE_SIZE);
  const shas = new Set<string>();
  ingestSearchItems(cache, firstPage.items, shas);

  for (let page = 2; page <= pageCount; page += 1) {
    const response = await searchPage(client, options, start, end, page, SEARCH_PAGE_SIZE);
    // Later pages can be incomplete as well; accepting them would silently undercount.
    assertCompleteResults(response, windowKey);
    ingestSearchItems(cache, response.items, shas);
  }

  cache.windows[windowKey] = {
    completedAt: new Date().toISOString(),
    key: windowKey,
    shas: [...shas],
    totalCount: firstPage.total_count,
  };

  return { shas, totalCount: firstPage.total_count };
}

/** Requests one page of `/search/commits` for the given window. */
async function searchPage(
  client: GitHubClient,
  options: CliOptions,
  start: Date,
  end: Date,
  page: number,
  perPage: number,
): Promise<SearchResponse> {
  const params = new URLSearchParams({
    page: String(page),
    per_page: String(perPage),
    q: buildSearchQuery(options, start, end),
  });

  return client.getJson<SearchResponse>(`/search/commits?${params.toString()}`);
}

/** Builds the `q` string: user query, date-range qualifier and, unless private repos are included, `is:public`. */
function buildSearchQuery(options: CliOptions, start: Date, end: Date): string {
  const qualifiers = [`${options.searchField}:${formatQueryDate(start)}..${formatQueryDate(end)}`];
  if (!options.includePrivate) {
    qualifiers.push("is:public");
  }
  return `${options.query} ${qualifiers.join(" ")}`.trim();
}

/**
 * Formats a date for a search qualifier as `YYYY-MM-DDTHH:MM:SSZ`.
 *
 * Milliseconds are always removed: GitHub's search syntax documents
 * second-precision timestamps only, while the default `--end` (now) and the
 * midpoints produced by window splitting usually carry a fractional part.
 */
function formatQueryDate(value: Date): string {
  return value.toISOString().replace(/\.\d{3}Z$/, "Z");
}

/** Adds each item's SHA to `shas` and stores its metadata in `cache.commits`. */
function ingestSearchItems(cache: CacheFile, items: SearchCommitItem[], shas: Set<string>): void {
  for (const item of items) {
    shas.add(item.sha);
    cache.commits[item.sha] = {
      authorEmail: item.commit.author?.email ?? null,
      authorLogin: item.author?.login ?? null,
      authorName: item.commit.author?.name ?? null,
      // Prefer the committer date for a field named `committedAt`; fall back
      // to the author date when the item carries no committer.
      committedAt: item.commit.committer?.date ?? item.commit.author?.date ?? null,
      contributors: extractContributors(item),
      htmlUrl: item.html_url,
      repositoryFullName: item.repository.full_name,
      repositoryUrl: item.repository.html_url,
      sha: item.sha,
    };
  }
}

/**
 * Lists the distinct human contributors of a commit: its author plus every
 * `Co-authored-by:` trailer in the message, excluding Paperclip itself.
 */
function extractContributors(item: SearchCommitItem): ContributorRecord[] {
  const contributors = new Map<string, ContributorRecord>();

  const primaryAuthor = normalizeContributor({
    email: item.commit.author?.email ?? null,
    login: item.author?.login ?? null,
    name: item.commit.author?.name ?? null,
  });
  if (primaryAuthor) {
    contributors.set(primaryAuthor.key, primaryAuthor);
  }

  for (const match of item.commit.message.matchAll(CO_AUTHOR_PATTERN)) {
    const contributor = normalizeContributor({
      email: match[2] ?? null,
      login: null,
      name: match[1] ?? null,
    });
    if (contributor) {
      contributors.set(contributor.key, contributor);
    }
  }

  return [...contributors.values()];
}

/**
 * Builds a contributor record with a stable deduplication key.
 *
 * The key uses the login when present, otherwise the e-mail, otherwise the
 * name, and is lower-cased so that the same identity written with different
 * casing is counted once.
 *
 * @returns `null` when no identifying field is present or when the
 *   contributor is Paperclip (matched case-insensitively by e-mail or name).
 */
function normalizeContributor(input: {
  email: string | null;
  login: string | null;
  name: string | null;
}): ContributorRecord | null {
  const email = normalizeOptional(input.email);
  const login = normalizeOptional(input.login);
  const displayName = normalizeOptional(input.name) ?? login ?? email;

  // `displayName` falls back to login and e-mail, so null means all three are absent.
  if (displayName === null) {
    return null;
  }
  if (email?.toLowerCase() === PAPERCLIP_EMAIL || displayName.toLowerCase() === PAPERCLIP_NAME) {
    return null;
  }

  let key: string;
  if (login !== null) {
    key = `login:${login.toLowerCase()}`;
  } else if (email !== null) {
    key = `email:${email.toLowerCase()}`;
  } else {
    key = `name:${displayName.toLowerCase()}`;
  }

  return { displayName, email, key, login };
}

/** Trims a string; empty, `null` and `undefined` all become `null`. */
function normalizeOptional(value: string | null | undefined): string | null {
  const trimmed = value?.trim();
  return trimmed ? trimmed : null;
}

/**
 * Fetches line stats for commits that have none cached (or for all of them
 * with `options.refreshStats`), at most `options.statsFetchLimit` per run and
 * `options.statsConcurrency` requests at a time.
 *
 * A commit whose request fails with one of UNRETRIEVABLE_COMMIT_STATUSES is
 * reported on stderr and skipped, so one unreachable commit cannot block
 * every later run. It stays uncached and is counted as missing in the summary.
 *
 * @returns The number of commits whose stats were stored during this call.
 * @throws Any other request error.
 */
async function enrichCommitStats(
  client: GitHubClient,
  cache: CacheFile,
  options: CliOptions,
  shas: string[],
): Promise<number> {
  const pending = shas.filter((sha) => options.refreshStats || !cache.stats[sha]).slice(0, options.statsFetchLimit);
  let nextIndex = 0;
  let fetched = 0;

  // Workers pull from a shared cursor; reading and advancing it happens
  // without an intervening await, so no two workers get the same index.
  const worker = async (): Promise<void> => {
    for (;;) {
      const sha = pending[nextIndex];
      nextIndex += 1;
      if (sha === undefined) {
        return;
      }
      const commit = cache.commits[sha];
      if (!commit) {
        continue;
      }
      try {
        const stats = await fetchCommitStats(client, commit.repositoryFullName, sha);
        cache.stats[sha] = {
          ...stats,
          fetchedAt: new Date().toISOString(),
        };
        fetched += 1;
      } catch (error) {
        if (error instanceof GitHubApiError && UNRETRIEVABLE_COMMIT_STATUSES.has(error.status)) {
          console.error(`Skipping stats for ${commit.repositoryFullName}@${sha}: HTTP ${error.status}`);
          continue;
        }
        throw error;
      }
    }
  };

  const workerCount = Math.min(options.statsConcurrency, pending.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return fetched;
}

/** Loads one commit and returns its stats; fields missing from the response count as 0. */
async function fetchCommitStats(client: GitHubClient, repositoryFullName: string, sha: string): Promise<CommitStats> {
  const response = await client.getJson<{ stats?: CommitStats }>(
    `/repos/${repositoryFullName}/commits/${sha}`,
  );
  return {
    additions: response.stats?.additions ?? 0,
    deletions: response.stats?.deletions ?? 0,
    total: response.stats?.total ?? 0,
  };
}

/**
 * Aggregates the cached data for `shas` into the report. Samples are the
 * first ten repositories and contributors in locale order.
 */
function buildSummary(cache: CacheFile, options: CliOptions, shas: string[], fetchedThisRun: number): Summary {
  const repoNames = new Set<string>();
  const contributors = new Map<string, ContributorRecord>();
  let additions = 0;
  let deletions = 0;
  let coveredCommits = 0;

  for (const sha of shas) {
    const commit = cache.commits[sha];
    if (!commit) {
      continue;
    }
    repoNames.add(commit.repositoryFullName);
    for (const contributor of commit.contributors) {
      contributors.set(contributor.key, contributor);
    }

    const stats = cache.stats[sha];
    if (stats) {
      additions += stats.additions;
      deletions += stats.deletions;
      coveredCommits += 1;
    }
  }

  const contributorSample = [...contributors.values()]
    .sort((left, right) => left.displayName.localeCompare(right.displayName))
    .slice(0, 10);
  const repoSample = [...repoNames].sort((left, right) => left.localeCompare(right)).slice(0, 10);

  return {
    cacheFile: options.cacheFile,
    contributors: {
      count: contributors.size,
      sample: contributorSample,
    },
    detectedQuery: buildSearchQuery(options, options.start, options.end),
    lineStats: {
      additions,
      complete: coveredCommits === shas.length,
      coveredCommits,
      deletions,
      missingCommits: shas.length - coveredCommits,
      totalChanges: additions + deletions,
    },
    range: {
      end: options.end.toISOString(),
      searchField: options.searchField,
      start: options.start.toISOString(),
    },
    repos: {
      count: repoNames.size,
      sample: repoSample,
    },
    statsFetch: {
      fetchedThisRun,
      skipped: options.skipStats,
    },
    totals: {
      commits: shas.length,
    },
  };
}

/** Prints the human-readable report to stdout. */
function printSummary(summary: Summary): void {
  console.log("Paperclip commit metrics");
  console.log(`Query: ${summary.detectedQuery}`);
  console.log(`Range: ${summary.range.start} -> ${summary.range.end} (${summary.range.searchField})`);
  console.log(`Commits: ${summary.totals.commits}`);
  console.log(`Distinct repos: ${summary.repos.count}`);
  console.log(`Distinct contributors: ${summary.contributors.count}`);
  console.log(
    `Line stats: +${summary.lineStats.additions} / -${summary.lineStats.deletions} / ${summary.lineStats.totalChanges} total`,
  );
  console.log(
    `Line stat coverage: ${summary.lineStats.coveredCommits}/${summary.totals.commits}` +
      (summary.lineStats.complete ? " (complete)" : " (partial; rerun to hydrate more commits)"),
  );
  console.log(`Stats fetched this run: ${summary.statsFetch.fetchedThisRun}${summary.statsFetch.skipped ? " (skipped)" : ""}`);
  console.log(`Cache: ${summary.cacheFile}`);

  if (summary.repos.sample.length > 0) {
    console.log(`Sample repos: ${summary.repos.sample.join(", ")}`);
  }
  if (summary.contributors.sample.length > 0) {
    console.log(
      `Sample contributors: ${summary.contributors.sample
        .map((contributor) => contributor.login ?? contributor.displayName)
        .join(", ")}`,
    );
  }
}

/** Cache key of a search window: `<start ISO>..<end ISO>`. */
function makeWindowKey(start: Date, end: Date): string {
  return `${start.toISOString()}..${end.toISOString()}`;
}

/**
 * Decides how long to wait before retrying a rate-limited response.
 *
 * Only 403 and 429 are considered. A `retry-after` header (seconds) takes
 * precedence; otherwise `x-ratelimit-remaining: 0` together with a numeric
 * `x-ratelimit-reset` (epoch seconds) yields the time until the reset plus a
 * one-second margin. The result is never below one second.
 *
 * @param nowMs Current time in epoch milliseconds (injectable for tests).
 * @returns The delay in milliseconds, or `null` when the response is not a
 *   rate-limit response that can be waited out.
 */
function rateLimitDelayMs(
  status: number,
  headers: { get(name: string): string | null },
  nowMs: number = Date.now(),
): number | null {
  if (status !== 403 && status !== 429) {
    return null;
  }

  const retryAfterSeconds = Number.parseInt(headers.get("retry-after") ?? "", 10);
  if (Number.isFinite(retryAfterSeconds) && retryAfterSeconds >= 0) {
    return Math.max(retryAfterSeconds * 1000, 1_000);
  }

  const resetEpochSeconds = Number.parseInt(headers.get("x-ratelimit-reset") ?? "", 10);
  if (headers.get("x-ratelimit-remaining") === "0" && Number.isFinite(resetEpochSeconds)) {
    return Math.max(resetEpochSeconds * 1000 - nowMs + 1_000, 1_000);
  }

  return null;
}

/** A non-successful GitHub API response that was not retried. */
class GitHubApiError extends Error {
  readonly status: number;
  readonly pathname: string;

  constructor(status: number, pathname: string, body: string) {
    super(`GitHub API request failed (${status}) for ${pathname}: ${body}`);
    this.name = "GitHubApiError";
    this.status = status;
    this.pathname = pathname;
  }
}

/** Minimal authenticated client for GET requests against api.github.com. */
class GitHubClient {
  private readonly apiBase = "https://api.github.com";
  private readonly token: string;

  constructor(token: string) {
    this.token = token;
  }

  /**
   * GETs `pathname` and returns the JSON body, cast to `T` without validation.
   * Rate-limited responses are waited out and retried.
   *
   * @throws GitHubApiError for any other non-2xx response.
   */
  async getJson<T>(pathname: string): Promise<T> {
    for (;;) {
      const response = await fetch(`${this.apiBase}${pathname}`, {
        headers: {
          Accept: "application/vnd.github+json",
          Authorization: `Bearer ${this.token}`,
          "User-Agent": "paperclip-commit-metrics",
          "X-GitHub-Api-Version": "2022-11-28",
        },
      });

      if (response.ok) {
        return (await response.json()) as T;
      }

      const waitMs = rateLimitDelayMs(response.status, response.headers);
      if (waitMs !== null) {
        console.error(`GitHub rate limit hit for ${pathname}; waiting ${Math.ceil(waitMs / 1000)}s...`);
        await sleep(waitMs);
        continue;
      }

      throw new GitHubApiError(response.status, pathname, await response.text());
    }
  }
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

main().catch((error: unknown) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exit(1);
});