Auto rerun transient CI failures #1495
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Analyzes failed CI PR runs for retry-safe transient failures and requests reruns for | |
| # the matched jobs through GitHub's job-rerun API, which also reruns dependent jobs. | |
| # For the supported behaviors and safety rails, see | |
| # docs/ci/auto-rerun-transient-ci-failures.md. | |
| name: Auto rerun transient CI failures | |
| # Two entry points: automatically when a run of the "CI" workflow completes, or manually | |
| # via workflow_dispatch for a specific CI run id. | |
| on: | |
| workflow_run: | |
| workflows: ["CI"] | |
| types: | |
| - completed | |
| workflow_dispatch: | |
| inputs: | |
| run_id: | |
| description: 'CI workflow run ID to inspect' | |
| required: true | |
| type: number | |
| # When true, the run is analyzed and summarized but no rerun is requested. | |
| dry_run: | |
| description: 'Inspect and summarize without requesting reruns' | |
| required: false | |
| default: false | |
| type: boolean | |
| # One concurrency group per inspected CI run: the run_id input for manual dispatches, | |
| # otherwise the id of the triggering workflow run. cancel-in-progress is false, so a | |
| # newer run in the same group does not cancel one that is already in progress. | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && inputs.run_id || github.event.workflow_run.id }} | |
| cancel-in-progress: false | |
| jobs: | |
| # Read-only analysis job: inspects the source CI run and publishes the results as job outputs. | |
| analyze-transient-failures: | |
| name: Analyze transient CI failures | |
| # Gate: the repository owner must be 'microsoft'. Manual dispatches always pass; | |
| # workflow_run events pass only when the source run was triggered by pull_request, | |
| # concluded with 'failure', and its run_attempt is 3 or lower. | |
| if: >- | |
| ${{ | |
| github.repository_owner == 'microsoft' && | |
| (github.event_name == 'workflow_dispatch' || | |
| (github.event.workflow_run.event == 'pull_request' && | |
| github.event.workflow_run.conclusion == 'failure' && | |
| github.event.workflow_run.run_attempt <= 3)) | |
| }} | |
| runs-on: ubuntu-latest | |
| # This job only receives read scopes; the write scopes live on the rerun job. | |
| permissions: | |
| actions: read | |
| checks: read | |
| contents: read | |
| # Every output is forwarded from the step with id 'analyze'. | |
| outputs: | |
| source_run_id: ${{ steps.analyze.outputs.source_run_id }} | |
| source_run_attempt: ${{ steps.analyze.outputs.source_run_attempt }} | |
| source_run_url: ${{ steps.analyze.outputs.source_run_url }} | |
| retryable_jobs: ${{ steps.analyze.outputs.retryable_jobs }} | |
| pull_request_numbers: ${{ steps.analyze.outputs.pull_request_numbers }} | |
| retryable_count: ${{ steps.analyze.outputs.retryable_count }} | |
| skipped_count: ${{ steps.analyze.outputs.skipped_count }} | |
| rerun_eligible: ${{ steps.analyze.outputs.rerun_eligible }} | |
| rerun_execution_eligible: ${{ steps.analyze.outputs.rerun_execution_eligible }} | |
| dry_run: ${{ steps.analyze.outputs.dry_run }} | |
| max_retryable_jobs: ${{ steps.analyze.outputs.max_retryable_jobs }} | |
| steps: | |
| # The analysis script loads a helper module from the workspace with a relative | |
| # require(), so the repository has to be checked out first. | |
| - name: Checkout code | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| - name: Analyze failed jobs | |
| id: analyze | |
| uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 | |
| # GITHUB_TOKEN is read by the script's direct log download; MANUAL_RUN_ID and | |
| # MANUAL_DRY_RUN carry the workflow_dispatch inputs into the script. | |
| env: | |
| GITHUB_TOKEN: ${{ github.token }} | |
| MANUAL_RUN_ID: ${{ inputs.run_id }} | |
| MANUAL_DRY_RUN: ${{ inputs.dry_run }} | |
| with: | |
| script: | | |
// Shared helper module that lives next to this workflow in the checked-out repository.
const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js');
// Repository coordinates of the workflow run that is executing this script.
const { owner, repo } = context.repo;
// True for a manual workflow_dispatch run, false for a workflow_run trigger.
const isWorkflowDispatch = context.eventName === 'workflow_dispatch';
// Cap on retryable jobs, taken from the helper module's default.
const { defaultMaxRetryableJobs: maxRetryableJobs } = rerunWorkflow;
// Only the trailing 256 KiB of each job log is inspected.
const maxJobLogInspectionBytes = 256 * 1024;
/**
 * Collects every item of a paginated GitHub REST endpoint.
 *
 * Pages are requested one after another with 100 items per page, starting at
 * page 1, until a response carries no `Link` header that advertises
 * `rel="next"`.
 *
 * @param {string} route - Route template handed to `github.request`.
 * @param {object} parameters - Route parameters; `per_page` and `page` are added here.
 * @param {(data: any) => any[]} selectItems - Extracts the item array from a response body.
 * @returns {Promise<any[]>} All items across all pages, in response order.
 */
async function paginate(route, parameters, selectItems) {
  const collected = [];
  let page = 1;
  let hasNextPage = true;

  while (hasNextPage) {
    const { data, headers } = await github.request(route, {
      ...parameters,
      per_page: 100,
      page,
    });

    collected.push(...selectItems(data));

    // Keep going only while the Link header announces a following page.
    hasNextPage = Boolean(headers.link) && headers.link.includes('rel="next"');
    page += 1;
  }

  return collected;
}
/**
 * Resolves the CI workflow run to inspect.
 *
 * For a workflow_run trigger the run is taken straight from the event
 * payload. For a manual dispatch it is fetched by the id supplied through
 * the MANUAL_RUN_ID environment variable.
 *
 * @returns {Promise<object>} The workflow run object.
 * @throws {Error} When a manual dispatch supplies a run id that is not a positive integer.
 */
async function getWorkflowRun() {
  if (isWorkflowDispatch) {
    const requestedRunId = Number(process.env.MANUAL_RUN_ID);
    const isValidRunId = Number.isInteger(requestedRunId) && requestedRunId > 0;
    if (!isValidRunId) {
      throw new Error('workflow_dispatch requires a valid run_id input.');
    }

    const { data } = await github.rest.actions.getWorkflowRun({
      owner,
      repo,
      run_id: requestedRunId,
    });
    return data;
  }

  return context.payload.workflow_run;
}
/**
 * Reports whether this invocation is a dry run.
 *
 * Only a manual dispatch can be a dry run; for any other trigger the result
 * is always false. The MANUAL_DRY_RUN environment variable is compared
 * case-insensitively against 'true'.
 *
 * @returns {boolean} True when reruns must not be requested.
 */
function parseManualDryRun() {
  return isWorkflowDispatch && String(process.env.MANUAL_DRY_RUN).toLowerCase() === 'true';
}
/**
 * Lists all jobs that belong to one attempt of a workflow run.
 *
 * @param {number} runId - Id of the workflow run.
 * @param {number} attemptNumber - Attempt of that run whose jobs are listed.
 * @returns {Promise<object[]>} Every job of the attempt, across all result pages.
 */
async function listJobsForAttempt(runId, attemptNumber) {
  const route = 'GET /repos/{owner}/{repo}/actions/runs/{run_id}/attempts/{attempt_number}/jobs';
  const routeParameters = {
    owner,
    repo,
    run_id: runId,
    attempt_number: attemptNumber,
  };
  // A response body without a `jobs` array contributes nothing.
  const pickJobs = ({ jobs }) => jobs || [];

  return paginate(route, routeParameters, pickJobs);
}
/**
 * Fetches the check-run annotations recorded for a job (best effort).
 *
 * The check run id is resolved through the helper module, which is given a
 * callback for loading a job by id. Any failure along the way, or an
 * unresolved check run id, is reported with `core.warning` and produces an
 * empty list instead of an exception.
 *
 * @param {object} job - Workflow job; its `id` is used in warning messages.
 * @returns {Promise<object[]>} The job's annotations, or an empty array.
 */
async function listAnnotations(job) {
  // Callback the helper module can use to load a job by id.
  const fetchJobById = async jobId => {
    const { data } = await github.rest.actions.getJobForWorkflowRun({
      owner,
      repo,
      job_id: jobId,
    });
    return data;
  };

  try {
    const checkRunId = await rerunWorkflow.getCheckRunIdForJob({
      job,
      getJobForWorkflowRun: fetchJobById,
    });

    if (checkRunId) {
      const annotations = await paginate(
        'GET /repos/{owner}/{repo}/check-runs/{check_run_id}/annotations',
        {
          owner,
          repo,
          check_run_id: checkRunId,
        },
        // Anything other than an array body is treated as "no annotations".
        data => (Array.isArray(data) ? data : []),
      );
      return annotations;
    }

    core.warning(`Unable to resolve a check run id for job ${job.id}.`);
    return [];
  } catch (error) {
    core.warning(`Failed to list annotations for job ${job.id}: ${error.message}`);
    return [];
  }
}
/**
 * Downloads the log of a job and returns its tail (best effort).
 *
 * The log is requested directly with `fetch`, authenticated with the
 * GITHUB_TOKEN environment variable. Only the trailing
 * `maxJobLogInspectionBytes` characters are kept; `String.prototype.slice`
 * counts UTF-16 code units, so the limit is approximate in bytes. A non-OK
 * response or any thrown error is reported with `core.warning` and yields
 * an empty string.
 *
 * @param {number} jobId - Id of the job whose log is downloaded.
 * @returns {Promise<string>} The tail of the log text, or '' on failure.
 */
async function getJobLogText(jobId) {
  try {
    const logsUrl = `https://api.github.com/repos/${owner}/${repo}/actions/jobs/${jobId}/logs`;
    const requestHeaders = {
      authorization: `Bearer ${process.env.GITHUB_TOKEN}`,
      accept: 'application/vnd.github+json',
      'x-github-api-version': '2022-11-28',
    };

    const response = await fetch(logsUrl, { headers: requestHeaders });
    if (response.ok) {
      const fullLogText = await response.text();
      return fullLogText.slice(-maxJobLogInspectionBytes);
    }

    // Routed through the catch below so every failure is reported the same way.
    throw new Error(`HTTP ${response.status}`);
  } catch (error) {
    core.warning(`Failed to fetch logs for job ${jobId}: ${error.message}`);
    return '';
  }
}
| const workflowRun = await getWorkflowRun(); | |
| const dryRun = parseManualDryRun(); | |
| const sourceRunUrl = workflowRun.html_url || `https://github.com/${owner}/${repo}/actions/runs/${workflowRun.id}`; | |
| core.setOutput('source_run_id', String(workflowRun.id)); | |
| core.setOutput('source_run_attempt', String(workflowRun.run_attempt || '')); | |
| core.setOutput('source_run_url', sourceRunUrl); | |
| core.setOutput('dry_run', String(dryRun)); | |
| core.setOutput('max_retryable_jobs', String(maxRetryableJobs)); | |
| core.setOutput('retryable_jobs', '[]'); | |
| core.setOutput('pull_request_numbers', '[]'); | |
| core.setOutput('retryable_count', '0'); | |
| core.setOutput('skipped_count', '0'); | |
| core.setOutput('rerun_eligible', 'false'); | |
| core.setOutput('rerun_execution_eligible', 'false'); | |
| if (workflowRun.name && workflowRun.name !== 'CI') { | |
| console.log(`Workflow run ${workflowRun.id} is '${workflowRun.name}', not 'CI'. Skipping.`); | |
| return; | |
| } | |
| const pullRequestNumbers = await rerunWorkflow.getAssociatedPullRequestNumbers({ | |
| github, | |
| owner, | |
| repo, | |
| workflowRun, | |
| warn: message => core.warning(message), | |
| }); | |
| core.setOutput('pull_request_numbers', JSON.stringify(pullRequestNumbers)); | |
| if (pullRequestNumbers.length === 0) { | |
| console.log('No associated pull request could be resolved for this workflow run. Skipping.'); | |
| return; | |
| } | |
| const runId = workflowRun.id; | |
| const runAttempt = workflowRun.run_attempt; | |
| const jobs = await listJobsForAttempt(runId, runAttempt); | |
| const { failedJobs, retryableJobs, skippedJobs } = await rerunWorkflow.analyzeFailedJobs({ | |
| jobs, | |
| getAnnotationsForJob: async job => listAnnotations(job), | |
| getJobLogTextForJob: async job => getJobLogText(job.id), | |
| maxRetryableJobs, | |
| }); | |
| core.setOutput('retryable_jobs', JSON.stringify(retryableJobs.map(job => ({ | |
| id: job.id, | |
| name: job.name, | |
| htmlUrl: job.htmlUrl, | |
| reason: job.reason, | |
| })))); | |
| core.setOutput('retryable_count', String(retryableJobs.length)); | |
| core.setOutput('skipped_count', String(skippedJobs.length)); | |
| const rerunEligible = rerunWorkflow.computeRerunEligibility({ | |
| retryableCount: retryableJobs.length, | |
| maxRetryableJobs, | |
| runAttempt, | |
| }); | |
| const rerunExecutionEligible = rerunWorkflow.computeRerunExecutionEligibility({ | |
| dryRun, | |
| retryableCount: retryableJobs.length, | |
| maxRetryableJobs, | |
| runAttempt, | |
| }); | |
| core.setOutput('rerun_eligible', String(rerunEligible)); | |
| core.setOutput('rerun_execution_eligible', String(rerunExecutionEligible)); | |
| await rerunWorkflow.writeAnalysisSummary({ | |
| summary: core.summary, | |
| failedJobs, | |
| retryableJobs, | |
| skippedJobs, | |
| maxRetryableJobs, | |
| dryRun, | |
| rerunEligible, | |
| sourceRunUrl, | |
| sourceRunAttempt: runAttempt, | |
| }); | |
| if (retryableJobs.length === 0) { | |
| console.log('No retryable failed jobs were detected.'); | |
| return; | |
| } | |
| # Second job: requests the reruns. It runs after the analysis job and only when that job | |
| # reported rerun_execution_eligible == 'true'. | |
| rerun-transient-failures: | |
| name: Rerun transient CI failures | |
| needs: [analyze-transient-failures] | |
| if: ${{ needs.analyze-transient-failures.outputs.rerun_execution_eligible == 'true' }} | |
| runs-on: ubuntu-latest | |
| # Write scopes are granted only on this job. NOTE(review): issues / pull-requests write | |
| # are presumably needed by rerunMatchedJobs to comment on the associated PRs - confirm. | |
| permissions: | |
| actions: write | |
| contents: read | |
| issues: write | |
| pull-requests: write | |
| steps: | |
| # The rerun script loads the same helper module from the workspace via require(). | |
| - name: Checkout code | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| - name: Rerun matched jobs | |
| uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 | |
| # The analysis job's outputs reach the script as environment variables. | |
| env: | |
| RETRYABLE_JOBS: ${{ needs.analyze-transient-failures.outputs.retryable_jobs }} | |
| PULL_REQUEST_NUMBERS: ${{ needs.analyze-transient-failures.outputs.pull_request_numbers }} | |
| SOURCE_RUN_ID: ${{ needs.analyze-transient-failures.outputs.source_run_id }} | |
| SOURCE_RUN_ATTEMPT: ${{ needs.analyze-transient-failures.outputs.source_run_attempt }} | |
| SOURCE_RUN_URL: ${{ needs.analyze-transient-failures.outputs.source_run_url }} | |
| with: | |
| script: | | |
// Shared helper module that lives next to this workflow in the checked-out repository.
const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js');
const { owner, repo } = context.repo;

// Inputs arrive as environment variables; the two lists are JSON-encoded and
// fall back to an empty list when the variable is missing or empty.
const {
  RETRYABLE_JOBS: retryableJobsJson,
  PULL_REQUEST_NUMBERS: pullRequestNumbersJson,
  SOURCE_RUN_ID: sourceRunIdText,
  SOURCE_RUN_ATTEMPT: sourceRunAttemptText,
  SOURCE_RUN_URL: sourceRunUrl,
} = process.env;
const retryableJobs = JSON.parse(retryableJobsJson || '[]');
const pullRequestNumbers = JSON.parse(pullRequestNumbersJson || '[]');
const sourceRunId = Number(sourceRunIdText);
const sourceRunAttempt = Number(sourceRunAttemptText);

if (retryableJobs.length !== 0) {
  // Hand the matched jobs to the helper module, which performs the reruns.
  await rerunWorkflow.rerunMatchedJobs({
    github,
    owner,
    repo,
    retryableJobs,
    pullRequestNumbers,
    summary: core.summary,
    sourceRunId,
    sourceRunAttempt,
    sourceRunUrl,
  });
} else {
  console.log('No retryable jobs were provided to the rerun job.');
}