diff --git a/.github/workflows/auto-rerun-transient-ci-failures.js b/.github/workflows/auto-rerun-transient-ci-failures.js index 41a996ee9ac..83b4bb788a9 100644 --- a/.github/workflows/auto-rerun-transient-ci-failures.js +++ b/.github/workflows/auto-rerun-transient-ci-failures.js @@ -28,6 +28,11 @@ const ignoredFailureStepPatterns = [ /^Install dependencies$/i, ]; +const testExecutionFailureStepPatterns = [ + /^Run tests\b/i, + /^Run nuget dependent tests\b/i, +]; + const transientAnnotationPatterns = [ /The job was not acquired by Runner of type hosted even after multiple attempts/i, /The hosted runner lost communication with the server/i, @@ -72,25 +77,29 @@ const windowsProcessInitializationFailurePatterns = [ /\b0xC0000142\b/i, ]; -const feedNetworkFailureStepPatterns = [ - /^Install sdk for nuget based testing$/i, - /^Build test project$/i, - /^Build and archive test project$/i, - /^Build with packages$/i, - /^Build RID-specific packages\b/i, - /^Build .*validation image$/i, - /^Run .*SDK validation$/i, - /^Rebuild for Azure Functions project$/i, -]; - -const ignoredBuildFailureLogOverridePatterns = [ +const infrastructureNetworkFailureLogOverridePatterns = [ /Unable to load the service index for source https:\/\/(?:pkgs\.dev\.azure\.com\/dnceng|dnceng\.pkgs\.visualstudio\.com)\/public\/_packaging\//i, + /(timed out|failed to connect|could not resolve|ENOTFOUND|ECONNRESET|EPROTO|Bad Gateway|SSL connection could not be established).{0,160}https:\/\/(?:pkgs\.dev\.azure\.com\/dnceng|dnceng\.pkgs\.visualstudio\.com)\/public\/_packaging\//i, + /https:\/\/(?:pkgs\.dev\.azure\.com\/dnceng|dnceng\.pkgs\.visualstudio\.com)\/public\/_packaging\/.{0,160}(timed out|failed to connect|could not resolve|ENOTFOUND|ECONNRESET|EPROTO|Bad Gateway|SSL connection could not be established)/i, + /(timed out|failed to connect|could not resolve|ENOTFOUND|ECONNRESET|EPROTO|Bad Gateway|SSL connection could not be established).{0,160}builds\.dotnet\.microsoft\.com/i, + /builds\.dotnet\.microsoft\.com.{0,160}(timed out|failed to connect|could not resolve|ENOTFOUND|ECONNRESET|EPROTO|Bad Gateway|SSL connection could not be established)/i, + /(timed out|failed to connect|failed to respond|could not resolve|ENOTFOUND|ECONNRESET|EPROTO|Bad Gateway|SSL connection could not be established).{0,160}api\.github\.com/i, + /api\.github\.com.{0,160}(timed out|failed to connect|failed to respond|could not resolve|ENOTFOUND|ECONNRESET|EPROTO|Bad Gateway|SSL connection could not be established)/i, + /fatal: unable to access 'https:\/\/github\.com\/.*': The requested URL returned error:\s*(502|503|504)/i, + /Failed to connect to github\.com port/i, + /expected 'packfile'/i, + /\bRPC failed\b/i, + /\bRecv failure\b/i, ]; function matchesAny(value, patterns) { return patterns.some(pattern => pattern.test(value)); } +function findMatchingPattern(value, patterns) { + return patterns.find(pattern => pattern.test(value)) ?? null; +} + function parseCheckRunId(checkRunUrl) { if (typeof checkRunUrl !== 'string') { return null; @@ -154,16 +163,81 @@ function getFailureStepSignals(failedSteps) { }; } +function canUseInfrastructureNetworkLogOverride(failedSteps) { + return failedSteps.length > 0 && !failedSteps.some(step => matchesAny(step, testExecutionFailureStepPatterns)); +} + +function formatFailedStepLabel(failedSteps, failedStepText) { + const label = failedSteps.length === 1 ? 'Failed step' : 'Failed steps'; + return `${label} '${failedStepText}'`; +} + +function isSingleFailedStep(failedSteps) { + return failedSteps.length === 1; +} + +function getInfrastructureNetworkLogOverrideReason(failedStepText, matchedPattern) { + const patternText = matchedPattern ? ` Matched pattern: ${matchedPattern}.` : ''; + return `Failed step '${failedStepText}' will be retried because the job log shows a likely transient infrastructure network failure.${patternText}`; +} + +function getOutsideRetryRulesReason(failedSteps, failedStepText) { + return `${formatFailedStepLabel(failedSteps, failedStepText)} ${isSingleFailedStep(failedSteps) ? 'is' : 'are'} not covered by the retry-safe rerun rules.`; +} + +function getNoRetryMatchReason({ + failedSteps, + failedStepText, + hasRetryableStep, + hasIgnoredFailureStep, + hasTestExecutionFailureStep, + annotationsText, +}) { + const failedStepLabel = formatFailedStepLabel(failedSteps, failedStepText); + + if (hasTestExecutionFailureStep) { + return `${failedStepLabel} ${isSingleFailedStep(failedSteps) ? 'includes' : 'include'} a test execution failure, so the job was not retried without a high-confidence infrastructure override.`; + } + + if (hasIgnoredFailureStep) { + return `${failedStepLabel} ${isSingleFailedStep(failedSteps) ? 'is' : 'are'} only retried when the job shows a high-confidence infrastructure override, and none was found.`; + } + + if (hasRetryableStep) { + return `${failedStepLabel} did not include a retry-safe transient infrastructure signal in the job annotations.`; + } + + if (annotationsText) { + return 'The job annotations did not show a retry-safe transient infrastructure failure.'; + } + + return 'No retry-safe transient infrastructure signal was found in the job annotations or logs.'; +} + function classifyFailedJob(job, annotationsOrText, jobLogText = '') { const failedSteps = getFailedSteps(job); const failedStepText = failedSteps.join(' | '); const { hasRetryableStep, hasIgnoredFailureStep, shouldInspectAnnotations } = getFailureStepSignals(failedSteps); + const hasTestExecutionFailureStep = failedSteps.some(step => matchesAny(step, testExecutionFailureStepPatterns)); + const matchedInfrastructureNetworkLogOverridePattern = + !hasTestExecutionFailureStep + ? findMatchingPattern(jobLogText, infrastructureNetworkFailureLogOverridePatterns) + : null; + const matchesInfrastructureNetworkLogOverride = matchedInfrastructureNetworkLogOverridePattern !== null; if (!shouldInspectAnnotations) { + if (matchesInfrastructureNetworkLogOverride) { + return { + retryable: true, + failedSteps, + reason: getInfrastructureNetworkLogOverrideReason(failedStepText, matchedInfrastructureNetworkLogOverridePattern), + }; + } + return { retryable: false, failedSteps, - reason: 'Failed steps are outside the retry-safe allowlist.', + reason: getOutsideRetryRulesReason(failedSteps, failedStepText), }; } @@ -206,21 +280,25 @@ function classifyFailedJob(job, annotationsOrText, jobLogText = '') { }; } - const hasIgnoredBuildFailureStep = failedSteps.some(step => matchesAny(step, feedNetworkFailureStepPatterns)); - if (hasIgnoredBuildFailureStep && matchesAny(jobLogText, ignoredBuildFailureLogOverridePatterns)) { + if (matchesInfrastructureNetworkLogOverride) { return { retryable: true, failedSteps, - reason: `Ignored failed step '${failedStepText}' matched the feed network failure override allowlist.`, + reason: getInfrastructureNetworkLogOverrideReason(failedStepText, matchedInfrastructureNetworkLogOverridePattern), }; } return { retryable: false, failedSteps, - reason: annotationsText - ? 'Annotations did not match the transient allowlist.' - : 'No retry-safe step or annotation signature matched.', + reason: getNoRetryMatchReason({ + failedSteps, + failedStepText, + hasRetryableStep, + hasIgnoredFailureStep, + hasTestExecutionFailureStep, + annotationsText, + }), }; } @@ -243,7 +321,7 @@ async function analyzeFailedJobs({ jobs, getAnnotationsForJob, getJobLogTextForJ const shouldInspectLogs = !classification.retryable && getJobLogTextForJob && - failedSteps.some(step => matchesAny(step, feedNetworkFailureStepPatterns)); + canUseInfrastructureNetworkLogOverride(failedSteps); if (shouldInspectLogs) { classification = classifyFailedJob( @@ -276,6 +354,47 @@ function computeRerunEligibility({ dryRun, retryableCount, maxRetryableJobs = de return !dryRun && retryableCount > 0 && retryableCount <= maxRetryableJobs; } +function buildSummaryReference(url, text) { + return { url, text }; +} + +function addSummaryReference(summary, label, reference) { + summary.addRaw(`${label}: `); + + if (reference?.url) { + summary.addLink(reference.text, reference.url); + } + else { + summary.addRaw(reference?.text || 'not available'); + } + + return summary.addBreak(); +} + +function addSummaryCommentReferences(summary, postedComments) { + if (!postedComments?.length) { + summary.addRaw('Pull request comments: none posted').addBreak(); + return summary; + } + + summary.addRaw('Pull request comments:').addBreak(); + + for (const comment of postedComments) { + summary.addRaw('- '); + + if (comment.htmlUrl) { + summary.addLink(`PR #${comment.pullRequestNumber} comment`, comment.htmlUrl); + } + else { + summary.addRaw(`PR #${comment.pullRequestNumber} comment`); + } + + summary.addBreak(); + } + + return summary; +} + async function writeAnalysisSummary({ summary, failedJobs, @@ -285,9 +404,27 @@ async function writeAnalysisSummary({ dryRun, rerunEligible, sourceRunUrl, + sourceRunAttempt, }) { + const analyzedRunReference = buildSummaryReference( + buildWorkflowRunAttemptUrl(sourceRunUrl, sourceRunAttempt), + Number.isInteger(sourceRunAttempt) && sourceRunAttempt > 0 + ? `workflow run attempt ${sourceRunAttempt}` + : 'workflow run' + ); + const outcome = rerunEligible ? 'Rerun eligible' : 'Rerun skipped'; + const outcomeDetails = rerunEligible + ? `Matched ${retryableJobs.length} retry-safe job${retryableJobs.length === 1 ? '' : 's'} for rerun.` + : retryableJobs.length === 0 + ? 'No retry-safe jobs were found in the analyzed run.' + : dryRun + ? 'Dry run is enabled, so no rerun requests will be sent.' + : retryableJobs.length > maxRetryableJobs + ? `Matched ${retryableJobs.length} jobs, which exceeds the cap of ${maxRetryableJobs}.` + : 'The analyzed run did not satisfy the workflow safety rails for reruns.'; const summaryRows = [ [{ data: 'Category', header: true }, { data: 'Count', header: true }], + ['Outcome', outcome], ['Failed jobs inspected', String(failedJobs.length)], ['Retryable jobs', String(retryableJobs.length)], ['Skipped jobs', String(skippedJobs.length)], @@ -295,14 +432,15 @@ async function writeAnalysisSummary({ ['Dry run', String(dryRun)], ['Eligible to rerun', String(rerunEligible)], ]; - const sourceRunReference = sourceRunUrl - ? `[workflow run](${sourceRunUrl})` - : 'workflow run'; await summary - .addHeading('Transient CI rerun analysis') - .addTable(summaryRows) - .addRaw(`Source run: ${sourceRunReference}\n\n`); + .addHeading(outcome) + .addTable(summaryRows); + + addSummaryReference(summary, 'Analyzed run', analyzedRunReference) + .addRaw(outcomeDetails) + .addBreak() + .addBreak(); if (retryableJobs.length > 0) { await summary.addHeading('Retryable jobs', 2); @@ -320,12 +458,6 @@ async function writeAnalysisSummary({ ]); } - if (retryableJobs.length > maxRetryableJobs) { - await summary - .addHeading('Automatic rerun skipped', 2) - .addRaw(`Matched ${retryableJobs.length} jobs, which exceeds the cap of ${maxRetryableJobs}.`, true); - } - await summary.write(); } @@ -361,20 +493,17 @@ function buildWorkflowRunAttemptUrl(sourceRunUrl, runAttempt) { return `${sourceRunUrl.replace(/\/$/, '')}/attempts/${runAttempt}`; } -function buildWorkflowRunReferenceText(sourceRunUrl, runAttempt) { - const workflowRunUrl = buildWorkflowRunAttemptUrl(sourceRunUrl, runAttempt); - - if (!workflowRunUrl) { - return Number.isInteger(runAttempt) && runAttempt > 0 +function buildWorkflowRunReference(sourceRunUrl, runAttempt) { + return buildSummaryReference( + buildWorkflowRunAttemptUrl(sourceRunUrl, runAttempt), + Number.isInteger(runAttempt) && runAttempt > 0 ? `workflow run attempt ${runAttempt}` - : 'workflow run'; - } - - const label = Number.isInteger(runAttempt) && runAttempt > 0 - ? `workflow run attempt ${runAttempt}` - : 'workflow run'; + : 'workflow run' + ); +} - return `[${label}](${workflowRunUrl})`; +function formatMarkdownLink(text, url) { + return url ? `[${text}](${url})` : text; } async function getLatestRunAttempt({ github, owner, repo, runId }) { @@ -403,8 +532,8 @@ function buildPullRequestCommentBody({ retryableJobs, }) { return [ - `The transient CI rerun workflow requested reruns for the following jobs after analyzing [the failed attempt](${failedAttemptUrl}).`, - `GitHub's job rerun API also reruns dependent jobs, so the retry is being tracked in [the rerun attempt](${rerunAttemptUrl}).`, + `The transient CI rerun workflow requested reruns for the following jobs after analyzing ${formatMarkdownLink('the failed attempt', failedAttemptUrl)}.`, + `GitHub's job rerun API also reruns dependent jobs, so the retry is being tracked in ${formatMarkdownLink('the rerun attempt', rerunAttemptUrl)}.`, 'The job links below point to the failed attempt that matched the retry-safe transient failure rules.', '', ...retryableJobs.map(job => { @@ -418,14 +547,23 @@ function buildPullRequestCommentBody({ } async function addPullRequestComments({ github, owner, repo, pullRequestNumbers, body }) { + const postedComments = []; + for (const pullRequestNumber of pullRequestNumbers) { - await github.request('POST /repos/{owner}/{repo}/issues/{issue_number}/comments', { + const response = await github.request('POST /repos/{owner}/{repo}/issues/{issue_number}/comments', { owner, repo, issue_number: pullRequestNumber, body, }); + + postedComments.push({ + pullRequestNumber, + htmlUrl: response.data?.html_url || null, + }); } + + return postedComments; } async function rerunMatchedJobs({ @@ -451,9 +589,21 @@ async function rerunMatchedJobs({ }); if (pullRequestNumbers.length > 0 && openPullRequestNumbers.length === 0) { + const failedAttemptReference = buildWorkflowRunReference(sourceRunUrl, sourceRunAttempt); await summary - .addHeading('Automatic rerun skipped') - .addRaw('All associated pull requests are closed. No jobs were rerun.', true) + .addHeading('Rerun skipped'); + + addSummaryReference(summary, 'Analyzed run', failedAttemptReference) + .addRaw('All associated pull requests are closed. No jobs were rerun.') + .addBreak() + .addBreak(); + + await summary + .addHeading('Retryable jobs', 2) + .addTable([ + [{ data: 'Job', header: true }, { data: 'Reason', header: true }], + ...retryableJobs.map(job => [job.name, job.reason]), + ]) .write(); return; } @@ -480,41 +630,37 @@ async function rerunMatchedJobs({ ? latestRunAttempt : normalizedSourceRunAttempt ? normalizedSourceRunAttempt + 1 : null; const rerunAttemptUrl = buildWorkflowRunAttemptUrl(sourceRunUrl, rerunAttemptNumber); - const failedAttemptReference = buildWorkflowRunReferenceText(sourceRunUrl, normalizedSourceRunAttempt); - const rerunAttemptReference = buildWorkflowRunReferenceText(sourceRunUrl, rerunAttemptNumber); + const failedAttemptReference = buildWorkflowRunReference(sourceRunUrl, normalizedSourceRunAttempt); + const rerunAttemptReference = buildWorkflowRunReference(sourceRunUrl, rerunAttemptNumber); + let postedComments = []; if (openPullRequestNumbers.length > 0) { - await addPullRequestComments({ + postedComments = await addPullRequestComments({ github, owner, repo, pullRequestNumbers: openPullRequestNumbers, body: buildPullRequestCommentBody({ - failedAttemptUrl, - rerunAttemptUrl, + failedAttemptUrl: failedAttemptReference.url, + rerunAttemptUrl: rerunAttemptReference.url, retryableJobs, }), }); } - const commentedPullRequestsText = openPullRequestNumbers.length > 0 - ? openPullRequestNumbers.map(number => `#${number}`).join(', ') - : null; - const summaryBuilder = summary - .addHeading('Rerun requested') - .addRaw(`Failed attempt: ${failedAttemptReference}\nRerun attempt: ${rerunAttemptReference}\n\n`) + .addHeading('Rerun requested'); + + addSummaryReference(summaryBuilder, 'Failed attempt', failedAttemptReference); + addSummaryReference(summaryBuilder, 'Rerun attempt', rerunAttemptReference); + addSummaryCommentReferences(summaryBuilder, postedComments).addBreak(); + + summaryBuilder .addTable([ [{ data: 'Job', header: true }, { data: 'Reason', header: true }], ...retryableJobs.map(job => [job.name, job.reason]), ]); - if (commentedPullRequestsText) { - summaryBuilder - .addHeading('Pull request comments', 2) - .addRaw(`Posted rerun details to ${commentedPullRequestsText}.`, true); - } - await summaryBuilder.write(); } diff --git a/.github/workflows/auto-rerun-transient-ci-failures.yml b/.github/workflows/auto-rerun-transient-ci-failures.yml index 2613e71eb13..4f7b368b6e5 100644 --- a/.github/workflows/auto-rerun-transient-ci-failures.yml +++ b/.github/workflows/auto-rerun-transient-ci-failures.yml @@ -15,6 +15,11 @@ on: description: 'CI workflow run ID to inspect' required: true type: number + dry_run: + description: 'Inspect and summarize without requesting reruns' + required: false + default: false + type: boolean concurrency: group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && inputs.run_id || github.event.workflow_run.id }} @@ -57,6 +62,7 @@ jobs: env: GITHUB_TOKEN: ${{ github.token }} MANUAL_RUN_ID: ${{ inputs.run_id }} + MANUAL_DRY_RUN: ${{ inputs.dry_run }} with: script: | const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js'); @@ -103,6 +109,14 @@ jobs: return response.data; } + function parseManualDryRun() { + if (!isWorkflowDispatch) { + return false; + } + + return String(process.env.MANUAL_DRY_RUN).toLowerCase() === 'true'; + } + async function listJobsForAttempt(runId, attemptNumber) { return paginate( 'GET /repos/{owner}/{repo}/actions/runs/{run_id}/attempts/{attempt_number}/jobs', @@ -173,7 +187,7 @@ jobs: } const workflowRun = await getWorkflowRun(); - const dryRun = isWorkflowDispatch; + const dryRun = parseManualDryRun(); const sourceRunUrl = workflowRun.html_url || `https://github.com/${owner}/${repo}/actions/runs/${workflowRun.id}`; core.setOutput('source_run_id', String(workflowRun.id)); @@ -236,6 +250,7 @@ jobs: dryRun, rerunEligible, sourceRunUrl, + sourceRunAttempt: runAttempt, }); if (retryableJobs.length === 0) { @@ -243,11 +258,6 @@ jobs: return; } - if (dryRun) { - console.log('workflow_dispatch runs in dry-run mode. No jobs will be rerun.'); - return; - } - rerun-transient-failures: name: Rerun transient CI failures needs: [analyze-transient-failures] diff --git a/docs/ci/auto-rerun-transient-ci-failures.md b/docs/ci/auto-rerun-transient-ci-failures.md index 38bfb0aecdf..a6329136dd8 100644 --- a/docs/ci/auto-rerun-transient-ci-failures.md +++ b/docs/ci/auto-rerun-transient-ci-failures.md @@ -12,7 +12,7 @@ It is intentionally conservative: - it does not rerun every failed job in a run - it treats mixed deterministic failures plus transient post-step noise as non-retryable by default -- it keeps `workflow_dispatch` in dry-run mode for historical inspection and matcher tuning +- it keeps `workflow_dispatch` behind the same matcher and safety rails as automatic execution, with an optional dry-run mode for inspection-only runs ## Matcher behavior @@ -23,16 +23,18 @@ It is intentionally conservative: - Keep the mixed-failure veto: if an ignored step such as `Run tests*` failed, do not rerun the job based only on unrelated transient post-step noise. - Allow a narrow override when an ignored failed step is paired with a high-confidence job-level infrastructure annotation such as runner loss or action-download failure. - Allow a narrow override for Windows jobs whose failures are limited to post-test cleanup or upload steps when the annotations report process initialization failure `-1073741502` (`0xC0000142`). -- Allow a narrow log-based override for supported CI SDK bootstrap, build, package, and validation steps when the job log shows `Unable to load the service index` against the approved `dnceng` public feeds. +- Allow a narrow log-based override for non-test-execution failures when the job log shows high-confidence infrastructure network failures against approved `dnceng` public feeds, `builds.dotnet.microsoft.com`, `api.github.com`, or `github.com`. ## Safety rails -- `workflow_dispatch` remains dry-run only. It exists for historical inspection and matcher tuning, not for issuing reruns. +- `workflow_dispatch` can inspect any `CI` workflow run by ID and request reruns when the same retry-safety rules are satisfied. +- `workflow_dispatch` also exposes an optional `dry_run` input so manual runs can produce the analysis summary without sending rerun requests. - Automatic rerun requires at least one retryable job. - Automatic rerun is suppressed when matched jobs exceed the configured cap. - Before issuing reruns, the workflow confirms that at least one associated pull request is still open. - The workflow targets only the matched jobs when issuing rerun requests rather than rerunning the entire source run, although GitHub's job-rerun API also reruns dependent jobs automatically. -- The workflow summary links to the analyzed workflow run and, when reruns are requested, to both the failed attempt and the rerun attempt. +- The workflow summary clearly states whether reruns were skipped, are eligible, or were requested, and links to the analyzed workflow run. +- When reruns are requested, the rerun summary also links to both the failed attempt and the rerun attempt, plus any posted pull request comments. - After successful rerun requests, the workflow comments on the open associated pull request with links to the failed attempt, the rerun attempt, per-job failed-attempt links, and retry reasons. ## Tests @@ -44,4 +46,4 @@ Those tests are intentionally behavior-focused rather than regex-focused: - they use representative fixtures for each supported behavior - they keep representative job and step fixtures anchored to the current CI workflow names so matcher coverage does not drift from the implementation - they cover the mixed-failure veto and ignored-step override explicitly -- they keep only a minimal set of YAML contract checks for safety rails such as `workflow_dispatch` dry-run mode, first-attempt-only automatic reruns, and gating the rerun job on `rerun_eligible` +- they keep only a minimal set of YAML contract checks for safety rails such as first-attempt-only automatic reruns, the optional manual `dry_run` override, enabling manual reruns through `workflow_dispatch`, and gating the rerun job on `rerun_eligible` diff --git a/tests/Infrastructure.Tests/WorkflowScripts/AutoRerunTransientCiFailuresTests.cs b/tests/Infrastructure.Tests/WorkflowScripts/AutoRerunTransientCiFailuresTests.cs index c49947e47aa..68404ba76d5 100644 --- a/tests/Infrastructure.Tests/WorkflowScripts/AutoRerunTransientCiFailuresTests.cs +++ b/tests/Infrastructure.Tests/WorkflowScripts/AutoRerunTransientCiFailuresTests.cs @@ -64,7 +64,7 @@ public async Task SkipsJobsWhoseFailedStepsAreOutsideRetrySafeAllowlist() Assert.Empty(result.RetryableJobs); Assert.Single(result.SkippedJobs); - Assert.Equal("Failed steps are outside the retry-safe allowlist.", result.SkippedJobs[0].Reason); + Assert.Equal("Failed step 'Compile project' is not covered by the retry-safe rerun rules.", result.SkippedJobs[0].Reason); } [Fact] @@ -77,7 +77,7 @@ public async Task SkipsRetrySafeStepsWhenAnnotationsAreGeneric() Assert.Empty(result.RetryableJobs); Assert.Single(result.SkippedJobs); - Assert.Equal("Annotations did not match the transient allowlist.", result.SkippedJobs[0].Reason); + Assert.Equal("Failed step 'Set up .NET Core' did not include a retry-safe transient infrastructure signal in the job annotations.", result.SkippedJobs[0].Reason); } [Theory] @@ -105,7 +105,9 @@ public async Task KeepsMixedFailureVetoWhenIgnoredTestStepsFailAlongsideRetrySaf Assert.Empty(result.RetryableJobs); Assert.Single(result.SkippedJobs); - Assert.Equal("Annotations did not match the transient allowlist.", result.SkippedJobs[0].Reason); + Assert.Equal( + "Failed steps 'Run tests (Windows) | Upload logs, and test results' include a test execution failure, so the job was not retried without a high-confidence infrastructure override.", + result.SkippedJobs[0].Reason); } [Fact] @@ -156,7 +158,9 @@ public async Task DoesNotOverrideWindowsProcessInitializationFailuresWhenTestExe Assert.Empty(result.RetryableJobs); Assert.Single(result.SkippedJobs); - Assert.Equal("Annotations did not match the transient allowlist.", result.SkippedJobs[0].Reason); + Assert.Equal( + "Failed steps 'Run tests (Windows) | Upload logs, and test results | Generate test results summary' include a test execution failure, so the job was not retried without a high-confidence infrastructure override.", + result.SkippedJobs[0].Reason); } [Fact] @@ -171,7 +175,9 @@ public async Task AllowsNarrowLogBasedOverrideForDncengFeedServiceIndexFailuresI "error : Unable to load the service index for source https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-public/nuget/v3/index.json."); Assert.Single(result.RetryableJobs); - Assert.Equal("Ignored failed step 'Build test project' matched the feed network failure override allowlist.", result.RetryableJobs[0].Reason); + Assert.Equal( + "Failed step 'Build test project' will be retried because the job log shows a likely transient infrastructure network failure. Matched pattern: /Unable to load the service index for source https:\\/\\/(?:pkgs\\.dev\\.azure\\.com\\/dnceng|dnceng\\.pkgs\\.visualstudio\\.com)\\/public\\/_packaging\\//i.", + result.RetryableJobs[0].Reason); } [Theory] @@ -190,7 +196,44 @@ public async Task AllowsSameFeedOverrideForOtherCiBootstrapBuildAndValidationSte "error : Unable to load the service index for source https://dnceng.pkgs.visualstudio.com/public/_packaging/dotnet9-transport/nuget/v3/index.json."); Assert.Single(result.RetryableJobs); - Assert.Equal($"Ignored failed step '{failedStep}' matched the feed network failure override allowlist.", result.RetryableJobs[0].Reason); + Assert.Equal( + $"Failed step '{failedStep}' will be retried because the job log shows a likely transient infrastructure network failure. Matched pattern: /Unable to load the service index for source https:\\/\\/(?:pkgs\\.dev\\.azure\\.com\\/dnceng|dnceng\\.pkgs\\.visualstudio\\.com)\\/public\\/_packaging\\//i.", + result.RetryableJobs[0].Reason); + } + + [Fact] + [RequiresTools(["node"])] + public async Task AllowsSameNetworkOverrideForBroaderBootstrapStepsOutsideTheOldAllowlist() + { + WorkflowJob job = CreateJob(failedSteps: ["Run ./.github/actions/enumerate-tests"]); + + AnalyzeFailedJobsResult result = await AnalyzeSingleJobAsync( + job, + "Process completed with exit code 1.", + "error : Unable to load the service index for source https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-public/nuget/v3/index.json."); + + Assert.Single(result.RetryableJobs); + Assert.Equal( + "Failed step 'Run ./.github/actions/enumerate-tests' will be retried because the job log shows a likely transient infrastructure network failure. Matched pattern: /Unable to load the service index for source https:\\/\\/(?:pkgs\\.dev\\.azure\\.com\\/dnceng|dnceng\\.pkgs\\.visualstudio\\.com)\\/public\\/_packaging\\//i.", + result.RetryableJobs[0].Reason); + } + + [Fact] + [RequiresTools(["node"])] + public async Task DoesNotApplyBroadNetworkOverrideWhenTestExecutionFailed() + { + WorkflowJob job = CreateJob(failedSteps: ["Run tests (Windows)"]); + + AnalyzeFailedJobsResult result = await AnalyzeSingleJobAsync( + job, + "Process completed with exit code 1.", + "error : Unable to load the service index for source https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-public/nuget/v3/index.json."); + + Assert.Empty(result.RetryableJobs); + Assert.Single(result.SkippedJobs); + Assert.Equal( + "Failed step 'Run tests (Windows)' includes a test execution failure, so the job was not retried without a high-confidence infrastructure override.", + result.SkippedJobs[0].Reason); } [Fact] @@ -206,7 +249,9 @@ public async Task DoesNotRetryIgnoredBuildStepsWhenTheLogLacksFeedNetworkSignatu Assert.Empty(result.RetryableJobs); Assert.Single(result.SkippedJobs); - Assert.Equal("Annotations did not match the transient allowlist.", result.SkippedJobs[0].Reason); + Assert.Equal( + "Failed step 'Build test project' is only retried when the job shows a high-confidence infrastructure override, and none was found.", + result.SkippedJobs[0].Reason); } [Fact] @@ -266,7 +311,7 @@ public async Task GetCheckRunIdForJobReturnsNullWhenNoCheckRunIdCanBeResolved() [Fact] [RequiresTools(["node"])] - public async Task WorkflowDispatchStaysDryRunEvenWhenRetryableJobsExist() + public async Task ComputeRerunEligibilityDisablesRerunsWhenDryRunIsEnabled() { bool rerunEligible = await InvokeHarnessAsync( "computeRerunEligibility", @@ -356,6 +401,7 @@ public async Task RepresentativeWorkflowFixturesStayAlignedWithCurrentWorkflowDe ], [".github/workflows/tests.yml"] = [ + "- uses: ./.github/actions/enumerate-tests", "name: Final Test Results", ], }; @@ -377,13 +423,18 @@ public async Task WorkflowYamlKeepsDocumentedSafetyRails() string workflowText = await ReadRepoFileAsync(".github/workflows/auto-rerun-transient-ci-failures.yml"); Assert.Contains("workflow_dispatch:", workflowText); + Assert.Contains("dry_run:", workflowText); + Assert.Contains("default: false", workflowText); Assert.Contains("github.event.workflow_run.run_attempt == 1", workflowText); Assert.Contains("needs.analyze-transient-failures.outputs.rerun_eligible == 'true'", workflowText); + Assert.Contains("MANUAL_DRY_RUN", workflowText); + Assert.Contains("function parseManualDryRun()", workflowText); + Assert.Contains("const dryRun = parseManualDryRun();", workflowText); } [Fact] [RequiresTools(["node"])] - public async Task WriteAnalysisSummaryLinksTheAnalyzedWorkflowRun() + public async Task WriteAnalysisSummaryUsesExplicitOutcomeHeadingAndClickableAnalyzedRunLink() { SummaryResult result = await InvokeHarnessAsync( "writeAnalysisSummary", @@ -400,11 +451,18 @@ public async Task WriteAnalysisSummaryLinksTheAnalyzedWorkflowRun() skippedJobs = Array.Empty(), dryRun = false, rerunEligible = true, + sourceRunAttempt = 1, sourceRunUrl = "https://github.com/dotnet/aspire/actions/runs/123" }); - SummaryEvent rawEvent = Assert.Single(result.Events, e => e.Type == "raw"); - Assert.Equal("Source run: [workflow run](https://github.com/dotnet/aspire/actions/runs/123)\n\n", rawEvent.Text); + SummaryEvent headingEvent = Assert.Single(result.Events, e => e.Type == "heading" && e.Level == 1); + Assert.Equal("Rerun eligible", headingEvent.Text); + + SummaryEvent linkEvent = Assert.Single(result.Events, e => e.Type == "link" && e.Text == "workflow run attempt 1"); + Assert.Equal("https://github.com/dotnet/aspire/actions/runs/123/attempts/1", linkEvent.Href); + + SummaryEvent rawEvent = Assert.Single(result.Events, e => e.Type == "raw" && e.Text == "Matched 1 retry-safe job for rerun."); + Assert.Equal("Matched 1 retry-safe job for rerun.", rawEvent.Text); } [Fact] @@ -457,6 +515,10 @@ public async Task RerunMatchedJobsRequestsOneRerunPerSelectedJobAndWritesTheSumm { ["15110"] = "open" }, + commentHtmlUrlByNumber = new Dictionary + { + ["15110"] = "https://github.com/dotnet/aspire/pull/15110#issuecomment-123" + }, latestRunAttempt = 2, sourceRunId = 123, sourceRunAttempt = 1, @@ -496,9 +558,17 @@ public async Task RerunMatchedJobsRequestsOneRerunPerSelectedJobAndWritesTheSumm request.Payload.GetProperty("body").GetString()); }); - SummaryEvent rawEvent = Assert.Single(result.Events, e => e.Type == "raw" && e.Text is not null && e.Text.Contains("Failed attempt:")); - Assert.Contains("Failed attempt: [workflow run attempt 1](https://github.com/dotnet/aspire/actions/runs/123/attempts/1)", rawEvent.Text); - Assert.Contains("Rerun attempt: [workflow run attempt 2](https://github.com/dotnet/aspire/actions/runs/123/attempts/2)", rawEvent.Text); + SummaryEvent headingEvent = Assert.Single(result.Events, e => e.Type == "heading" && e.Level == 1); + Assert.Equal("Rerun requested", headingEvent.Text); + + SummaryEvent failedAttemptLink = Assert.Single(result.Events, e => e.Type == "link" && e.Text == "workflow run attempt 1"); + Assert.Equal("https://github.com/dotnet/aspire/actions/runs/123/attempts/1", failedAttemptLink.Href); + + SummaryEvent rerunAttemptLink = Assert.Single(result.Events, e => e.Type == "link" && e.Text == "workflow run attempt 2"); + Assert.Equal("https://github.com/dotnet/aspire/actions/runs/123/attempts/2", rerunAttemptLink.Href); + + SummaryEvent commentLink = Assert.Single(result.Events, e => e.Type == "link" && e.Text == "PR #15110 comment"); + Assert.Equal("https://github.com/dotnet/aspire/pull/15110#issuecomment-123", commentLink.Href); SummaryEvent tableEvent = Assert.Single(result.Events, e => e.Type == "table"); Assert.Equal("Job", tableEvent.Rows[0][0].GetProperty("data").GetString()); @@ -508,8 +578,7 @@ public async Task RerunMatchedJobsRequestsOneRerunPerSelectedJobAndWritesTheSumm Assert.Equal("Tests / Two", tableEvent.Rows[2][0].GetString()); Assert.Equal("Reason two", tableEvent.Rows[2][1].GetString()); - SummaryEvent commentEvent = Assert.Single(result.Events, e => e.Type == "raw" && e.Text is not null && e.Text.Contains("Posted rerun details to #15110.")); - Assert.Contains("Posted rerun details to #15110.", commentEvent.Text); + Assert.Contains(result.Events, e => e.Type == "raw" && e.Text == "Pull request comments:"); } [Fact] @@ -544,9 +613,12 @@ public async Task RerunMatchedJobsSkipsRerunsWhenAllAssociatedPullRequestsAreClo Assert.Equal("GET /repos/{owner}/{repo}/issues/{issue_number}", request.Route); Assert.Equal(15110, request.Payload.GetProperty("issue_number").GetInt32()); - SummaryEvent skippedHeading = Assert.Single(result.Events, e => e.Type == "heading" && e.Text == "Automatic rerun skipped"); + SummaryEvent skippedHeading = Assert.Single(result.Events, e => e.Type == "heading" && e.Text == "Rerun skipped"); Assert.Equal(1, skippedHeading.Level); + SummaryEvent analyzedRunLink = Assert.Single(result.Events, e => e.Type == "link" && e.Text == "workflow run"); + Assert.Equal("https://github.com/dotnet/aspire/actions/runs/123", analyzedRunLink.Href); + SummaryEvent skippedRaw = Assert.Single(result.Events, e => e.Type == "raw" && e.Text is not null && e.Text.Contains("All associated pull requests are closed.")); Assert.Contains("All associated pull requests are closed. No jobs were rerun.", skippedRaw.Text); } @@ -691,6 +763,7 @@ private sealed class SummaryEvent { public string Type { get; init; } = string.Empty; public string? Text { get; init; } + public string? Href { get; init; } public int? Level { get; init; } public bool? AddEol { get; init; } public JsonElement[][] Rows { get; init; } = []; diff --git a/tests/Infrastructure.Tests/WorkflowScripts/auto-rerun-transient-ci-failures.harness.js b/tests/Infrastructure.Tests/WorkflowScripts/auto-rerun-transient-ci-failures.harness.js index 5f2db9c0afb..50e84244bd5 100644 --- a/tests/Infrastructure.Tests/WorkflowScripts/auto-rerun-transient-ci-failures.harness.js +++ b/tests/Infrastructure.Tests/WorkflowScripts/auto-rerun-transient-ci-failures.harness.js @@ -21,6 +21,16 @@ class SummaryRecorder { return this; } + addLink(text, href) { + this.events.push({ type: 'link', text, href }); + return this; + } + + addBreak() { + this.events.push({ type: 'break' }); + return this; + } + async write() { this.events.push({ type: 'write' }); return this; @@ -111,6 +121,18 @@ function createGitHubRecorder(payload, requests) { }; } + if (route === 'POST /repos/{owner}/{repo}/issues/{issue_number}/comments') { + const issueNumber = String(requestPayload.issue_number); + const htmlUrl = payload.commentHtmlUrlByNumber?.[issueNumber] + ?? `https://github.com/${requestPayload.owner}/${requestPayload.repo}/pull/${issueNumber}#issuecomment-${issueNumber}`; + + return { + data: { + html_url: htmlUrl, + }, + }; + } + return { data: {} }; }, };