diff --git a/.github/workflows/linkcheck.yml b/.github/workflows/linkcheck.yml
index 22022eb9..574397a0 100644
--- a/.github/workflows/linkcheck.yml
+++ b/.github/workflows/linkcheck.yml
@@ -24,6 +24,46 @@ jobs:
         run: |
           mkdir -p _site
           curl -sL "${{ steps.release.outputs.asset-url }}" | tar -xz -C _site
+      - name: Pre-process HTML to exclude known bot-blocking domains
+        env:
+          # Domains that block automated crawlers (space-separated).
+          # These are also listed in linkcheck_ignore in lectures/_config.yml.
+          SKIP_DOMAINS: "fred.stlouisfed.org"
+        run: |
+          python3 - <<'PYEOF'
+          import os
+          import re
+          from pathlib import Path
+          skip_domains = os.environ.get('SKIP_DOMAINS', '').split()
+          if not skip_domains:
+              print("SKIP_DOMAINS is empty; nothing to pre-process")
+              raise SystemExit(0)
+          # Compile one pattern for every domain, once, outside the file loop.
+          # The lookahead makes the host end right after the domain, so a
+          # look-alike host such as fred.stlouisfed.org.example.com is untouched.
+          # IGNORECASE because host names are case-insensitive.
+          hosts = '|'.join(re.escape(domain) for domain in skip_domains)
+          pattern = re.compile(
+              r'href="https?://(?:' + hosts + r')(?=[/:?#"])[^"]*"',
+              re.IGNORECASE,
+          )
+          scanned = 0
+          modified = 0
+          for html_file in Path('_site').rglob('*.html'):
+              try:
+                  content = html_file.read_text(encoding='utf-8')
+              except UnicodeDecodeError as exc:
+                  print(f"Skipping {html_file}: encoding error - {exc}")
+                  continue
+              scanned += 1
+              # Neutralise matching links so the link checker never requests them.
+              updated = pattern.sub('href="#"', content)
+              # Only rewrite files that actually changed.
+              if updated != content:
+                  html_file.write_text(updated, encoding='utf-8')
+                  modified += 1
+          print(f"Pre-processing complete ({scanned} files scanned, {modified} modified)")
+          PYEOF
       - name: AI-Powered Link Checker
         uses: QuantEcon/action-link-checker@main
         with: