|
| 1 | +import os |
| 2 | +import shutil |
| 3 | +import zipfile |
| 4 | +from pathlib import Path |
| 5 | + |
1 | 6 | import pandas as pd |
2 | 7 | import requests |
3 | 8 | from tqdm import tqdm |
4 | | -from pathlib import Path |
5 | | -import os |
6 | | -import zipfile |
7 | | -import shutil |
8 | 9 |
|
9 | 10 | STATE_NAMES = [ |
10 | 11 | "Alabama", |
|
112 | 113 | ] |
113 | 114 | STATE_CODES = [x.lower() for x in STATE_CODES] |
114 | 115 | DATA_FOLDER = Path("data") |
115 | | -DATA_FOLDER.mkdir(exist_ok=True) |
116 | | - |
117 | | -dfs = [] |
118 | | -for state_name, state_code in tqdm( |
119 | | - zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data" |
120 | | -): |
121 | | - data_url = f"https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip" |
122 | | - # Download the file and save to a folder called "block_level_population_data_by_state/" |
123 | | - r = requests.get(data_url) |
124 | | - with open(DATA_FOLDER / f"{state_code}2020.pl.zip", "wb") as f: |
125 | | - f.write(r.content) |
126 | | - # Unzip the file |
127 | | - with zipfile.ZipFile(DATA_FOLDER / f"{state_code}2020.pl.zip", "r") as zip_ref: |
128 | | - zip_ref.extractall(DATA_FOLDER / f"{state_code}2020.pl") |
129 | | - # Delete the zip file |
130 | | - os.remove(DATA_FOLDER / f"{state_code}2020.pl.zip") |
131 | | - # Read the file |
132 | | - df = pd.read_csv( |
133 | | - DATA_FOLDER / f"{state_code}2020.pl/{state_code}geo2020.pl", |
134 | | - sep="|", |
135 | | - low_memory=False, |
136 | | - encoding="ISO-8859-1", |
137 | | - ) |
138 | | - df["state"] = state_code |
139 | | - dfs += [df] |
140 | | - full_df = pd.concat(dfs) |
141 | | - full_df.to_csv(DATA_FOLDER / "50_state_block_data.csv", index=False) |
142 | | - shutil.rmtree(DATA_FOLDER / f"{state_code}2020.pl") |
| 116 | +DOWNLOAD_TIMEOUT_SECONDS = 60 |
| 117 | +DOWNLOAD_CHUNK_SIZE = 1024 * 1024 |
| 118 | +MAX_ARCHIVE_DOWNLOAD_BYTES = 100 * 1024 * 1024 |
| 119 | +MAX_ARCHIVE_MEMBER_COUNT = 64 |
| 120 | +MAX_ARCHIVE_UNCOMPRESSED_BYTES = 512 * 1024 * 1024 |
| 121 | + |
| 122 | + |
def _download_with_limits(url: str, destination: Path) -> None:
    """Stream *url* to *destination* while enforcing a hard size cap.

    The response is fetched in chunks so the whole archive is never held in
    memory. If the payload grows past ``MAX_ARCHIVE_DOWNLOAD_BYTES`` a
    ``ValueError`` is raised; on any failure the partially written file is
    deleted before the exception propagates.
    """
    target = Path(destination)
    written = 0
    try:
        with requests.get(
            url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()
            with target.open("wb") as out:
                for block in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    # Keep-alive heartbeats arrive as empty chunks; skip them.
                    if not block:
                        continue
                    written += len(block)
                    if written > MAX_ARCHIVE_DOWNLOAD_BYTES:
                        raise ValueError(
                            f"Downloaded archive exceeds {MAX_ARCHIVE_DOWNLOAD_BYTES} bytes"
                        )
                    out.write(block)
    except Exception:
        # Never leave a truncated/oversized partial file behind.
        target.unlink(missing_ok=True)
        raise
| 144 | + |
| 145 | + |
def _safe_extract(zip_ref: zipfile.ZipFile, destination: Path) -> None:
    """Extract an archive only if every member stays within the destination.

    Guards against three abuses before touching the filesystem contents:
    too many members, an excessive total uncompressed size (zip bomb), and
    path-traversal member names that would escape *destination*.
    Raises ``ValueError`` when any guard trips.
    """
    root = Path(destination).resolve()
    root.mkdir(parents=True, exist_ok=True)

    entries = zip_ref.infolist()
    if len(entries) > MAX_ARCHIVE_MEMBER_COUNT:
        raise ValueError(
            f"Archive contains {len(entries)} files, exceeding the limit of "
            f"{MAX_ARCHIVE_MEMBER_COUNT}"
        )

    running_size = 0
    for entry in entries:
        running_size += entry.file_size
        if running_size > MAX_ARCHIVE_UNCOMPRESSED_BYTES:
            raise ValueError(
                "Archive exceeds the allowed uncompressed size limit of "
                f"{MAX_ARCHIVE_UNCOMPRESSED_BYTES} bytes"
            )
        # Resolve the would-be extraction path and require that it is the
        # destination itself or lies somewhere beneath it.
        resolved = (root / entry.filename).resolve()
        inside = resolved == root or root in resolved.parents
        if not inside:
            raise ValueError(f"Unsafe path in zip archive: {entry.filename}")

    # All members validated; perform the actual extraction in one call.
    zip_ref.extractall(root)
| 170 | + |
| 171 | + |
def download_state_block_data(data_folder: Path = DATA_FOLDER) -> pd.DataFrame:
    """Download, extract, and combine 2020 Census PL 94-171 block data.

    For each state, the zipped redistricting file is downloaded (with size
    limits), safely extracted, and its pipe-delimited geo file read into a
    DataFrame tagged with the state's code. All states are concatenated and
    written to ``50_state_block_data.csv`` inside *data_folder*.

    Args:
        data_folder: Directory used for downloads, temporary extraction
            directories, and the combined CSV output.

    Returns:
        The concatenated DataFrame covering every state.
    """
    data_folder = Path(data_folder)
    data_folder.mkdir(parents=True, exist_ok=True)

    dfs = []
    # total= lets tqdm render a real progress bar; a bare zip() has no length.
    for state_name, state_code in tqdm(
        zip(STATE_NAMES, STATE_CODES),
        total=min(len(STATE_NAMES), len(STATE_CODES)),
        desc="Downloading Census data",
    ):
        data_url = (
            "https://www2.census.gov/programs-surveys/decennial/2020/data/"
            f"01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
        )
        zip_path = data_folder / f"{state_code}2020.pl.zip"
        extract_dir = data_folder / f"{state_code}2020.pl"

        _download_with_limits(data_url, zip_path)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            _safe_extract(zip_ref, extract_dir)

        # The zip is no longer needed once extracted.
        os.remove(zip_path)

        df = pd.read_csv(
            extract_dir / f"{state_code}geo2020.pl",
            sep="|",
            low_memory=False,
            encoding="ISO-8859-1",
        )
        df["state"] = state_code
        dfs.append(df)

        # BUG FIX: remove each state's extraction directory inside the loop.
        # Previously this ran once after the loop, so only the final state's
        # directory was cleaned up and the other 49 were left on disk.
        shutil.rmtree(extract_dir)

    full_df = pd.concat(dfs)
    full_df.to_csv(data_folder / "50_state_block_data.csv", index=False)

    return full_df
| 208 | + |
| 209 | + |
def main() -> None:
    """Script entry point: fetch and combine all state block files."""
    download_state_block_data(data_folder=DATA_FOLDER)
| 212 | + |
| 213 | + |
# Run the full download pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
0 commit comments