Skip to content

Commit ec4d2e7

Browse files
committed
Validate census archive extraction paths
1 parent 3c360ec commit ec4d2e7

3 files changed

Lines changed: 185 additions & 32 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Validate extracted census archive paths before unpacking downloaded state block archives.
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import zipfile
2+
from pathlib import Path
3+
4+
import pytest
5+
6+
from policyengine_us.tools.geography.download_50_state_census_block_data import (
7+
MAX_ARCHIVE_DOWNLOAD_BYTES,
8+
MAX_ARCHIVE_UNCOMPRESSED_BYTES,
9+
_download_with_limits,
10+
_safe_extract,
11+
)
12+
13+
14+
def test_safe_extract_rejects_zip_slip(tmp_path):
    """A member whose name climbs out of the target directory must be refused."""
    malicious_zip = tmp_path / "malicious.zip"
    target_dir = tmp_path / "extract"

    with zipfile.ZipFile(malicious_zip, "w") as archive:
        archive.writestr("../evil.txt", "pwned")

    with zipfile.ZipFile(malicious_zip, "r") as archive:
        with pytest.raises(ValueError, match="Unsafe path"):
            _safe_extract(archive, target_dir)

    # Nothing may have escaped to either candidate location.
    assert not (tmp_path / "evil.txt").exists()
    assert not (target_dir / "evil.txt").exists()
27+
28+
29+
def test_safe_extract_rejects_oversized_archives(tmp_path):
    """An archive whose declared uncompressed size breaks the cap is never unpacked."""

    class StubMember:
        def __init__(self, filename: str, file_size: int):
            self.filename = filename
            self.file_size = file_size

    class StubZip:
        # Flipped to True only if extraction is (wrongly) attempted.
        extracted = False

        def infolist(self):
            return [StubMember("data.txt", MAX_ARCHIVE_UNCOMPRESSED_BYTES + 1)]

        def extractall(self, destination):
            self.extracted = True

    stub = StubZip()

    with pytest.raises(ValueError, match="uncompressed size limit"):
        _safe_extract(stub, tmp_path / "extract")

    assert stub.extracted is False
52+
53+
54+
def test_download_with_limits_rejects_oversized_responses(tmp_path, monkeypatch):
    """A response streaming past the byte cap aborts and leaves no partial file."""

    class OversizedResponse:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def raise_for_status(self):
            return None

        def iter_content(self, chunk_size):
            # Exactly the cap, then one byte over.
            yield b"x" * MAX_ARCHIVE_DOWNLOAD_BYTES
            yield b"x"

    monkeypatch.setattr(
        "requests.get",
        lambda *args, **kwargs: OversizedResponse(),
    )

    target = Path(tmp_path) / "archive.zip"

    with pytest.raises(ValueError, match="exceeds"):
        _download_with_limits("https://example.com/archive.zip", target)

    assert not target.exists()

policyengine_us/tools/geography/download_50_state_census_block_data.py

Lines changed: 105 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import os
2+
import shutil
3+
import zipfile
4+
from pathlib import Path
5+
16
import pandas as pd
27
import requests
38
from tqdm import tqdm
4-
from pathlib import Path
5-
import os
6-
import zipfile
7-
import shutil
89

910
STATE_NAMES = [
1011
"Alabama",
@@ -112,31 +113,103 @@
112113
]
113114
STATE_CODES = [x.lower() for x in STATE_CODES]
114115
# Default working directory for downloads, extraction, and the final CSV.
DATA_FOLDER = Path("data")
115-
DATA_FOLDER.mkdir(exist_ok=True)
116-
117-
dfs = []
118-
for state_name, state_code in tqdm(
119-
zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data"
120-
):
121-
data_url = f"https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
122-
# Download the file and save to a folder called "block_level_population_data_by_state/"
123-
r = requests.get(data_url)
124-
with open(DATA_FOLDER / f"{state_code}2020.pl.zip", "wb") as f:
125-
f.write(r.content)
126-
# Unzip the file
127-
with zipfile.ZipFile(DATA_FOLDER / f"{state_code}2020.pl.zip", "r") as zip_ref:
128-
zip_ref.extractall(DATA_FOLDER / f"{state_code}2020.pl")
129-
# Delete the zip file
130-
os.remove(DATA_FOLDER / f"{state_code}2020.pl.zip")
131-
# Read the file
132-
df = pd.read_csv(
133-
DATA_FOLDER / f"{state_code}2020.pl/{state_code}geo2020.pl",
134-
sep="|",
135-
low_memory=False,
136-
encoding="ISO-8859-1",
137-
)
138-
df["state"] = state_code
139-
dfs += [df]
140-
full_df = pd.concat(dfs)
141-
full_df.to_csv(DATA_FOLDER / "50_state_block_data.csv", index=False)
142-
shutil.rmtree(DATA_FOLDER / f"{state_code}2020.pl")
116+
# Limits applied when fetching and unpacking Census block archives.
_MIB = 1024 * 1024

DOWNLOAD_TIMEOUT_SECONDS = 60  # per-request timeout for the Census server
DOWNLOAD_CHUNK_SIZE = _MIB  # stream the response 1 MiB at a time
MAX_ARCHIVE_DOWNLOAD_BYTES = 100 * _MIB  # refuse downloads above 100 MiB
MAX_ARCHIVE_MEMBER_COUNT = 64  # refuse archives with too many entries
MAX_ARCHIVE_UNCOMPRESSED_BYTES = 512 * _MIB  # guard against zip-bomb expansion
121+
122+
123+
def _download_with_limits(url: str, destination: Path) -> None:
    """Stream *url* to *destination*, enforcing the download size cap.

    Raises ValueError once more than MAX_ARCHIVE_DOWNLOAD_BYTES have arrived;
    on any failure the partially written file is removed before re-raising.
    """
    target = Path(destination)
    received = 0

    try:
        with requests.get(
            url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()
            with target.open("wb") as output:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    if chunk:
                        received += len(chunk)
                        # Check before writing so the cap is never exceeded on disk.
                        if received > MAX_ARCHIVE_DOWNLOAD_BYTES:
                            raise ValueError(
                                f"Downloaded archive exceeds {MAX_ARCHIVE_DOWNLOAD_BYTES} bytes"
                            )
                        output.write(chunk)
    except Exception:
        # Never leave a truncated or oversized partial download behind.
        target.unlink(missing_ok=True)
        raise
144+
145+
146+
def _safe_extract(zip_ref: zipfile.ZipFile, destination: Path) -> None:
    """Extract an archive only if every member stays within the destination.

    Rejects archives with too many entries, archives whose declared
    uncompressed size exceeds the cap, and members whose resolved path would
    escape *destination* (zip-slip).
    """
    root = Path(destination).resolve()
    root.mkdir(parents=True, exist_ok=True)

    members = zip_ref.infolist()
    if len(members) > MAX_ARCHIVE_MEMBER_COUNT:
        raise ValueError(
            f"Archive contains {len(members)} files, exceeding the limit of "
            f"{MAX_ARCHIVE_MEMBER_COUNT}"
        )

    running_total = 0
    for member in members:
        # Accumulate declared sizes; fail as soon as the cap is crossed.
        running_total += member.file_size
        if running_total > MAX_ARCHIVE_UNCOMPRESSED_BYTES:
            raise ValueError(
                "Archive exceeds the allowed uncompressed size limit of "
                f"{MAX_ARCHIVE_UNCOMPRESSED_BYTES} bytes"
            )
        # A safe member resolves to the destination itself or strictly below it.
        resolved = (root / member.filename).resolve()
        if resolved != root and root not in resolved.parents:
            raise ValueError(f"Unsafe path in zip archive: {member.filename}")

    zip_ref.extractall(root)
170+
171+
172+
def download_state_block_data(data_folder: Path = DATA_FOLDER) -> pd.DataFrame:
    """Download, validate, and combine 2020 Census PL 94-171 block data.

    For each state, the redistricting archive is downloaded with size limits
    (:func:`_download_with_limits`), safely extracted
    (:func:`_safe_extract`), and its ``*geo2020.pl`` header file read. The
    combined frame is written to ``data_folder / "50_state_block_data.csv"``.

    Args:
        data_folder: Directory used for downloads, extraction, and the final
            CSV. Created if it does not exist.

    Returns:
        The concatenated DataFrame of every state's geo file, with a
        ``state`` column holding the lower-case state code.

    Raises:
        ValueError: If a download or archive exceeds the configured limits or
            contains unsafe paths.
    """
    data_folder = Path(data_folder)
    data_folder.mkdir(parents=True, exist_ok=True)

    dfs = []
    for state_name, state_code in tqdm(
        zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data"
    ):
        data_url = (
            "https://www2.census.gov/programs-surveys/decennial/2020/data/"
            f"01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
        )
        zip_path = data_folder / f"{state_code}2020.pl.zip"
        extract_dir = data_folder / f"{state_code}2020.pl"

        _download_with_limits(data_url, zip_path)

        try:
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                _safe_extract(zip_ref, extract_dir)
        finally:
            # The archive is no longer needed once extraction has been
            # attempted; remove it even when validation fails, so a failed
            # run does not leave the zip on disk.
            zip_path.unlink(missing_ok=True)

        try:
            df = pd.read_csv(
                extract_dir / f"{state_code}geo2020.pl",
                sep="|",
                low_memory=False,
                encoding="ISO-8859-1",
            )
        finally:
            # BUG FIX: the rmtree previously sat outside the loop, so only
            # the final state's extraction directory was removed (and an
            # empty state list would raise NameError). Clean up each state's
            # directory as soon as its data has been read.
            shutil.rmtree(extract_dir, ignore_errors=True)

        df["state"] = state_code
        dfs.append(df)

    full_df = pd.concat(dfs)
    full_df.to_csv(data_folder / "50_state_block_data.csv", index=False)

    return full_df
208+
209+
210+
def main() -> None:
    """Entry point: build the 50-state block dataset in the default folder."""
    download_state_block_data(data_folder=DATA_FOLDER)
212+
213+
214+
# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)