|
| 1 | +import os |
| 2 | +import shutil |
| 3 | +import zipfile |
| 4 | +from pathlib import Path |
| 5 | + |
1 | 6 | import pandas as pd |
2 | 7 | import requests |
3 | 8 | from tqdm import tqdm |
4 | | -from pathlib import Path |
5 | | -import os |
6 | | -import zipfile |
7 | | -import shutil |
8 | 9 |
|
9 | 10 | STATE_NAMES = [ |
10 | 11 | "Alabama", |
|
112 | 113 | ] |
113 | 114 | STATE_CODES = [x.lower() for x in STATE_CODES] |
114 | 115 | DATA_FOLDER = Path("data") |
115 | | -DATA_FOLDER.mkdir(exist_ok=True) |
116 | | - |
117 | | -dfs = [] |
118 | | -for state_name, state_code in tqdm( |
119 | | - zip(STATE_NAMES, STATE_CODES), desc="Downloading Census data" |
120 | | -): |
121 | | - data_url = f"https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip" |
122 | | - # Download the file and save to a folder called "block_level_population_data_by_state/" |
123 | | - r = requests.get(data_url) |
124 | | - with open(DATA_FOLDER / f"{state_code}2020.pl.zip", "wb") as f: |
125 | | - f.write(r.content) |
126 | | - # Unzip the file |
127 | | - with zipfile.ZipFile(DATA_FOLDER / f"{state_code}2020.pl.zip", "r") as zip_ref: |
128 | | - zip_ref.extractall(DATA_FOLDER / f"{state_code}2020.pl") |
129 | | - # Delete the zip file |
130 | | - os.remove(DATA_FOLDER / f"{state_code}2020.pl.zip") |
131 | | - # Read the file |
132 | | - df = pd.read_csv( |
133 | | - DATA_FOLDER / f"{state_code}2020.pl/{state_code}geo2020.pl", |
134 | | - sep="|", |
135 | | - low_memory=False, |
136 | | - encoding="ISO-8859-1", |
137 | | - ) |
138 | | - df["state"] = state_code |
139 | | - dfs += [df] |
140 | | - full_df = pd.concat(dfs) |
141 | | - full_df.to_csv(DATA_FOLDER / "50_state_block_data.csv", index=False) |
142 | | - shutil.rmtree(DATA_FOLDER / f"{state_code}2020.pl") |
| 116 | +DOWNLOAD_TIMEOUT_SECONDS = 60 |
| 117 | +DOWNLOAD_CHUNK_SIZE = 1024 * 1024 |
| 118 | +MAX_ARCHIVE_DOWNLOAD_BYTES = 100 * 1024 * 1024 |
| 119 | +MAX_ARCHIVE_MEMBER_COUNT = 64 |
| 120 | +MAX_ARCHIVE_UNCOMPRESSED_BYTES = 512 * 1024 * 1024 |
| 121 | + |
| 122 | + |
def _download_with_limits(url: str, destination: Path) -> None:
    """Stream *url* to *destination* while enforcing a hard size cap.

    The response is fetched in chunks so the whole archive is never held in
    memory. If the payload grows past ``MAX_ARCHIVE_DOWNLOAD_BYTES`` a
    ``ValueError`` is raised; on any failure the partially written file is
    deleted before the exception propagates.
    """
    target = Path(destination)
    written = 0
    try:
        with requests.get(
            url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()
            with target.open("wb") as out:
                for block in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    # Keep-alive heartbeats arrive as empty chunks; skip them.
                    if not block:
                        continue
                    written += len(block)
                    if written > MAX_ARCHIVE_DOWNLOAD_BYTES:
                        raise ValueError(
                            f"Downloaded archive exceeds {MAX_ARCHIVE_DOWNLOAD_BYTES} bytes"
                        )
                    out.write(block)
    except Exception:
        # Never leave a truncated/oversized partial file behind.
        target.unlink(missing_ok=True)
        raise
| 144 | + |
| 145 | + |
def _safe_extract(zip_ref: zipfile.ZipFile, destination: Path) -> None:
    """Extract an archive only if every member stays within the destination.

    Guards against three abuses before touching the filesystem contents:
    too many members, an excessive total uncompressed size (zip bomb), and
    path-traversal member names that would escape *destination*.
    Raises ``ValueError`` when any guard trips.
    """
    root = Path(destination).resolve()
    root.mkdir(parents=True, exist_ok=True)

    entries = zip_ref.infolist()
    if len(entries) > MAX_ARCHIVE_MEMBER_COUNT:
        raise ValueError(
            f"Archive contains {len(entries)} files, exceeding the limit of "
            f"{MAX_ARCHIVE_MEMBER_COUNT}"
        )

    running_size = 0
    for entry in entries:
        running_size += entry.file_size
        if running_size > MAX_ARCHIVE_UNCOMPRESSED_BYTES:
            raise ValueError(
                "Archive exceeds the allowed uncompressed size limit of "
                f"{MAX_ARCHIVE_UNCOMPRESSED_BYTES} bytes"
            )
        # Resolve the would-be extraction path and require that it is the
        # destination itself or lies somewhere beneath it.
        resolved = (root / entry.filename).resolve()
        inside = resolved == root or root in resolved.parents
        if not inside:
            raise ValueError(f"Unsafe path in zip archive: {entry.filename}")

    # All members validated; perform the actual extraction in one call.
    zip_ref.extractall(root)
| 170 | + |
| 171 | + |
def download_state_block_data(data_folder: Path = DATA_FOLDER) -> pd.DataFrame:
    """Download, extract, and combine 2020 Census PL 94-171 block data.

    For each state, the zipped redistricting file is downloaded (with size
    limits), safely extracted, and its pipe-delimited geo file read into a
    DataFrame tagged with the state's code. All states are concatenated and
    written to ``50_state_block_data.csv`` inside *data_folder*.

    Args:
        data_folder: Directory used for downloads, temporary extraction
            directories, and the combined CSV output.

    Returns:
        The concatenated DataFrame covering every state.
    """
    data_folder = Path(data_folder)
    data_folder.mkdir(parents=True, exist_ok=True)

    dfs = []
    # total= lets tqdm render a real progress bar; a bare zip() has no length.
    for state_name, state_code in tqdm(
        zip(STATE_NAMES, STATE_CODES),
        total=min(len(STATE_NAMES), len(STATE_CODES)),
        desc="Downloading Census data",
    ):
        data_url = (
            "https://www2.census.gov/programs-surveys/decennial/2020/data/"
            f"01-Redistricting_File--PL_94-171/{state_name}/{state_code}2020.pl.zip"
        )
        zip_path = data_folder / f"{state_code}2020.pl.zip"
        extract_dir = data_folder / f"{state_code}2020.pl"

        _download_with_limits(data_url, zip_path)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            _safe_extract(zip_ref, extract_dir)

        # The zip is no longer needed once extracted.
        os.remove(zip_path)

        df = pd.read_csv(
            extract_dir / f"{state_code}geo2020.pl",
            sep="|",
            low_memory=False,
            encoding="ISO-8859-1",
        )
        df["state"] = state_code
        dfs.append(df)

        # BUG FIX: remove each state's extraction directory inside the loop.
        # Previously this ran once after the loop, so only the final state's
        # directory was cleaned up and the other 49 were left on disk.
        shutil.rmtree(extract_dir)

    full_df = pd.concat(dfs)
    full_df.to_csv(data_folder / "50_state_block_data.csv", index=False)

    return full_df
| 208 | + |
| 209 | + |
def main() -> None:
    """Script entry point: fetch and combine all state block files."""
    download_state_block_data(data_folder=DATA_FOLDER)
| 212 | + |
| 213 | + |
# Run the full download pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
0 commit comments