|
1 | 1 | import numpy as np |
2 | 2 | import pandas as pd |
| 3 | +from concurrent.futures import ThreadPoolExecutor |
| 4 | +import time |
3 | 5 |
|
4 | 6 | from policyengine_us_data.datasets.cps import cps as cps_module |
5 | 7 | from policyengine_us_data.datasets.org import ( |
|
10 | 12 | CPS_BASIC_MONTHLY_ORG_COLUMNS, |
11 | 13 | _build_union_priority_weights, |
12 | 14 | _load_cps_basic_org_month, |
| 15 | + load_org_training_data, |
13 | 16 | _predict_union_coverage_from_bls_tables, |
14 | 17 | _select_cps_basic_org_columns, |
15 | 18 | _transform_cps_basic_org_month, |
@@ -154,42 +157,199 @@ def test_load_cps_basic_org_month_retries_after_transient_parser_failure( |
154 | 157 | monkeypatch, |
155 | 158 | ): |
156 | 159 | calls = [] |
157 | | - month_df = pd.DataFrame( |
158 | | - { |
159 | | - "hrmis": [4], |
160 | | - "GESTFIPS": [6], |
161 | | - "PRTAGE": [30], |
162 | | - "PESEX": [2], |
163 | | - "PTDTRACE": [1], |
164 | | - "PEHSPNON": [2], |
165 | | - "PWORWGT": [100.0], |
166 | | - "PTERNWA": [100000.0], |
167 | | - "PTERNHLY": [2500.0], |
168 | | - "PEERNHRY": [1], |
169 | | - "PEHRUSLT": [40.0], |
170 | | - "PRERELG": [1], |
171 | | - "PEMLR": [1], |
172 | | - "PEIO1COW": [1], |
173 | | - } |
| 160 | + csv_text = ( |
| 161 | + "hrmis,GESTFIPS,PRTAGE,PESEX,PTDTRACE,PEHSPNON,PWORWGT," |
| 162 | + "PTERNWA,PTERNHLY,PEERNHRY,PEHRUSLT,PRERELG,PEMLR,PEIO1COW\n" |
| 163 | + "4,6,30,2,1,2,100.0,100000.0,2500.0,1,40.0,1,1,1\n" |
174 | 164 | ) |
175 | 165 |
|
176 | | - def fake_read_csv(*args, **kwargs): |
| 166 | + class FakeResponse: |
| 167 | + def __init__(self, text: str, status_code: int = 200): |
| 168 | + self.content = text.encode("utf-8") |
| 169 | + self.status_code = status_code |
| 170 | + |
| 171 | + def raise_for_status(self): |
| 172 | + if self.status_code >= 400: |
| 173 | + raise ValueError("bad status") |
| 174 | + |
| 175 | + responses = [ |
| 176 | + FakeResponse("<html>temporary error</html>"), |
| 177 | + FakeResponse(csv_text), |
| 178 | + ] |
| 179 | + |
| 180 | + def fake_get(*args, **kwargs): |
177 | 181 | calls.append(kwargs) |
178 | | - if len(calls) == 1: |
179 | | - raise ValueError("Usecols do not match columns") |
180 | | - return month_df |
| 182 | + return responses.pop(0) |
181 | 183 |
|
182 | | - monkeypatch.setattr( |
183 | | - "policyengine_us_data.datasets.org.org.pd.read_csv", fake_read_csv |
184 | | - ) |
| 184 | + monkeypatch.setattr("policyengine_us_data.datasets.org.org.requests.get", fake_get) |
185 | 185 |
|
186 | 186 | loaded = _load_cps_basic_org_month(2024, "may", max_attempts=2) |
187 | 187 |
|
188 | 188 | assert len(calls) == 2 |
189 | | - assert callable(calls[0]["usecols"]) |
190 | 189 | assert loaded.columns.tolist() == CPS_BASIC_MONTHLY_ORG_COLUMNS |
191 | 190 |
|
192 | 191 |
|
def test_load_cps_basic_org_month_reorders_file_order_columns(monkeypatch):
    """Check that a month file with shuffled, mixed-case headers is normalized.

    ``requests.get`` in the ORG module is replaced with a stub that serves a
    one-row CSV whose header order and letter casing differ from the names
    asserted below.  The loaded frame must expose exactly
    ``CPS_BASIC_MONTHLY_ORG_COLUMNS`` and keep every value attached to the
    right column.
    """
    header = (
        "PTERNWA,PEHRUSLT,hrmis,PEMLR,PEERNHRY,PEHSPNON,PRTAGE,"
        "PTDTRACE,pworwgt,peio1cow,GESTFIPS,PESEX,PTERNHLY,PRERELG\n"
    )
    row = "100000.0,40.0,4,1,1,2,30,1,100.0,1,6,2,2500.0,1\n"
    payload = header + row

    class StubResponse:
        """Minimal stand-in exposing only ``content`` and ``raise_for_status``."""

        def __init__(self, text: str):
            self.content = text.encode("utf-8")

        def raise_for_status(self):
            return None

    def stub_get(*_args, **_kwargs):
        # Every request receives a fresh response carrying the same payload.
        return StubResponse(payload)

    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org.requests.get",
        stub_get,
    )

    loaded = _load_cps_basic_org_month(2024, "may", max_attempts=1)

    expected_row = {
        "HRMIS": 4,
        "gestfips": 6,
        "prtage": 30,
        "pesex": 2,
        "ptdtrace": 1,
        "pehspnon": 2,
        "pworwgt": 100.0,
        "pternwa": 100000.0,
        "pternhly": 2500.0,
        "peernhry": 1,
        "pehruslt": 40.0,
        "prerelg": 1,
        "pemlr": 1,
        "peio1cow": 1,
    }
    first_row = loaded.iloc[0].to_dict()

    assert loaded.columns.tolist() == CPS_BASIC_MONTHLY_ORG_COLUMNS
    assert first_row == expected_row
| 230 | + |
| 231 | + |
def test_load_org_training_data_serializes_first_cache_build(monkeypatch, tmp_path):
    """Two concurrent first calls must invoke the month loader only once.

    ``STORAGE_FOLDER`` in the ORG module is pointed at ``tmp_path`` and
    ``ORG_MONTHS`` is reduced to a single month, so one build corresponds to
    exactly one call of the patched ``_load_cps_basic_org_month``.  Two worker
    threads then call ``load_org_training_data`` at the same time; the test
    passes when the loader ran once and both callers received equal frames.
    """
    raw_month = pd.DataFrame(
        {
            "HRMIS": [4],
            "gestfips": [6],
            "prtage": [30],
            "pesex": [2],
            "ptdtrace": [1],
            "pehspnon": [2],
            "pworwgt": [100.0],
            "pternwa": [100000.0],
            "pternhly": [2500.0],
            "peernhry": [1],
            "pehruslt": [40.0],
            "prerelg": [1],
            "pemlr": [1],
            "peio1cow": [1],
        }
    )
    # Record calls with list.append, which is atomic in CPython.  A
    # ``counter[key] += 1`` is a read-modify-write that can lose an update when
    # two threads overlap inside the loader -- precisely the situation this
    # test exists to detect -- and would let a regression pass unnoticed.
    month_calls = []

    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org.STORAGE_FOLDER", tmp_path
    )
    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org.ORG_MONTHS",
        ("may",),
    )

    def fake_load_month(year, month):
        month_calls.append((year, month))
        # Keep the build in flight long enough for the second caller to arrive
        # while the first one is still working.
        time.sleep(0.2)
        return raw_month.copy()

    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org._load_cps_basic_org_month",
        fake_load_month,
    )

    # Start from an empty in-memory cache and leave it empty for later tests.
    load_org_training_data.cache_clear()
    try:
        with ThreadPoolExecutor(max_workers=2) as executor:
            left = executor.submit(load_org_training_data)
            right = executor.submit(load_org_training_data)
            left_result = left.result()
            right_result = right.result()
    finally:
        load_org_training_data.cache_clear()

    assert len(month_calls) == 1
    pd.testing.assert_frame_equal(left_result, right_result)
| 283 | + |
| 284 | + |
def test_load_org_training_data_rebuilds_invalid_cached_file(monkeypatch, tmp_path):
    """A header-only cache file must be discarded and rebuilt from month data.

    Before the call, a gzip CSV containing two column names and no rows is
    written to ``tmp_path`` (presumably the path ``load_org_training_data``
    reads once ``STORAGE_FOLDER`` is redirected there -- confirm against the
    loader).  The test then checks that the patched month loader ran exactly
    once and that the returned frame is non-empty and carries the expected
    training columns.
    """
    record = {
        "HRMIS": 4,
        "gestfips": 6,
        "prtage": 30,
        "pesex": 2,
        "ptdtrace": 1,
        "pehspnon": 2,
        "pworwgt": 100.0,
        "pternwa": 100000.0,
        "pternhly": 2500.0,
        "peernhry": 1,
        "pehruslt": 40.0,
        "prerelg": 1,
        "pemlr": 1,
        "peio1cow": 1,
    }
    raw_month = pd.DataFrame({name: [value] for name, value in record.items()})

    # Seed the storage folder with a cache file that has headers but no rows.
    cache_path = tmp_path / "census_cps_org_2024_wages.csv.gz"
    stale_cache = pd.DataFrame(columns=["employment_income", "weekly_hours_worked"])
    stale_cache.to_csv(cache_path, index=False, compression="gzip")

    loader_calls = []

    def fake_load_month(year, month):
        loader_calls.append((year, month))
        return raw_month.copy()

    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org.STORAGE_FOLDER", tmp_path
    )
    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org.ORG_MONTHS",
        ("may",),
    )
    monkeypatch.setattr(
        "policyengine_us_data.datasets.org.org._load_cps_basic_org_month",
        fake_load_month,
    )

    # Clear the in-memory cache before and after so other tests are unaffected.
    load_org_training_data.cache_clear()
    try:
        rebuilt = load_org_training_data()
    finally:
        load_org_training_data.cache_clear()

    required_columns = {
        "employment_income",
        "weekly_hours_worked",
        "age",
        "is_female",
        "is_hispanic",
        "race_wbho",
        "state_fips",
        "hourly_wage",
        "is_paid_hourly",
        "sample_weight",
    }

    assert len(loader_calls) == 1
    assert not rebuilt.empty
    assert required_columns.issubset(rebuilt.columns)
| 351 | + |
| 352 | + |
193 | 353 | def test_build_union_priority_weights_reflect_bls_demographics(): |
194 | 354 | receiver = pd.DataFrame( |
195 | 355 | { |
|
0 commit comments