Use context managers for CPS raw stores

MaxGhenis · MaxGhenis · commit 034a03624d40 · 2026-04-06T22:36:58.000-04:00
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -53,13 +53,10 @@ def generate(self):
         cps = {}
 
         ENTITIES = ("person", "tax_unit", "family", "spm_unit", "household")
-        raw_data = self.raw_cps(require=True).load()
-        try:
+        with self.raw_cps(require=True).load() as raw_data:
             person, tax_unit, family, spm_unit, household = [
                 raw_data[entity] for entity in ENTITIES
             ]
-        finally:
-            raw_data.close()
 
         logging.info("Adding ID variables")
         add_id_variables(cps, person, tax_unit, family, spm_unit, household)
@@ -807,14 +804,10 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
         )
         return
 
-    cps_current_year_data = self.raw_cps(require=True).load()
-    try:
-        cps_previous_year_data = self.previous_year_raw_cps(require=True).load()
-    except Exception:
-        cps_current_year_data.close()
-        raise
-
-    try:
+    with (
+        self.raw_cps(require=True).load() as cps_current_year_data,
+        self.previous_year_raw_cps(require=True).load() as cps_previous_year_data,
+    ):
         cps_previous_year = cps_previous_year_data.person.set_index(
             cps_previous_year_data.person.PERIDNUM
         )
@@ -846,9 +839,6 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
                 "I_SEVAL",
             ]
         ]
-    finally:
-        cps_previous_year_data.close()
-        cps_current_year_data.close()
     joined_data["previous_year_income_available"] = (
         ~joined_data.employment_income_last_year.isna()
         & ~joined_data.self_employment_income_last_year.isna()
@@ -1775,12 +1765,9 @@ def add_tips(self, cps: h5py.File):
     # Get is_married from raw CPS data (A_MARITL codes: 1,2 = married)
     # Note: is_married in policyengine-us is Family-level, but we need
     # person-level for imputation models
-    raw_data = self.raw_cps(require=True).load()
-    try:
+    with self.raw_cps(require=True).load() as raw_data:
         raw_person = raw_data["person"]
         cps["is_married"] = raw_person.A_MARITL.isin([1, 2]).values
-    finally:
-        raw_data.close()
 
     cps["is_under_18"] = cps.age < 18
     cps["is_under_6"] = cps.age < 6
@@ -1940,51 +1927,50 @@ def add_auto_loan_interest_and_net_worth(self, cps: h5py.File) -> None:
     cps_data = self.load_dataset()
 
     # Access raw CPS for additional variables
-    raw_data_instance = self.raw_cps(require=True)
-    raw_data = raw_data_instance.load()
-    person_data = raw_data.person
-
-    # Preprocess the CPS for imputation
-    lengths = {k: len(v) for k, v in cps_data.items()}
-    var_len = cps_data["person_household_id"].shape[0]
-    vars_of_interest = [name for name, ln in lengths.items() if ln == var_len]
-    agg_data = pd.DataFrame({n: cps_data[n] for n in vars_of_interest})
-    agg_data["interest_dividend_income"] = np.sum(
-        [
-            agg_data["taxable_interest_income"],
-            agg_data["tax_exempt_interest_income"],
-            agg_data["qualified_dividend_income"],
-            agg_data["non_qualified_dividend_income"],
-        ],
-        axis=0,
-    )
-    agg_data["social_security_pension_income"] = np.sum(
-        [
-            agg_data["tax_exempt_private_pension_income"],
-            agg_data["taxable_private_pension_income"],
-            agg_data["social_security_retirement"],
-        ],
-        axis=0,
-    )
-
-    agg = (
-        agg_data.groupby("person_household_id")[
+    with self.raw_cps(require=True).load() as raw_data:
+        person_data = raw_data.person
+
+        # Preprocess the CPS for imputation
+        lengths = {k: len(v) for k, v in cps_data.items()}
+        var_len = cps_data["person_household_id"].shape[0]
+        vars_of_interest = [name for name, ln in lengths.items() if ln == var_len]
+        agg_data = pd.DataFrame({n: cps_data[n] for n in vars_of_interest})
+        agg_data["interest_dividend_income"] = np.sum(
             [
-                "employment_income",
-                "interest_dividend_income",
-                "social_security_pension_income",
+                agg_data["taxable_interest_income"],
+                agg_data["tax_exempt_interest_income"],
+                agg_data["qualified_dividend_income"],
+                agg_data["non_qualified_dividend_income"],
+            ],
+            axis=0,
+        )
+        agg_data["social_security_pension_income"] = np.sum(
+            [
+                agg_data["tax_exempt_private_pension_income"],
+                agg_data["taxable_private_pension_income"],
+                agg_data["social_security_retirement"],
+            ],
+            axis=0,
+        )
+
+        agg = (
+            agg_data.groupby("person_household_id")[
+                [
+                    "employment_income",
+                    "interest_dividend_income",
+                    "social_security_pension_income",
+                ]
             ]
-        ]
-        .sum()
-        .rename(
-            columns={
-                "employment_income": "household_employment_income",
-                "interest_dividend_income": "household_interest_dividend_income",
-                "social_security_pension_income": "household_social_security_pension_income",
-            }
+            .sum()
+            .rename(
+                columns={
+                    "employment_income": "household_employment_income",
+                    "interest_dividend_income": "household_interest_dividend_income",
+                    "social_security_pension_income": "household_social_security_pension_income",
+                }
+            )
+            .reset_index()
         )
-        .reset_index()
-    )
 
     def create_scf_reference_person_mask(cps_data, raw_person_data):
         """
@@ -2165,7 +2151,6 @@
     # Add is_married variable for household heads based on raw person data
     reference_persons = person_data[mask]
     receiver_data["is_married"] = reference_persons.A_MARITL.isin([1, 2]).values
-    raw_data.close()
 
     # Impute auto loan balance from the SCF
     from policyengine_us_data.datasets.scf.scf import SCF_2022
diff --git a/tests/unit/datasets/test_cps_file_handles.py b/tests/unit/datasets/test_cps_file_handles.py
@@ -11,6 +11,13 @@ def __init__(self, person: pd.DataFrame):
         self.person = person
         self.closed = False
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+        return False
+
     def close(self):
         self.closed = True