77from pathlib import Path
88import shutil
99
10+ from taxonopy .constants import TAXONOMIC_RANKS_BY_SPECIFICITY , INVALID_VALUES , TAXONOMIC_RANKS
11+
12+ # Module-level constant for join columns to avoid duplication
13+ PARENT_RANKS = TAXONOMIC_RANKS [:- 1 ]
14+
1015def download_and_extract_backbone (cache_dir : Path ):
1116 """Download and extract the GBIF backbone taxonomy files."""
1217 cache_dir .mkdir (parents = True , exist_ok = True )
@@ -110,7 +115,7 @@ def _normalize_one_column(col: str) -> pl.Expr:
110115 # Cast to string
111116 casted = pl .col (col ).cast (pl .Utf8 )
112117 # Turn any value whose lower-cased string form is in INVALID_VALUES into None
113- cleaned = casted .map_elements (lambda x : None if x == "" else x , return_dtype = pl .Utf8 )
118+ cleaned = casted .map_elements (lambda x : None if str ( x ). lower () in INVALID_VALUES else x , return_dtype = pl .Utf8 )
114119 # Give it back its original name
115120 return cleaned .alias (col )
116121
@@ -130,10 +135,6 @@ def normalize_taxonomic_columns(df: pl.DataFrame) -> pl.DataFrame:
130135 # Apply them all at once
131136 return df .with_columns (exprs )
132137
133- # Module-level constant for join columns to avoid duplication
134- TAXONOMIC_HIERARCHY = ['kingdom' , 'phylum' , 'class' , 'order' , 'family' , 'genus' ]
135-
136-
137138def join_single_rank (anno_df : pl .DataFrame , taxon_df : pl .DataFrame , rank : str ) -> pl .DataFrame :
138139 """
139140 Join annotation dataframe with taxon dataframe for a single taxonomic rank.
@@ -147,18 +148,25 @@ def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) -
147148 return anno_df
148149
149150 # Figure out which higher-rank cols we actually have in the anno_df
150- join_cols = [col for col in TAXONOMIC_HIERARCHY if col in anno_df .columns ]
151-
152- # Select, rename, *and* drop duplicate backbone rows on the full key
151+ join_cols = [c for c in PARENT_RANKS
152+ if c in anno_df .columns and c != rank ]
153+
154+ # Select, rename, and drop duplicate backbone rows on the full key
155+ # - if the taxon_df actually has a taxonRank column, filter by it;
156+ # - otherwise just use the whole table
157+ if "taxonRank" in taxon_df .columns :
158+ candidate = taxon_df .filter (pl .col ("taxonRank" ) == rank )
159+ else :
160+ candidate = taxon_df
161+
153162 backbone_subset = (
154- taxon_df
155- .select ([
156- ' canonicalName' ,
157- pl .col (' taxonID' ).alias (f' taxonID_{ rank } ' ),
163+ candidate
164+ .select (
165+ pl . col ( " canonicalName" ) ,
166+ pl .col (" taxonID" ).alias (f" taxonID_{ rank } " ),
158167 * join_cols
159- ])
160- # ensure (canonicalName + all join_cols) is unique
161- .unique (subset = ['canonicalName' ] + join_cols )
168+ )
169+ .unique (subset = ["canonicalName" ] + join_cols )
162170 )
163171
164172 result = anno_df .join (
@@ -263,8 +271,9 @@ def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup:
263271 :param common_lookup: Common name lookup table with (taxonID, common_name) columns
264272 :return: DataFrame with common_name column populated using hierarchical fallback
265273 """
266- rank_columns = ['species' , 'genus' , 'family' , 'order' , 'class' , 'phylum' , 'kingdom' ]
267-
274+ # Define hierarchical order of taxonomic ranks (map class_ to class)
275+ rank_columns = [r .rstrip ('_' ) for r in TAXONOMIC_RANKS_BY_SPECIFICITY ]
276+
268277 # Initialize common_name column
269278 result_df = anno_df .with_columns (pl .lit (None ).cast (pl .Utf8 ).alias ("common_name" ))
270279
@@ -340,9 +349,9 @@ def merge_common_name(anno_df, common_name_df, taxon_df):
340349 .agg (pl .col ("vernacularName" ).first ().alias ("common_name" ))
341350 )
342351
343- # Define hierarchical order of taxonomic ranks
344- rank_columns = ['species' , 'genus' , 'family' , 'order' , 'class' , 'phylum' , 'kingdom' ]
345-
352+ # Define hierarchical order of taxonomic ranks (map class_ to class)
353+ rank_columns = [r . rstrip ( '_' ) for r in TAXONOMIC_RANKS_BY_SPECIFICITY ]
354+
346355 # Find which taxonomic classification columns we have
347356 available_rank_cols = [rank for rank in rank_columns
348357 if rank in new_anno_df .columns ]
0 commit comments