Skip to content

Commit 4e6ec07

Browse files
committed
DRY up code; sync tests
1 parent fe0ccd8 commit 4e6ec07

1 file changed

Lines changed: 29 additions & 20 deletions

File tree

src/taxonopy/resolve_common_names.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
from pathlib import Path
88
import shutil
99

10+
from taxonopy.constants import TAXONOMIC_RANKS_BY_SPECIFICITY, INVALID_VALUES, TAXONOMIC_RANKS
11+
12+
# Module-level constant for join columns (every rank in TAXONOMIC_RANKS except the last), defined once to avoid duplication
13+
PARENT_RANKS = TAXONOMIC_RANKS[:-1]
14+
1015
def download_and_extract_backbone(cache_dir: Path):
1116
"""Download and extract the GBIF backbone taxonomy files."""
1217
cache_dir.mkdir(parents=True, exist_ok=True)
@@ -110,7 +115,7 @@ def _normalize_one_column(col: str) -> pl.Expr:
110115
# Cast to string
111116
casted = pl.col(col).cast(pl.Utf8)
112117
# Turn "" into None
113-
cleaned = casted.map_elements(lambda x: None if x == "" else x, return_dtype=pl.Utf8)
118+
cleaned = casted.map_elements(lambda x: None if str(x).lower() in INVALID_VALUES else x, return_dtype=pl.Utf8)
114119
# Give it back its original name
115120
return cleaned.alias(col)
116121

@@ -130,10 +135,6 @@ def normalize_taxonomic_columns(df: pl.DataFrame) -> pl.DataFrame:
130135
# Apply them all at once
131136
return df.with_columns(exprs)
132137

133-
# Module-level constant for join columns to avoid duplication
134-
TAXONOMIC_HIERARCHY = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus']
135-
136-
137138
def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) -> pl.DataFrame:
138139
"""
139140
Join annotation dataframe with taxon dataframe for a single taxonomic rank.
@@ -147,18 +148,25 @@ def join_single_rank(anno_df: pl.DataFrame, taxon_df: pl.DataFrame, rank: str) -
147148
return anno_df
148149

149150
# Figure out which higher-rank cols we actually have in the anno_df
150-
join_cols = [col for col in TAXONOMIC_HIERARCHY if col in anno_df.columns]
151-
152-
# Select, rename, *and* drop duplicate backbone rows on the full key
151+
join_cols = [c for c in PARENT_RANKS
152+
if c in anno_df.columns and c != rank]
153+
154+
# Select, rename, and drop duplicate backbone rows on the full key
155+
# - if the taxon_df actually has a taxonRank column, filter by it;
156+
# - otherwise just use the whole table
157+
if "taxonRank" in taxon_df.columns:
158+
candidate = taxon_df.filter(pl.col("taxonRank") == rank)
159+
else:
160+
candidate = taxon_df
161+
153162
backbone_subset = (
154-
taxon_df
155-
.select([
156-
'canonicalName',
157-
pl.col('taxonID').alias(f'taxonID_{rank}'),
163+
candidate
164+
.select(
165+
pl.col("canonicalName"),
166+
pl.col("taxonID").alias(f"taxonID_{rank}"),
158167
*join_cols
159-
])
160-
# ensure (canonicalName + all join_cols) is unique
161-
.unique(subset=['canonicalName'] + join_cols)
168+
)
169+
.unique(subset=["canonicalName"] + join_cols)
162170
)
163171

164172
result = anno_df.join(
@@ -263,8 +271,9 @@ def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup:
263271
:param common_lookup: Common name lookup table with (taxonID, common_name) columns
264272
:return: DataFrame with common_name column populated using hierarchical fallback
265273
"""
266-
rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom']
267-
274+
# Define hierarchical order of taxonomic ranks (map class_ to class)
275+
rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY]
276+
268277
# Initialize common_name column
269278
result_df = anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name"))
270279

@@ -340,9 +349,9 @@ def merge_common_name(anno_df, common_name_df, taxon_df):
340349
.agg(pl.col("vernacularName").first().alias("common_name"))
341350
)
342351

343-
# Define hierarchical order of taxonomic ranks
344-
rank_columns = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom']
345-
352+
# Define hierarchical order of taxonomic ranks (map class_ to class)
353+
rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY]
354+
346355
# Find which taxonomic classification columns we have
347356
available_rank_cols = [rank for rank in rank_columns
348357
if rank in new_anno_df.columns]

0 commit comments

Comments
 (0)