Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2a02952
adding reduce_dimensions outline
Graciaaa3 May 22, 2026
1bd1eda
clean up structure for umap, currently working
Graciaaa3 May 26, 2026
53dfc76
removing redundant reduction_results parameter
Graciaaa3 May 26, 2026
7ec74a2
update umap test to use reduce_dimensions
Graciaaa3 May 28, 2026
d8d4840
Revert "update umap test to use reduce_dimensions"
Graciaaa3 May 29, 2026
0debc50
update model path to be algorithm specific
Graciaaa3 May 29, 2026
7cda6c4
adding reduce_dimensions tests
Graciaaa3 May 29, 2026
51b4345
Merge branch 'main' into reduce_dimensions
Graciaaa3 May 29, 2026
0c2704c
adding pca implementation
Graciaaa3 May 29, 2026
2a281ca
updating reduce_dimensions tests
Graciaaa3 May 29, 2026
7d83119
update tests and load_model for pca to use correct attributes
Graciaaa3 Jun 2, 2026
c395411
adding tsne implementation
Graciaaa3 Jun 4, 2026
5227c62
factor helper methods to base class and update docstring
Graciaaa3 Jun 5, 2026
8a85644
factoring model validate helper function from load_model
Graciaaa3 Jun 5, 2026
2bd7f25
adding tsne test
Graciaaa3 Jun 5, 2026
0621c94
Merge branch 'main' into reduce_dimensions
Graciaaa3 Jun 18, 2026
9c07056
docstring fix for pca and tsne
Graciaaa3 Jun 18, 2026
522a042
deprecating old umap verb
Graciaaa3 Jun 18, 2026
a3f9704
moving 'parallel' config keyword to upper level
Graciaaa3 Jun 22, 2026
627ca1c
adding config migration 004 and tests
Graciaaa3 Jun 22, 2026
8ffcb8f
Merge branch 'main' into reduce_dimensions
Graciaaa3 Jun 22, 2026
3f36adc
format fix
Graciaaa3 Jun 22, 2026
84b8658
Merge branch 'main' into reduce_dimensions
Graciaaa3 Jun 23, 2026
dcce2f0
docstring and warning message fix
Graciaaa3 Jun 23, 2026
64342ab
comment and test fix
Graciaaa3 Jun 23, 2026
6dcacbb
Merge branch 'main' into reduce_dimensions
Graciaaa3 Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions src/hyrax/config_migrations/migrations/004_move_umap_to_reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Config migration: version 4 → version 5.

Move the legacy ``[umap]`` and ``[umap.UMAP]`` tables under the ``[reduce]`` table,
where they become ``[reduce.umap]`` and ``[reduce.umap.kwargs]`` respectively.
"""

import tomlkit
from tomlkit.toml_document import TOMLDocument

from hyrax.config_migrations.migration_utils import migration_step, move_key


@migration_step(
    from_version=4,
    key_renames={
        "umap.fit_sample_size": "reduce.umap.fit_sample_size",
        "umap.model_path": "reduce.umap.model_path",
        "umap.save_fit_umap": "reduce.save_fit_model",
        "umap.parallel": "reduce.parallel",
        "umap.UMAP": "reduce.umap.kwargs",
    },
)
def move_umap_to_reduce(cfg: TOMLDocument) -> TOMLDocument:
    """Relocate the legacy ``[umap]`` / ``[umap.UMAP]`` settings under ``[reduce]``.

    When the document has a non-empty ``[umap]`` table, its keys are moved to
    their ``[reduce]`` / ``[reduce.umap]`` counterparts, the old table is
    deleted, and ``[reduce.tsne]`` and ``[reduce.pca]`` tables are written with
    fixed values. A document without a (non-empty) ``[umap]`` table is returned
    untouched.

    Parameters
    ----------
    cfg : TOMLDocument
        The configuration document to migrate. It is modified in place.

    Returns
    -------
    TOMLDocument
        The same ``cfg`` object that was passed in.
    """
    legacy_umap = cfg.get("umap")
    if not legacy_umap:
        # Missing or empty [umap] table: there is nothing to migrate.
        return cfg

    # Make sure the destination tables [reduce] and [reduce.umap] are present.
    reduce_section = cfg.get("reduce")
    if reduce_section is None:
        reduce_section = tomlkit.table()
        cfg["reduce"] = reduce_section

    if reduce_section.get("umap") is None:
        reduce_section["umap"] = tomlkit.table()

    # Settings that stay umap-specific end up under [reduce.umap].
    for old_key, new_key in (
        ("umap.fit_sample_size", "reduce.umap.fit_sample_size"),
        ("umap.model_path", "reduce.umap.model_path"),
    ):
        move_key(cfg, old_key, new_key)

    # Settings shared by every reduction algorithm live directly under [reduce].
    reduce_section["batch_size"] = 1024
    for old_key, new_key in (
        ("umap.save_fit_umap", "reduce.save_fit_model"),
        ("umap.parallel", "reduce.parallel"),
    ):
        move_key(cfg, old_key, new_key)

    # Only the legacy implementation name "umap.UMAP" selects an algorithm here.
    if legacy_umap.get("name") == "umap.UMAP":
        reduce_section["algorithm"] = "umap"

    # The old [umap.UMAP] keyword arguments become [reduce.umap.kwargs].
    move_key(cfg, "umap.UMAP", "reduce.umap.kwargs")

    # Everything of interest has been moved; drop the old [umap] table.
    del cfg["umap"]

    # Write the [reduce.tsne] table and its kwargs.
    reduce_section["tsne"] = tomlkit.table()
    tsne_section = reduce_section["tsne"]
    tsne_section["kwargs"] = tomlkit.table()
    tsne_kwargs = tsne_section["kwargs"]
    tsne_kwargs["n_components"] = 2
    tsne_kwargs["perplexity"] = 30.0

    # Write the [reduce.pca] table and its kwargs.
    reduce_section["pca"] = tomlkit.table()
    pca_section = reduce_section["pca"]
    pca_section["fit_sample_size"] = 1024
    pca_section["model_path"] = False
    pca_section["kwargs"] = tomlkit.table()
    pca_section["kwargs"]["n_components"] = 2

    if len(reduce_section) > 0:
        cfg["reduce"] = reduce_section

    return cfg
55 changes: 44 additions & 11 deletions src/hyrax/hyrax_default_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -423,24 +423,29 @@ vector_size = 64
inference_dir = false


[umap]
# Number of data points used to fit the umap transform.
fit_sample_size = 1024
[reduce]
# Name of the reduction algorithm to use
algorithm = "umap"

# Save the fitted umap as a pickle file
save_fit_umap = true
# Save the fitted reducer model as a pickle file
save_fit_model = true

# Path to a pre-existing umap reducer model
model_path = false
# Number of data points to transform at once (per batch) with the reduction algorithm
batch_size = 1024

# Use multiprocessing during transforming to umap space (More memory intensive)
# Use multiprocessing when transforming with the reduction algorithm (more memory intensive)
parallel = false

# Name of the umap implementation to use
name = "umap.UMAP"

[reduce.umap]
# Number of data points used to fit the umap model.
fit_sample_size = 1024

# Path to a pre-existing umap reducer model
model_path = false


[umap.UMAP]
[reduce.umap.kwargs]
# Specify any parameter accepted by https://umap-learn.readthedocs.io/en/latest/api.html#umap
# Dimension of the embedded space
n_components = 2
Expand All @@ -450,6 +455,34 @@ n_components = 2
n_neighbors = 15


[reduce.tsne]
# Placeholder for config values of tsne model


[reduce.tsne.kwargs]
# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
# Dimension of the embedded space
n_components = 2

# Perplexity is related to the number of nearest neighbors used in other manifold learning algorithms.
# See the official documentation for details.
perplexity = 30.0


[reduce.pca]
# Number of data points used to fit the pca model.
fit_sample_size = 1024

# Path to a pre-existing pca reducer model
model_path = false


[reduce.pca.kwargs]
# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#
# Dimension of the embedded space
n_components=2


[visualize]

# List of metadata field names to use in visualizer. Must be available as metadata in your dataset
Expand Down
2 changes: 2 additions & 0 deletions src/hyrax/verbs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from hyrax.verbs.to_onnx import ToOnnx
from hyrax.verbs.engine import Engine
from hyrax.verbs.prepare import Prepare
from hyrax.verbs.reduce_dimensions import ReduceDimensions
from hyrax.verbs.create_splits import CreateSplits
from hyrax.verbs.verb_registry import Verb
from hyrax.verbs.verb_registry import all_class_verbs, all_verbs, fetch_verb_class, is_verb_class
Expand All @@ -32,6 +33,7 @@
"Train",
"Test",
"SaveToDatabase",
"ReduceDimensions",
"Verb",
"DatabaseConnection",
"Model",
Expand Down
167 changes: 167 additions & 0 deletions src/hyrax/verbs/reduce_dimensions.py
Comment thread
Graciaaa3 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import gc
import logging
import warnings
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Union

import numpy as np

from .verb_registry import Verb, hyrax_verb

logger = logging.getLogger(__name__)


@hyrax_verb
class ReduceDimensions(Verb):
    """Verb to reduce the dimensionality of a dataset.

    The reduction algorithm is selected by name (argument or
    ``config["reduce"]["algorithm"]``), resolved through the reduction
    algorithm registry, fitted or loaded, and then applied batch by batch to
    the results of an earlier ``infer`` run.
    """

    # Use an attribute-friendly name so `hyrax.reduce_dimensions` resolves.
    cli_name = "reduce_dimensions"
    add_parser_kwargs = {}
    description = "Reduce the dimensionality of a dataset using provided or default reduction algorithm."

    @staticmethod
    def setup_parser(parser: ArgumentParser):
        """Register the optional CLI arguments of the verb on ``parser``.

        Parameters
        ----------
        parser : ArgumentParser
            The parser to which ``--algorithm``, ``--input-dir`` and
            ``--model-path`` are added. All three are optional strings.
        """
        parser.add_argument(
            "-a",
            "--algorithm",
            type=str,
            required=False,
            help="Dimensionality reduction algorithm to use (default: umap).",
        )
        parser.add_argument(
            "-i",
            "--input-dir",
            type=str,
            required=False,
            help="Directory containing the dataset to reduce dimensions for.",
        )
        parser.add_argument(
            "-m",
            "--model-path",
            type=str,
            required=False,
            help="Path to a previously saved reducer model.",
        )

    def run_cli(self, args: Namespace | None = None):
        """Run the verb from parsed command line arguments.

        Parameters
        ----------
        args : Namespace, Optional
            Parsed arguments carrying ``algorithm``, ``input_dir`` and ``model_path``.

        Returns
        -------
        The value returned by :meth:`run`.

        Raises
        ------
        RuntimeError
            If ``args`` is None.
        """
        logger.info("`reduce-dimensions` run from CLI.")

        if args is None:
            raise RuntimeError("Run CLI called with no arguments.")

        return self.run(algorithm=args.algorithm, input_dir=args.input_dir, model_path=args.model_path)

    def run(
        self,
        algorithm: str | None = None,
        input_dir: Union[Path, str] | None = None,
        model_path: Union[Path, str] | None = None,
    ):
        """
        Run dimensionality reduction on a dataset

        This method loads the latent space representations from an inference run and applies
        the selected dimensionality reduction algorithm.

        Algorithms that support reusable fitted models may either:

        - fit a new model using a sampled subset of the data, or
        - load an existing model if a model path is provided.

        Algorithms without a separate fitting stage do not support model loading and
        directly transform the input data.

        The full dataset is then transformed into the target lower-dimensional space,
        and the resulting embeddings are saved.

        Parameters
        ----------
        algorithm : str, Optional
            The dimensionality reduction algorithm to use.
            If not specified, the method will look in the config for a default algorithm.

        input_dir : str or Path, Optional
            Directory containing the dataset to reduce dimensions for.

        model_path : str or Path, Optional
            Path to a previously saved reducer model. If not specified, the ``model_path``
            entry of the selected algorithm's config section is used; a falsy value there
            means a new model is fitted.

        Returns
        -------
        The results dataset loaded back (via ``load_results_dataset``) from the results
        directory that the reduced representations were written to.

        Raises
        ------
        ValueError
            If the inference results are empty, or if ``config["reduce"]["batch_size"]``
            is not a positive number.
        """
        # FutureWarnings are silenced for the duration of the reduction only.
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            return self._run(algorithm, input_dir, model_path)

    def _run(
        self, algorithm: str | None, input_dir: Union[Path, str] | None, model_path: Union[Path, str] | None
    ):
        """See run()"""
        # Imported lazily inside the method rather than at module import time.
        from hyrax.config_utils import create_results_dir
        from hyrax.datasets.result_factories import create_results_writer, load_results_dataset
        from hyrax.verbs.reduction_algorithms.algorithm_registry import fetch_reducer_class

        reduce_config = self.config["reduce"]

        # Get reducer class; an explicit argument wins over the configured default.
        algorithm_name = algorithm or reduce_config["algorithm"]
        reducer_cls = fetch_reducer_class(algorithm_name)

        results_dir = create_results_dir(self.config, f"{algorithm_name}")
        logger.info(f"Saving reduction results using {algorithm_name} to {results_dir}")
        reduction_results = create_results_writer(results_dir)

        algo_reducer = reducer_cls(self.config, reduction_results)

        inference_results = load_results_dataset(self.config, results_dir=input_dir, verb="infer")
        total_length = len(inference_results)

        # Fail early with a clear message instead of an opaque numpy error further down
        # (np.array_split rejects zero sections, and a zero batch size divides by zero).
        if total_length == 0:
            raise ValueError("Cannot reduce dimensions: the inference results contain no data.")

        batch_size = reduce_config["batch_size"]
        if batch_size <= 0:
            raise ValueError(f"config['reduce']['batch_size'] must be positive, got {batch_size}.")

        # Prepare data sample for either fitting a new model or validating a pre-trained model loaded.
        # A missing or falsy fit_sample_size means "use every data point".
        algorithm_config = reduce_config[algorithm_name]
        config_sample_size = algorithm_config.get("fit_sample_size", None)
        sample_size = int(min(config_sample_size or total_length, total_length))
        rng = np.random.default_rng()
        sample_indexes = rng.choice(total_length, size=sample_size, replace=False)
        # Flatten every sampled item to a 1-D feature vector.
        data_sample = np.asarray(inference_results[sample_indexes]).reshape((sample_size, -1))

        # Load model if path provided, otherwise fit new model
        # Getting the model of current algorithm specified.
        if model_path is None:
            model_path = algorithm_config.get("model_path", None)

        if model_path:
            logger.info(f"Loading pre-existing reducer model from {model_path}")
            algo_reducer.load_model(data_sample.shape[1], model_path)
        else:
            logger.info("No model_path specified. A new model will be fitted.")
            algo_reducer.fit(data_sample)

            if reduce_config.get("save_fit_model", False):
                logger.info(f"Saving fitted {algorithm_name} reducer to result directory")
                algo_reducer.save_model(results_dir)

        # The sample is no longer needed; release it before transforming the full dataset.
        del data_sample
        gc.collect()

        # Transform dataset
        num_batches = int(np.ceil(total_length / batch_size))

        all_indexes = np.arange(0, total_length)
        all_ids = np.array(inference_results.ids())

        # Lazily yield (ids, flattened data) pairs so only one batch is built at a time.
        batches = (
            (
                all_ids[batch_indexes],
                inference_results[batch_indexes].reshape(len(batch_indexes), -1),
            )
            for batch_indexes in np.array_split(all_indexes, num_batches)
        )
        algo_reducer.transform(batches, num_batches)

        logger.info(f"Finished transforming all data with {algorithm_name}")

        return load_results_dataset(self.config, results_dir)
14 changes: 14 additions & 0 deletions src/hyrax/verbs/reduction_algorithms/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Remove import sorting, these are imported in the order written so that
# autoapi docs are generated with ordering controlled below.
# ruff: noqa: I001
from .algorithm_registry import ReductionAlgorithm
from .umap import UMAP
from .pca import PCA
from .tsne import TSNE

# Names this package exposes publicly; each one is imported at the top of this module.
__all__ = [
    "ReductionAlgorithm",
    "UMAP",
    "PCA",
    "TSNE",
]
Loading
Loading