lincc-frameworks · Graciaaa3 · Jun 23, 2026 · May 22, 2026 · May 26, 2026 · May 26, 2026
diff --git a/src/hyrax/config_migrations/migrations/004_move_umap_to_reduce.py b/src/hyrax/config_migrations/migrations/004_move_umap_to_reduce.py
@@ -0,0 +1,77 @@
+"""Config migration: version 4 → version 5.
+
+Move the legacy ``[umap]`` and ``[umap.UMAP]`` tables under the ``[reduce]`` table
+as ``[reduce.umap]`` and ``[reduce.umap.kwargs]``.
+"""
+
+import tomlkit
+from tomlkit.toml_document import TOMLDocument
+
+from hyrax.config_migrations.migration_utils import migration_step, move_key
+
+
+@migration_step(
+    from_version=4,
+    key_renames={
+        "umap.fit_sample_size": "reduce.umap.fit_sample_size",
+        "umap.model_path": "reduce.umap.model_path",
+        "umap.save_fit_umap": "reduce.save_fit_model",
+        "umap.parallel": "reduce.parallel",
+        "umap.UMAP": "reduce.umap.kwargs",
+    },
+)
+def move_umap_to_reduce(cfg: TOMLDocument) -> TOMLDocument:
+    """Move the legacy ``[umap]`` and ``[umap.UMAP]`` tables under ``[reduce]``.
+
+    When ``cfg`` has a truthy ``[umap]`` table, this step:
+
+    - moves the keys listed in ``key_renames`` to their new ``reduce.*`` paths,
+    - sets ``reduce.batch_size`` to 1024,
+    - sets ``reduce.algorithm`` to ``"umap"`` if the legacy ``umap.name`` was
+      ``"umap.UMAP"``,
+    - deletes the old ``[umap]`` table, and
+    - writes ``[reduce.tsne]`` and ``[reduce.pca]`` tables with fixed values.
+
+    Parameters
+    ----------
+    cfg : TOMLDocument
+        The configuration document to migrate. It is modified in place.
+
+    Returns
+    -------
+    TOMLDocument
+        The same ``cfg`` object. It is returned unchanged when ``[umap]`` is
+        missing or falsy (e.g. an empty table).
+    """
+    # Moving umap sections
+    umap_tbl = cfg.get("umap")
+    # Nothing to migrate when there is no (or a falsy) legacy [umap] table.
+    if not umap_tbl:
+        return cfg
+
+    # Ensure [reduce] exists
+    reduce_tbl = cfg.get("reduce")
+    if reduce_tbl is None:
+        reduce_tbl = tomlkit.table()
+        cfg["reduce"] = reduce_tbl
+
+    # Ensure [reduce.umap] exists
+    umap_reduce = reduce_tbl.get("umap")
+    if umap_reduce is None:
+        umap_reduce = tomlkit.table()
+        reduce_tbl["umap"] = umap_reduce
+
+    # under [reduce.umap]
+    move_key(cfg, "umap.fit_sample_size", "reduce.umap.fit_sample_size")
+    move_key(cfg, "umap.model_path", "reduce.umap.model_path")
+
+    # under [reduce]
+    # NOTE(review): assigned unconditionally, so a pre-existing
+    # reduce.batch_size value is replaced -- confirm this is intended.
+    reduce_tbl["batch_size"] = 1024
+    move_key(cfg, "umap.save_fit_umap", "reduce.save_fit_model")
+    move_key(cfg, "umap.parallel", "reduce.parallel")
+    # The legacy umap.name key is not moved; only the "umap.UMAP" value is
+    # translated into reduce.algorithm. Any other value is dropped when the
+    # [umap] table is deleted below.
+    if "name" in umap_tbl and umap_tbl["name"] == "umap.UMAP":
+        reduce_tbl["algorithm"] = "umap"
+
+    # Move umap.UMAP kwargs to reduce.umap.kwargs
+    move_key(cfg, "umap.UMAP", "reduce.umap.kwargs")
+
+    # Delete the old umap section
+    del cfg["umap"]
+
+    # Adding tsne section
+    # NOTE(review): the tsne and pca assignments below replace any existing
+    # reduce.tsne / reduce.pca tables -- confirm this is intended.
+    reduce_tbl["tsne"] = tomlkit.table()
+
+    reduce_tbl["tsne"]["kwargs"] = tomlkit.table()
+    reduce_tbl["tsne"]["kwargs"]["n_components"] = 2
+    reduce_tbl["tsne"]["kwargs"]["perplexity"] = 30.0
+
+    # Adding pca section
+    reduce_tbl["pca"] = tomlkit.table()
+    reduce_tbl["pca"]["fit_sample_size"] = 1024
+    reduce_tbl["pca"]["model_path"] = False
+
+    reduce_tbl["pca"]["kwargs"] = tomlkit.table()
+    reduce_tbl["pca"]["kwargs"]["n_components"] = 2
+
+    # reduce_tbl always has entries by this point (batch_size, tsne and pca are
+    # set above), so this re-assigns the table onto cfg.
+    # NOTE(review): presumably a safeguard so the edits are reflected in the
+    # document -- confirm whether it is still needed.
+    if len(reduce_tbl):
+        cfg["reduce"] = reduce_tbl
+
+    return cfg
diff --git a/src/hyrax/hyrax_default_config.toml b/src/hyrax/hyrax_default_config.toml
@@ -423,24 +423,29 @@ vector_size = 64
 inference_dir = false
 
 
-[umap]
-# Number of data points used to fit the umap transform.
-fit_sample_size = 1024
+[reduce]
+# Name of the reduction algorithm to use
+algorithm = "umap"
 
-# Save the fitted umap as a pickle file 
-save_fit_umap = true
+# Save the fitted reducer model as a pickle file 
+save_fit_model = true
 
-# Path to a pre-existing umap reducer model
-model_path = false
+# Number of data points to transform at once with the reduction algorithm
+batch_size = 1024
 
-# Use multiprocessing during transforming to umap space (More memory intensive)
+# Use multiprocessing during transforming with reduction algorithm (More memory intensive)
 parallel = false
 
-# Name of the umap implementation to use
-name = "umap.UMAP"
+
+[reduce.umap]
+# Number of data points used to fit the umap model.
+fit_sample_size = 1024
+
+# Path to a pre-existing umap reducer model
+model_path = false
 
 
-[umap.UMAP]
+[reduce.umap.kwargs]
 # Specify any parameter accepted by https://umap-learn.readthedocs.io/en/latest/api.html#umap
 # Dimension of the embedded space
 n_components = 2
@@ -450,6 +455,34 @@ n_components = 2
 n_neighbors = 15
 
 
+[reduce.tsne]
+# Placeholder for config values of tsne model
+
+
+[reduce.tsne.kwargs]
+# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
+# Dimension of the embedded space
+n_components = 2
+
+# Perplexity is related to the number of nearest neighbors used in other manifold
+# learning algorithms. See official documentation for details.
+perplexity = 30.0
+
+
+[reduce.pca]
+# Number of data points used to fit the pca model.
+fit_sample_size = 1024
+
+# Path to a pre-existing pca reducer model
+model_path = false
+
+
+[reduce.pca.kwargs]
+# Specify any parameter accepted by https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#
+# Dimension of the embedded space
+n_components=2
+
+
 [visualize]
 
 # List of metadata field names to use in visualizer. Must be available as metadata in your dataset

diff --git a/src/hyrax/verbs/__init__.py b/src/hyrax/verbs/__init__.py
@@ -14,6 +14,7 @@
 from hyrax.verbs.to_onnx import ToOnnx
 from hyrax.verbs.engine import Engine
 from hyrax.verbs.prepare import Prepare
+from hyrax.verbs.reduce_dimensions import ReduceDimensions
 from hyrax.verbs.create_splits import CreateSplits
 from hyrax.verbs.verb_registry import Verb
 from hyrax.verbs.verb_registry import all_class_verbs, all_verbs, fetch_verb_class, is_verb_class
@@ -32,6 +33,7 @@
     "Train",
     "Test",
     "SaveToDatabase",
+    "ReduceDimensions",
     "Verb",
     "DatabaseConnection",
     "Model",

diff --git a/src/hyrax/verbs/reduce_dimensions.py b/src/hyrax/verbs/reduce_dimensions.py
@@ -0,0 +1,167 @@
+import gc
+import logging
+import warnings
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+
+from .verb_registry import Verb, hyrax_verb
+
+logger = logging.getLogger(__name__)
+
+
+@hyrax_verb
+class ReduceDimensions(Verb):
+    """Verb to reduce the dimensionality of a dataset"""
+
+    # Use an attribute-friendly name so `hyrax.reduce_dimensions` resolves.
+    cli_name = "reduce_dimensions"
+    add_parser_kwargs = {}
+    description = "Reduce the dimensionality of a dataset using provided or default reduction algorithm."
+
+    @staticmethod
+    def setup_parser(parser: ArgumentParser):
+        """Setup parser for reduce-dimensions verb"""
+        parser.add_argument(
+            "-a",
+            "--algorithm",
+            type=str,
+            required=False,
+            help="Dimensionality reduction algorithm to use (default: umap).",
+        )
+        parser.add_argument(
+            "-i",
+            "--input-dir",
+            type=str,
+            required=False,
+            help="Directory containing the dataset to reduce dimensions for.",
+        )
+        parser.add_argument(
+            "-m",
+            "--model-path",
+            type=str,
+            required=False,
+            help="Path to a previously saved reducer model.",
+        )
+
+    def run_cli(self, args: Namespace | None = None):
+        """CLI stub for ReduceDimensions verb"""
+        logger.info("`reduce-dimensions` run from CLI.")
+
+        if args is None:
+            raise RuntimeError("Run CLI called with no arguments.")
+
+        return self.run(algorithm=args.algorithm, input_dir=args.input_dir, model_path=args.model_path)
+
+    def run(
+        self,
+        algorithm: str | None = None,
+        input_dir: Union[Path, str] | None = None,
+        model_path: Union[Path, str] | None = None,
+    ):
+        """
+        Run dimensionality reduction on a dataset
+
+        This method loads the latent space representations from an inference run and applies
+        the selected dimensionality reduction algorithm.
+
+        Algorithms that support reusable fitted models may either:
+
+        - fit a new model using a sampled subset of the data, or
+        - load an existing model if a model path is provided.
+
+        Algorithms without a separate fitting stage do not support model loading and
+        directly transform the input data.
+
+        The full dataset is then transformed into the target lower-dimensional space,
+        and the resulting embeddings are saved.
+
+        Parameters
+        ----------
+        algorithm : str, Optional
+            The dimensionality reduction algorithm to use.
+            If not specified, the method will look in the config for a default algorithm.
+
+        input_dir : str or Path, Optional
+            Directory containing the dataset to reduce dimensions for.
+
+        model_path : str or Path, Optional
+            Path to a previously saved reducer model.
+
+        Returns
+        -------
+        None
+            The method does not return anything but saves the algorithm reducer representations to disk.
+        """
+        with warnings.catch_warnings():
+            warnings.simplefilter(action="ignore", category=FutureWarning)
+            return self._run(algorithm, input_dir, model_path)
+
+    def _run(
+        self, algorithm: str | None, input_dir: Union[Path, str] | None, model_path: Union[Path, str] | None
+    ):
+        """See run()"""
+        from hyrax.config_utils import create_results_dir
+        from hyrax.datasets.result_factories import create_results_writer, load_results_dataset
+        from hyrax.verbs.reduction_algorithms.algorithm_registry import fetch_reducer_class
+
+        # Get reducer class
+        algorithm_name = algorithm or self.config["reduce"]["algorithm"]
+        reducer_cls = fetch_reducer_class(algorithm_name)
+
+        results_dir = create_results_dir(self.config, f"{algorithm_name}")
+        logger.info(f"Saving reduction results using {algorithm_name} to {results_dir}")
+        reduction_results = create_results_writer(results_dir)
+
+        algo_reducer = reducer_cls(self.config, reduction_results)
+
+        inference_results = load_results_dataset(self.config, results_dir=input_dir, verb="infer")
+        total_length = len(inference_results)
+
+        # Prepare data sample for either fitting a new model or validating a pre-trained model loaded.
+        config_sample_size = self.config["reduce"][algorithm_name].get("fit_sample_size", None)
+        sample_size = int(np.min([config_sample_size if config_sample_size else np.inf, total_length]))
+        rng = np.random.default_rng()
+        sample_indexes = rng.choice(np.arange(total_length), size=sample_size, replace=False)
+        data_sample = np.asarray(inference_results[sample_indexes]).reshape((sample_size, -1))
+
+        # Load model if path provided, otherwise fit new model
+        # Getting the model of current algorithm specified.
+        if model_path is None:
+            model_path = self.config["reduce"][algorithm_name].get("model_path", None)
+
+        if model_path:
+            logger.info(f"Loading pre-existing reducer model from {model_path}")
+            algo_reducer.load_model(data_sample.shape[1], model_path)
+        else:
+            logger.info("No model_path specified. A new model will be fitted.")
+            algo_reducer.fit(data_sample)
+
+            if self.config["reduce"].get("save_fit_model", False):
+                logger.info(f"Saving fitted {algorithm_name} reducer to result directory")
+                algo_reducer.save_model(results_dir)
+
+        del data_sample
+        gc.collect()
+
+        # Transform dataset
+        batch_size = self.config["reduce"]["batch_size"]
+        num_batches = int(np.ceil(total_length / batch_size))
+
+        all_indexes = np.arange(0, total_length)
+        all_ids = np.array(inference_results.ids())
+
+        args = (
+            (
+                all_ids[batch_indexes],
+                inference_results[batch_indexes].reshape(len(batch_indexes), -1),
+            )
+            for batch_indexes in np.array_split(all_indexes, num_batches)
+        )
+        algo_reducer.transform(args, num_batches)
+
+        logger.info(f"Finished transforming all data with {algorithm_name}")
+
+        return load_results_dataset(self.config, results_dir)
diff --git a/src/hyrax/verbs/reduction_algorithms/__init__.py b/src/hyrax/verbs/reduction_algorithms/__init__.py
@@ -0,0 +1,14 @@
+# Import sorting is disabled: these are imported in the order written so that
+# autoapi docs are generated with ordering controlled below.
+# ruff: noqa: I001
+from .algorithm_registry import ReductionAlgorithm
+from .umap import UMAP
+from .pca import PCA
+from .tsne import TSNE
+
+__all__ = [
+    "ReductionAlgorithm",
+    "UMAP",
+    "PCA",
+    "TSNE",
+]