From e47d269448d7515bb96d7877deb2d66cb925662d Mon Sep 17 00:00:00 2001 From: Bernardo Braga Date: Tue, 6 Jan 2026 10:48:03 +0000 Subject: [PATCH 1/2] ensembling predictions made easy --- numerai_tools/scoring.py | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py index 1de6d9b..dd75041 100644 --- a/numerai_tools/scoring.py +++ b/numerai_tools/scoring.py @@ -445,6 +445,57 @@ def tie_kept_rank__gaussianize__pow_1_5(df: pd.DataFrame) -> pd.DataFrame: return power(gaussian(tie_kept_rank(df)), 1.5) +def ensemble_predictions(predictions: pd.DataFrame) -> pd.Series: + """Ensemble multiple prediction columns into a single prediction using equal weighting. + + This function ensures all predictors contribute equally by normalizing each prediction + through tie-kept ranking and gaussianization before taking the mean. The ensemble result + is then normalized using the same transformation pipeline (rank + gaussianize). + + The output is ready for further processing such as neutralization or scoring with + numerai_corr (which applies the power 1.5 transformation). + + This approach guarantees that: + 1. All predictions are on the same scale before ensembling + 2. Each predictor has equal weight regardless of its original distribution + 3. The output is properly normalized (mean≈0, std≈1) and ready for downstream operations + + Note: This function operates on a single time period (e.g., single era). When working + with multiple eras, apply this function separately to each era using groupby. + + Arguments: + predictions: pd.DataFrame - DataFrame where each column is a prediction to ensemble. + All columns should have the same index. + + Returns: + pd.Series - The ensembled prediction as a Series with the same index as input, + normalized through tie-kept ranking and gaussianization. + + Example: + >>> # Ensemble within each era + >>> ensemble = df.groupby('era', group_keys=False).apply( + ... lambda era_df: ensemble_predictions(era_df[pred_cols]) + ... ) + + >>> # Simple ensemble without era grouping + >>> ensemble = ensemble_predictions(predictions_df) + """ + assert isinstance(predictions, pd.DataFrame), "predictions must be a DataFrame" + assert len(predictions.columns) > 0, "predictions must have at least one column" + assert not predictions.isna().any().any(), "predictions contain NaNs" + + # Normalize each prediction to the same scale (mean≈0, std≈1) + normalized_preds = gaussian(tie_kept_rank(predictions)) + + # Take the mean of normalized predictions + ensemble = normalized_preds.mean(axis=1) + + # Normalize the ensemble result (rank + gaussianize produces mean≈0, std≈1) + ensemble_normalized = gaussian(tie_kept_rank(ensemble.to_frame())) + + return ensemble_normalized.iloc[:, 0] + + def tie_kept_rank__gaussianize__neutralize__variance_normalize( df: pd.DataFrame, neutralizers: pd.DataFrame ) -> pd.DataFrame: From f9becadf1fe5388b3c26724c92f0e55576f7ead7 Mon Sep 17 00:00:00 2001 From: Bernardo Braga Date: Tue, 6 Jan 2026 11:00:08 +0000 Subject: [PATCH 2/2] ensembling predictions with .mean() or .dot(weights) --- numerai_tools/scoring.py | 46 +++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py index dd75041..49de5fe 100644 --- a/numerai_tools/scoring.py +++ b/numerai_tools/scoring.py @@ -445,19 +445,22 @@ def tie_kept_rank__gaussianize__pow_1_5(df: pd.DataFrame) -> pd.DataFrame: return power(gaussian(tie_kept_rank(df)), 1.5) -def ensemble_predictions(predictions: pd.DataFrame) -> pd.Series: - """Ensemble multiple prediction columns into a single prediction using equal weighting. +def ensemble_predictions( + predictions: pd.DataFrame, weights: Optional[List[float]] = None +) -> pd.Series: + """Ensemble multiple prediction columns into a single prediction. - This function ensures all predictors contribute equally by normalizing each prediction - through tie-kept ranking and gaussianization before taking the mean. The ensemble result - is then normalized using the same transformation pipeline (rank + gaussianize). + This function ensures all predictors are normalized through tie-kept ranking and + gaussianization before aggregation. Supports both equal weighting (default) and + custom weights. The ensemble result is then normalized using the same transformation + pipeline (rank + gaussianize). The output is ready for further processing such as neutralization or scoring with - numerai_corr (which applies the power 1.5 transformation). + numerai_corr. This approach guarantees that: 1. All predictions are on the same scale before ensembling - 2. Each predictor has equal weight regardless of its original distribution + 2. Each predictor's contribution is proportional to its weight 3. The output is properly normalized (mean≈0, std≈1) and ready for downstream operations Note: This function operates on a single time period (e.g., single era). When working @@ -466,19 +469,26 @@ def ensemble_predictions(predictions: pd.DataFrame) -> pd.Series: Arguments: predictions: pd.DataFrame - DataFrame where each column is a prediction to ensemble. All columns should have the same index. + weights: Optional[List[float]] - Optional weights for each prediction column. + Must have same length as number of columns. + If None, uses equal weights (simple mean). + Weights will be normalized to sum to 1. Returns: pd.Series - The ensembled prediction as a Series with the same index as input, normalized through tie-kept ranking and gaussianization. Example: - >>> # Ensemble within each era + >>> # Equal weighting (simple mean) >>> ensemble = df.groupby('era', group_keys=False).apply( ... lambda era_df: ensemble_predictions(era_df[pred_cols]) ... ) - >>> # Simple ensemble without era grouping - >>> ensemble = ensemble_predictions(predictions_df) + >>> # Custom weights + >>> weights = [0.5, 0.3, 0.2] # Will be normalized to sum to 1 + >>> ensemble = df.groupby('era', group_keys=False).apply( + ... lambda era_df: ensemble_predictions(era_df[pred_cols], weights) + ... ) """ assert isinstance(predictions, pd.DataFrame), "predictions must be a DataFrame" assert len(predictions.columns) > 0, "predictions must have at least one column" @@ -487,8 +497,20 @@ def ensemble_predictions(predictions: pd.DataFrame) -> pd.Series: # Normalize each prediction to the same scale (mean≈0, std≈1) normalized_preds = gaussian(tie_kept_rank(predictions)) - # Take the mean of normalized predictions - ensemble = normalized_preds.mean(axis=1) + # Aggregate predictions + if weights is None: + # Equal weighting - simple mean + ensemble = normalized_preds.mean(axis=1) + else: + # Custom weights + assert len(weights) == len( + predictions.columns + ), f"weights length ({len(weights)}) must match number of columns ({len(predictions.columns)})" + # Normalize weights to sum to 1 + weights_array = np.array(weights) + weights_normalized = weights_array / weights_array.sum() + # Weighted average using dot product + ensemble = normalized_preds.dot(weights_normalized) # Normalize the ensemble result (rank + gaussianize produces mean≈0, std≈1) ensemble_normalized = gaussian(tie_kept_rank(ensemble.to_frame()))