Skip to content

redundancy

Redundancy elimination through correlation analysis.

RedundancyEliminator

Bases: BaseSelector

Eliminate redundant features based on correlation.

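Removes highly correlated features. Within each correlated pair, the feature with the higher importance score (if scores are provided) is kept; when the scores are equal or absent, the feature that appears first is kept.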

Parameters:

Name Type Description Default
correlation_threshold float

Absolute correlation at or above which two features are considered redundant

0.95
method str

Correlation method ('pearson', 'spearman', 'kendall')

'pearson'
original_features set[str]

Set of original feature names to prefer over derived features

None
original_preference float

Bonus added to importance scores of original features to prefer them

0.1

Examples:

>>> eliminator = RedundancyEliminator(correlation_threshold=0.95)
>>> X_reduced = eliminator.fit_transform(X, y)
Source code in featcopilot/selection/redundancy.py
class RedundancyEliminator(BaseSelector):
    """
    Eliminate redundant features based on correlation.

    Removes highly correlated features, keeping the one with
    higher importance (if provided) or the first one.

    Parameters
    ----------
    correlation_threshold : float, default=0.95
        Absolute correlation at or above which two numeric features are
        treated as redundant
    method : str, default='pearson'
        Correlation method ('pearson', 'spearman', 'kendall')
    importance_scores : dict[str, float], optional
        Pre-computed importance per feature name; features missing from the
        mapping are scored 0
    original_features : set[str], optional
        Set of original feature names to prefer over derived features
    original_preference : float, default=0.1
        Bonus added to importance scores of original features to prefer them
    verbose : bool, default=False
        If True, log every removal and a final summary
    **kwargs
        Forwarded to the base selector constructor

    Examples
    --------
    >>> eliminator = RedundancyEliminator(correlation_threshold=0.95)
    >>> X_reduced = eliminator.fit_transform(X, y)
    """

    def __init__(
        self,
        correlation_threshold: float = 0.95,
        method: str = "pearson",
        importance_scores: Optional[dict[str, float]] = None,
        original_features: Optional[set[str]] = None,
        original_preference: float = 0.1,
        verbose: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.correlation_threshold = correlation_threshold
        self.method = method
        # Fall back to fresh containers so instances never share mutable state
        self.importance_scores = importance_scores or {}
        self.original_features = original_features or set()
        self.original_preference = original_preference
        self.verbose = verbose
        # Populated by fit(); None until then
        self._correlation_matrix: Optional[pd.DataFrame] = None

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """Fit and transform in one step (y is optional for this selector)."""
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        importance_scores: Optional[dict[str, float]] = None,
        **kwargs,
    ) -> "RedundancyEliminator":
        """
        Fit eliminator by computing correlations.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable (unused)
        importance_scores : dict, optional
            Pre-computed importance scores; a non-empty mapping replaces the
            scores stored on the instance

        Returns
        -------
        self : RedundancyEliminator
        """
        X = self._validate_input(X)

        # An empty or missing mapping leaves previously stored scores untouched
        if importance_scores:
            self.importance_scores = importance_scores

        # Compute correlation matrix (only for numeric columns)
        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

        self._correlation_matrix = X[numeric_cols].corr(method=self.method)

        # Find redundant features among numeric columns
        self._find_redundant_features(numeric_cols, non_numeric_cols)

        self._is_fitted = True
        return self

    def _find_redundant_features(self, columns: list[str], non_numeric_cols: list[str]) -> None:
        """
        Identify and mark redundant features for removal.

        Walks every pair of numeric columns once and, for pairs whose absolute
        correlation reaches ``correlation_threshold``, drops the one with the
        lower (preference-adjusted) importance; ties keep the earlier column.
        Stores the result in ``_selected_features`` and ``_removed_features``.

        Parameters
        ----------
        columns : list[str]
            Numeric column names to compare, in column order
        non_numeric_cols : list[str]
            Column names that are always kept
        """
        to_remove: set[str] = set()
        # Only label removals as "(derived)" when the caller actually declared
        # which features are original; otherwise the tag would be misleading.
        tag_derived = bool(self.original_features)

        for i, col1 in enumerate(columns):
            if col1 in to_remove:
                continue

            # col2 only ranges over later columns, so each pair is seen once
            for col2 in columns[i + 1 :]:
                if col2 in to_remove:
                    continue

                # Get correlation
                corr = abs(self._correlation_matrix.loc[col1, col2])

                # A NaN correlation compares False and is never redundant
                if not corr >= self.correlation_threshold:
                    continue

                # Decide which to remove based on importance + original feature preference
                imp1 = self.importance_scores.get(col1, 0)
                imp2 = self.importance_scores.get(col2, 0)

                # Add preference bonus for original features
                # This ensures original features are preferred over derived ones
                is_orig1 = col1 in self.original_features
                is_orig2 = col2 in self.original_features

                if is_orig1 and not is_orig2:
                    # col1 is original, col2 is derived - prefer col1
                    imp1 += self.original_preference
                elif is_orig2 and not is_orig1:
                    # col2 is original, col1 is derived - prefer col2
                    imp2 += self.original_preference

                # Ties favour col1, i.e. the column that appears first
                drop_first = imp1 < imp2
                if drop_first:
                    kept, dropped, dropped_is_orig = col2, col1, is_orig1
                else:
                    kept, dropped, dropped_is_orig = col1, col2, is_orig2

                to_remove.add(dropped)
                if self.verbose:
                    orig_tag = " (derived)" if tag_derived and not dropped_is_orig else ""
                    logger.info(f"Removing {dropped}{orig_tag} (corr={corr:.3f} with {kept})")

                if drop_first:
                    break  # col1 is removed, move to next

        # Selected features are those not removed (numeric) plus all non-numeric columns
        # Non-numeric columns (categorical/text) are always preserved
        self._selected_features = [c for c in columns if c not in to_remove]
        self._selected_features.extend(non_numeric_cols)  # Always include non-numeric
        # Report removals in column order so the result is reproducible across runs
        self._removed_features = [c for c in columns if c in to_remove]

        if self.verbose:
            logger.info(f"RedundancyEliminator: Removed {len(to_remove)} redundant features")

    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """
        Remove redundant features.

        Returns the columns selected during fit that are present in ``X``,
        followed by any other non-numeric columns of ``X``.

        Raises
        ------
        RuntimeError
            If called before the eliminator has been fitted
        """
        if not self._is_fitted:
            raise RuntimeError("Eliminator must be fitted before transform")

        X = self._validate_input(X)

        # Keep selected features plus any non-numeric columns
        non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
        keep_cols = [c for c in self._selected_features if c in X.columns]
        already_kept = set(keep_cols)
        keep_cols.extend([c for c in non_numeric if c not in already_kept])

        return X[keep_cols]

    def get_removed_features(self) -> list[str]:
        """Get list of removed redundant features (empty if never fitted)."""
        return getattr(self, "_removed_features", [])

    def get_correlation_matrix(self) -> Optional[pd.DataFrame]:
        """Get the computed correlation matrix (None before fit)."""
        return self._correlation_matrix

fit(X, y=None, importance_scores=None, **kwargs)

Fit eliminator by computing correlations.

Parameters:

Name Type Description Default
X DataFrame or ndarray

Input features

required
y Series or ndarray

Target variable (unused)

None
importance_scores dict

Pre-computed importance scores

None

Returns:

Name Type Description
self RedundancyEliminator
Source code in featcopilot/selection/redundancy.py
def fit(
    self,
    X: Union[pd.DataFrame, np.ndarray],
    y: Optional[Union[pd.Series, np.ndarray]] = None,
    importance_scores: Optional[dict[str, float]] = None,
    **kwargs,
) -> "RedundancyEliminator":
    """
    Compute the feature correlation matrix and decide which features to drop.

    Parameters
    ----------
    X : DataFrame or ndarray
        Input features
    y : Series or ndarray, optional
        Target variable (unused)
    importance_scores : dict, optional
        Pre-computed importance scores; a non-empty mapping replaces the
        scores stored on the instance

    Returns
    -------
    self : RedundancyEliminator
    """
    frame = self._validate_input(X)

    # Empty or missing scores leave the stored mapping as it is
    if importance_scores:
        self.importance_scores = importance_scores

    # Split the columns by dtype; only numeric ones take part in the correlation
    numeric_cols = frame.select_dtypes(include=[np.number]).columns.tolist()
    non_numeric_cols = frame.select_dtypes(exclude=[np.number]).columns.tolist()

    numeric_frame = frame[numeric_cols]
    self._correlation_matrix = numeric_frame.corr(method=self.method)

    # Mark redundant numeric columns; non-numeric ones are passed through
    self._find_redundant_features(numeric_cols, non_numeric_cols)

    self._is_fitted = True
    return self

fit_transform(X, y=None, **kwargs)

Fit and transform in one step (y is optional for this selector).

Source code in featcopilot/selection/redundancy.py
def fit_transform(
    self,
    X: Union[pd.DataFrame, np.ndarray],
    y: Optional[Union[pd.Series, np.ndarray]] = None,
    **kwargs,
) -> pd.DataFrame:
    """Fit on ``X`` and return its reduced form; ``y`` may be omitted for this selector."""
    # The same keyword arguments are handed to both fit and transform
    fitted = self.fit(X, y, **kwargs)
    return fitted.transform(X, **kwargs)

get_correlation_matrix()

Get the computed correlation matrix.

Source code in featcopilot/selection/redundancy.py
def get_correlation_matrix(self) -> Optional[pd.DataFrame]:
    """Return the correlation matrix stored on the instance as ``_correlation_matrix``."""
    matrix = self._correlation_matrix
    return matrix

get_removed_features()

Get list of removed redundant features.

Source code in featcopilot/selection/redundancy.py
def get_removed_features(self) -> list[str]:
    """Return the stored list of removed features, or an empty list if none is stored."""
    try:
        return self._removed_features
    except AttributeError:
        # Attribute has not been set on this instance
        return []

transform(X, **kwargs)

Remove redundant features.

Source code in featcopilot/selection/redundancy.py
def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
    """
    Drop the features marked redundant during fitting.

    Returns the fitted selection restricted to columns present in ``X``,
    followed by any remaining non-numeric columns of ``X``.

    Raises
    ------
    RuntimeError
        If the eliminator has not been fitted
    """
    if not self._is_fitted:
        raise RuntimeError("Eliminator must be fitted before transform")

    frame = self._validate_input(X)
    available = frame.columns

    # Fitted selection first, skipping anything missing from this input
    selected = [name for name in self._selected_features if name in available]

    # Then non-numeric columns that the fitted selection did not already cover
    text_like = frame.select_dtypes(exclude=[np.number]).columns.tolist()
    extras = [name for name in text_like if name not in selected]

    return frame[selected + extras]