Skip to content

feature

Feature representation and feature sets.

Feature dataclass

Represents a single feature with metadata.

Attributes:

- name (str): Feature name.
- dtype (FeatureType): Data type of feature.
- origin (FeatureOrigin): How the feature was created.
- source_columns (list): Original columns used to create this feature.
- transformation (str): Description of transformation applied.
- explanation (str, optional): Human-readable explanation of the feature.
- code (str, optional): Python code that generates this feature.
- importance (float, optional): Feature importance score.
- metadata (dict): Additional metadata.

Source code in featcopilot/core/feature.py
@dataclass
class Feature:
    """
    Represents a single feature with metadata.

    Attributes
    ----------
    name : str
        Feature name
    dtype : FeatureType
        Data type of feature
    origin : FeatureOrigin
        How the feature was created
    source_columns : list
        Original columns used to create this feature
    transformation : str
        Description of transformation applied
    explanation : str, optional
        Human-readable explanation of the feature
    code : str, optional
        Python code that generates this feature
    importance : float, optional
        Feature importance score
    metadata : dict
        Additional metadata
    """

    name: str  # the only required field
    dtype: FeatureType = FeatureType.NUMERIC
    origin: FeatureOrigin = FeatureOrigin.ORIGINAL
    # Defaults to [name] via __post_init__ when left empty.
    source_columns: list[str] = field(default_factory=list)
    transformation: str = ""
    explanation: str | None = None
    code: str | None = None  # executable snippet consumed by ``compute``
    importance: float | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # An original, untransformed feature is sourced from itself.
        if not self.source_columns:
            self.source_columns = [self.name]

    def to_dict(self) -> dict[str, Any]:
        """Convert feature to dictionary."""
        # Enum fields are flattened to their string ``.value`` so the
        # result round-trips through ``from_dict``.
        return {
            "name": self.name,
            "dtype": self.dtype.value,
            "origin": self.origin.value,
            "source_columns": self.source_columns,
            "transformation": self.transformation,
            "explanation": self.explanation,
            "code": self.code,
            "importance": self.importance,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Feature":
        """Create feature from dictionary."""
        # ``name`` is the only mandatory key; missing optional keys fall
        # back to the dataclass defaults, and an empty ``source_columns``
        # is rewritten to ``[name]`` by ``__post_init__``.
        return cls(
            name=data["name"],
            dtype=FeatureType(data.get("dtype", "numeric")),
            origin=FeatureOrigin(data.get("origin", "original")),
            source_columns=data.get("source_columns", []),
            transformation=data.get("transformation", ""),
            explanation=data.get("explanation"),
            code=data.get("code"),
            importance=data.get("importance"),
            metadata=data.get("metadata", {}),
        )

    def compute(self, df: pd.DataFrame) -> pd.Series:
        """
        Compute feature values from DataFrame using stored code.

        The stored ``code`` is executed in a single shared namespace
        with ``df``, ``np`` and ``pd`` bound as names alongside a
        curated set of safe Python builtins (``len``, ``range``,
        ``sum``, numeric / sequence constructors, etc.) so common
        idioms work without giving the snippet a Python import system
        — ``__import__`` is intentionally NOT in the safe builtins, so
        an ``import foo`` statement inside the snippet raises at exec
        time. The snippet must bind its output to a name called
        ``result``.

        .. note::
           This is **not** a security sandbox for untrusted code.
           ``pd`` is in scope, which means the snippet can reach
           pandas' file I/O helpers (``pd.read_csv``, ``pd.read_parquet``,
           ``df.to_csv``, ...), and dunder attribute access on objects
           reachable from ``df`` / ``np`` / ``pd`` is not blocked. The
           builtin whitelist limits the *namespace* available to plain
           Python idioms; it does not isolate FeatCopilot from the
           ambient process. Stored snippets must therefore come from a
           trusted source (your own code generator, a vetted feature
           store, or a transform-rule registry you control).

        A *fresh copy* of the safe-builtins dict is passed into ``exec``
        on every call so that any mutation the snippet performs on
        ``__builtins__`` (rebinding entries, ``del``, ``pop``) does not
        bleed into subsequent ``compute`` calls. Likewise the
        data-bound namespace is constructed fresh per call. Using a
        SINGLE dict for both ``globals`` and ``locals`` is what makes
        free variables inside comprehensions and lambdas — which Python
        resolves against the enclosing function's globals, not the
        caller's locals — see ``df``, ``np`` and ``pd`` correctly.
        With separate ``locals`` and ``globals`` dicts a snippet such
        as ``[df['c'].iloc[i] for i in range(len(df))]`` would
        otherwise raise ``NameError`` because the implicit comprehension
        function's body looks ``df`` up in the (empty) ``globals``.

        Parameters
        ----------
        df : DataFrame
            Input data

        Returns
        -------
        Series
            Computed feature values

        Raises
        ------
        ValueError
            * If ``self.code`` is empty / missing — message
              ``"No code defined for feature ..."``.
            * If ``self.code`` is present but did not bind a
              ``result`` variable — message
              ``"Feature ... code did not produce a 'result' variable"``.
              These two cases produce DIFFERENT messages so a failing
              snippet is distinguishable from an unset feature when
              debugging.
        """
        if not self.code:
            raise ValueError(f"No code defined for feature {self.name}")

        # Single shared namespace so comprehensions / lambdas /
        # generator expressions inside the snippet see ``df``, ``np``,
        # ``pd`` and the safe builtins. Fresh dicts per call so the
        # snippet cannot pollute either the safe-builtins whitelist or
        # the data bindings for later ``compute`` invocations.
        namespace: dict[str, Any] = {
            "__builtins__": dict(_SAFE_BUILTINS),
            "df": df,
            "np": np,
            "pd": pd,
        }
        exec(self.code, namespace)
        # Distinguish "snippet ran but bound no result" from "no code at all".
        if "result" not in namespace:
            raise ValueError(
                f"Feature {self.name!r} code did not produce a 'result' variable. "
                "Stored snippet must bind its output to a name called 'result'."
            )
        return namespace["result"]

compute(df)

Compute feature values from DataFrame using stored code.

The stored code is executed in a single shared namespace with df, np and pd bound as names alongside a curated set of safe Python builtins (len, range, sum, numeric / sequence constructors, etc.) so common idioms work without giving the snippet a Python import system — __import__ is intentionally NOT in the safe builtins, so an import foo statement inside the snippet raises at exec time. The snippet must bind its output to a name called result.

.. note:: This is not a security sandbox for untrusted code. pd is in scope, which means the snippet can reach pandas' file I/O helpers (pd.read_csv, pd.read_parquet, df.to_csv, ...), and dunder attribute access on objects reachable from df / np / pd is not blocked. The builtin whitelist limits the namespace available to plain Python idioms; it does not isolate FeatCopilot from the ambient process. Stored snippets must therefore come from a trusted source (your own code generator, a vetted feature store, or a transform-rule registry you control).

A fresh copy of the safe-builtins dict is passed into exec on every call so that any mutation the snippet performs on __builtins__ (rebinding entries, del, pop) does not bleed into subsequent compute calls. Likewise the data-bound namespace is constructed fresh per call. Using a SINGLE dict for both globals and locals is what makes free variables inside comprehensions and lambdas — which Python resolves against the enclosing function's globals, not the caller's locals — see df, np and pd correctly. With separate locals and globals dicts a snippet such as [df['c'].iloc[i] for i in range(len(df))] would otherwise raise NameError because the implicit comprehension function's body looks df up in the (empty) globals.

Parameters:

- df (DataFrame): Input data. Required.

Returns:

- Series: Computed feature values.

Raises:

- ValueError:
  • If self.code is empty / missing — message "No code defined for feature ...".
  • If self.code is present but did not bind a result variable — message "Feature ... code did not produce a 'result' variable". These two cases produce DIFFERENT messages so a failing snippet is distinguishable from an unset feature when debugging.
Source code in featcopilot/core/feature.py
def compute(self, df: pd.DataFrame) -> pd.Series:
    """
    Compute feature values from DataFrame using stored code.

    The stored ``code`` is executed in a single shared namespace
    with ``df``, ``np`` and ``pd`` bound as names alongside a
    curated set of safe Python builtins (``len``, ``range``,
    ``sum``, numeric / sequence constructors, etc.) so common
    idioms work without giving the snippet a Python import system
    — ``__import__`` is intentionally NOT in the safe builtins, so
    an ``import foo`` statement inside the snippet raises at exec
    time. The snippet must bind its output to a name called
    ``result``.

    .. note::
       This is **not** a security sandbox for untrusted code.
       ``pd`` is in scope, which means the snippet can reach
       pandas' file I/O helpers (``pd.read_csv``, ``pd.read_parquet``,
       ``df.to_csv``, ...), and dunder attribute access on objects
       reachable from ``df`` / ``np`` / ``pd`` is not blocked. The
       builtin whitelist limits the *namespace* available to plain
       Python idioms; it does not isolate FeatCopilot from the
       ambient process. Stored snippets must therefore come from a
       trusted source (your own code generator, a vetted feature
       store, or a transform-rule registry you control).

    A *fresh copy* of the safe-builtins dict is passed into ``exec``
    on every call so that any mutation the snippet performs on
    ``__builtins__`` (rebinding entries, ``del``, ``pop``) does not
    bleed into subsequent ``compute`` calls. Likewise the
    data-bound namespace is constructed fresh per call. Using a
    SINGLE dict for both ``globals`` and ``locals`` is what makes
    free variables inside comprehensions and lambdas — which Python
    resolves against the enclosing function's globals, not the
    caller's locals — see ``df``, ``np`` and ``pd`` correctly.
    With separate ``locals`` and ``globals`` dicts a snippet such
    as ``[df['c'].iloc[i] for i in range(len(df))]`` would
    otherwise raise ``NameError`` because the implicit comprehension
    function's body looks ``df`` up in the (empty) ``globals``.

    Parameters
    ----------
    df : DataFrame
        Input data

    Returns
    -------
    Series
        Computed feature values

    Raises
    ------
    ValueError
        * If ``self.code`` is empty / missing — message
          ``"No code defined for feature ..."``.
        * If ``self.code`` is present but did not bind a
          ``result`` variable — message
          ``"Feature ... code did not produce a 'result' variable"``.
          These two cases produce DIFFERENT messages so a failing
          snippet is distinguishable from an unset feature when
          debugging.
    """
    if not self.code:
        raise ValueError(f"No code defined for feature {self.name}")

    # Single shared namespace so comprehensions / lambdas /
    # generator expressions inside the snippet see ``df``, ``np``,
    # ``pd`` and the safe builtins. Fresh dicts per call so the
    # snippet cannot pollute either the safe-builtins whitelist or
    # the data bindings for later ``compute`` invocations.
    namespace: dict[str, Any] = {
        "__builtins__": dict(_SAFE_BUILTINS),
        "df": df,
        "np": np,
        "pd": pd,
    }
    exec(self.code, namespace)
    # Distinguish "snippet ran but bound no result" from "no code at all".
    if "result" not in namespace:
        raise ValueError(
            f"Feature {self.name!r} code did not produce a 'result' variable. "
            "Stored snippet must bind its output to a name called 'result'."
        )
    return namespace["result"]

from_dict(data) classmethod

Create feature from dictionary.

Source code in featcopilot/core/feature.py
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "Feature":
    """Create feature from dictionary."""
    # ``name`` is the only mandatory key; everything else falls back
    # to the same defaults the dataclass declares.
    dtype = FeatureType(data.get("dtype", "numeric"))
    origin = FeatureOrigin(data.get("origin", "original"))
    return cls(
        name=data["name"],
        dtype=dtype,
        origin=origin,
        source_columns=data.get("source_columns", []),
        transformation=data.get("transformation", ""),
        explanation=data.get("explanation"),
        code=data.get("code"),
        importance=data.get("importance"),
        metadata=data.get("metadata", {}),
    )

to_dict()

Convert feature to dictionary.

Source code in featcopilot/core/feature.py
def to_dict(self) -> dict[str, Any]:
    """Convert feature to dictionary."""
    # Enum-valued fields are flattened to their ``.value`` strings so
    # the output is plain-data serializable.
    payload: dict[str, Any] = {
        "name": self.name,
        "dtype": self.dtype.value,
        "origin": self.origin.value,
        "source_columns": self.source_columns,
        "transformation": self.transformation,
    }
    payload["explanation"] = self.explanation
    payload["code"] = self.code
    payload["importance"] = self.importance
    payload["metadata"] = self.metadata
    return payload

FeatureOrigin

Bases: Enum

Origin/source of feature.

Source code in featcopilot/core/feature.py
class FeatureOrigin(Enum):
    """Origin/source of feature.

    The string values are the serialized form used by
    ``Feature.to_dict`` / ``Feature.from_dict``.
    """

    ORIGINAL = "original"  # Original input feature
    POLYNOMIAL = "polynomial"  # Polynomial transformation
    INTERACTION = "interaction"  # Interaction between features
    AGGREGATION = "aggregation"  # Aggregation operation
    TIMESERIES = "timeseries"  # Time series extraction
    LLM_GENERATED = "llm_generated"  # Generated by LLM
    LLM_SUGGESTED = "llm_suggested"  # Suggested by LLM, implemented traditionally
    CUSTOM = "custom"  # Custom user-defined

FeatureSet

Collection of features with operations for manipulation.

Provides methods for adding, removing, filtering, and combining features.

Source code in featcopilot/core/feature.py
class FeatureSet:
    """
    Name-keyed collection of Feature objects.

    Supports adding, removing, filtering, sorting, merging and batch
    computation of features. Container protocols (``len``, ``in``,
    iteration, indexing by name) are implemented on top of an internal
    name -> Feature dict.
    """

    def __init__(self, features: list[Feature] | None = None):
        # Keyed by feature name; a later add() with the same name
        # replaces the earlier entry.
        self._features: dict[str, Feature] = {}
        for feat in features or []:
            self.add(feat)

    def __len__(self) -> int:
        return len(self._features)

    def __iter__(self):
        # Iteration yields Feature objects, not names.
        yield from self._features.values()

    def __contains__(self, name: str) -> bool:
        return name in self._features

    def __getitem__(self, name: str) -> Feature:
        return self._features[name]

    def add(self, feature: Feature) -> None:
        """Add a feature to the set (replacing any same-named feature)."""
        self._features[feature.name] = feature

    def remove(self, name: str) -> Feature | None:
        """Remove and return a feature by name; None if absent."""
        return self._features.pop(name, None)

    def get(self, name: str) -> Feature | None:
        """Get a feature by name; None if absent."""
        return self._features.get(name)

    def get_names(self) -> list[str]:
        """Get all feature names (in insertion order)."""
        return [*self._features]

    def filter_by_origin(self, origin: FeatureOrigin) -> "FeatureSet":
        """Filter features by origin."""
        matching = [feat for feat in self if feat.origin == origin]
        return FeatureSet(matching)

    def filter_by_type(self, dtype: FeatureType) -> "FeatureSet":
        """Filter features by data type."""
        matching = [feat for feat in self if feat.dtype == dtype]
        return FeatureSet(matching)

    def filter_by_importance(self, min_importance: float) -> "FeatureSet":
        """Filter features by minimum importance (unscored features are dropped)."""
        matching = [
            feat
            for feat in self
            if feat.importance is not None and feat.importance >= min_importance
        ]
        return FeatureSet(matching)

    def sort_by_importance(self, descending: bool = True) -> list[Feature]:
        """Sort features by importance, excluding unscored ones."""
        scored = [feat for feat in self if feat.importance is not None]
        scored.sort(key=lambda feat: feat.importance or 0, reverse=descending)
        return scored

    def merge(self, other: "FeatureSet") -> "FeatureSet":
        """Merge with another feature set; ``other`` wins on name clashes."""
        merged = FeatureSet(list(self._features.values()))
        for feat in other:
            merged.add(feat)
        return merged

    def to_dataframe(self) -> pd.DataFrame:
        """Convert feature set to DataFrame with metadata (one row per feature)."""
        rows = [feat.to_dict() for feat in self]
        return pd.DataFrame(rows)

    def get_explanations(self) -> dict[str, str]:
        """Get explanations for all features that have them."""
        explanations: dict[str, str] = {}
        for feat in self:
            if feat.explanation:
                explanations[feat.name] = feat.explanation
        return explanations

    def compute_all(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute all features that have code defined.

        Features without code, or whose name already exists as a column,
        are skipped. Failures are logged and skipped so one bad snippet
        does not abort the batch.

        Parameters
        ----------
        df : DataFrame
            Input data

        Returns
        -------
        DataFrame
            Copy of ``df`` with computed feature columns appended
        """
        result = df.copy()
        for feature in self._features.values():
            if not feature.code or feature.name in result.columns:
                continue
            try:
                result[feature.name] = feature.compute(df)
            except Exception as e:
                # Best-effort: log the failure and continue.
                logger.warning(f"Could not compute feature {feature.name}: {e}")
        return result

add(feature)

Add a feature to the set.

Source code in featcopilot/core/feature.py
def add(self, feature: Feature) -> None:
    """Add a feature to the set, overwriting any prior entry of the same name."""
    key = feature.name
    self._features[key] = feature

compute_all(df)

Compute all features that have code defined.

Parameters:

Name Type Description Default
df DataFrame

Input data

required

Returns:

Type Description
DataFrame

DataFrame with computed features

Source code in featcopilot/core/feature.py
def compute_all(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute all features that have code defined.

    Features without code, or whose name already exists as a column,
    are skipped. Failures are logged and skipped so one bad snippet
    does not abort the batch.

    Parameters
    ----------
    df : DataFrame
        Input data

    Returns
    -------
    DataFrame
        Copy of ``df`` with computed feature columns appended
    """
    result = df.copy()
    for feature in self._features.values():
        if not feature.code or feature.name in result.columns:
            continue
        try:
            result[feature.name] = feature.compute(df)
        except Exception as e:
            # Best-effort: log the failure and continue.
            logger.warning(f"Could not compute feature {feature.name}: {e}")
    return result

filter_by_importance(min_importance)

Filter features by minimum importance.

Source code in featcopilot/core/feature.py
def filter_by_importance(self, min_importance: float) -> "FeatureSet":
    """Filter features by minimum importance (unscored features are dropped)."""
    kept = []
    for feat in self._features.values():
        # Features with importance=None never pass the threshold.
        if feat.importance is not None and feat.importance >= min_importance:
            kept.append(feat)
    return FeatureSet(kept)

filter_by_origin(origin)

Filter features by origin.

Source code in featcopilot/core/feature.py
def filter_by_origin(self, origin: FeatureOrigin) -> "FeatureSet":
    """Filter features by origin."""
    matching = [feat for feat in self._features.values() if feat.origin == origin]
    return FeatureSet(matching)

filter_by_type(dtype)

Filter features by data type.

Source code in featcopilot/core/feature.py
def filter_by_type(self, dtype: FeatureType) -> "FeatureSet":
    """Filter features by data type."""
    matching = [feat for feat in self._features.values() if feat.dtype == dtype]
    return FeatureSet(matching)

get(name)

Get a feature by name.

Source code in featcopilot/core/feature.py
def get(self, name: str) -> Feature | None:
    """Get a feature by name; None if absent."""
    try:
        return self._features[name]
    except KeyError:
        return None

get_explanations()

Get explanations for all features that have them.

Source code in featcopilot/core/feature.py
def get_explanations(self) -> dict[str, str]:
    """Get explanations for all features that have them."""
    # Features with a missing or empty explanation are omitted.
    explanations: dict[str, str] = {}
    for feat in self._features.values():
        if feat.explanation:
            explanations[feat.name] = feat.explanation
    return explanations

get_names()

Get all feature names.

Source code in featcopilot/core/feature.py
def get_names(self) -> list[str]:
    """Get all feature names (in insertion order)."""
    # Unpacking the dict yields its keys.
    return [*self._features]

merge(other)

Merge with another feature set.

Source code in featcopilot/core/feature.py
def merge(self, other: "FeatureSet") -> "FeatureSet":
    """Merge with another feature set; ``other`` wins on name clashes."""
    # Seed the new set with our own features, then layer other's on top.
    merged = FeatureSet(list(self._features.values()))
    for feat in other:
        merged.add(feat)
    return merged

remove(name)

Remove and return a feature by name.

Source code in featcopilot/core/feature.py
def remove(self, name: str) -> Feature | None:
    """Remove and return a feature by name."""
    # Absent names are not an error; None signals "nothing removed".
    return self._features.pop(name, None)

sort_by_importance(descending=True)

Sort features by importance.

Source code in featcopilot/core/feature.py
def sort_by_importance(self, descending: bool = True) -> list[Feature]:
    """Sort features by importance, excluding unscored ones."""
    scored = [feat for feat in self._features.values() if feat.importance is not None]
    # ``or 0`` is a defensive guard; None values were already filtered out.
    scored.sort(key=lambda feat: feat.importance or 0, reverse=descending)
    return scored

to_dataframe()

Convert feature set to DataFrame with metadata.

Source code in featcopilot/core/feature.py
def to_dataframe(self) -> pd.DataFrame:
    """Convert feature set to DataFrame with metadata (one row per feature)."""
    rows = [feat.to_dict() for feat in self._features.values()]
    return pd.DataFrame(rows)

FeatureType

Bases: Enum

Types of features.

Source code in featcopilot/core/feature.py
class FeatureType(Enum):
    """Types of features.

    The string values are the serialized form used by
    ``Feature.to_dict`` / ``Feature.from_dict``.
    """

    NUMERIC = "numeric"
    CATEGORICAL = "categorical"
    DATETIME = "datetime"
    TEXT = "text"
    BOOLEAN = "boolean"