Best Practices¶
Guidelines for getting the most out of FeatCopilot.
Data Preparation¶
Handle Missing Values First¶
# FeatCopilot handles NaN in generated features,
# but input data should be reasonably clean.
# Option 1: drop rows with missing values
X = X.dropna()
# Option 2: impute numeric columns with the median
X = X.fillna(X.median(numeric_only=True))
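For frames that mix numeric and text columns, a per-dtype imputation sketch (plain pandas; the 'missing' sentinel is an arbitrary choice, not FeatCopilot behavior):
num_cols = X.select_dtypes(include='number').columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
obj_cols = X.select_dtypes(include='object').columns
X[obj_cols] = X[obj_cols].fillna('missing')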
Identify Column Types¶
# Numeric columns are processed by TabularEngine
numeric_cols = X.select_dtypes(include=['number']).columns
# Text columns are processed by TextEngine
text_cols = X.select_dtypes(include=['object']).columns
Scale Appropriately¶
# Engineer features first; scale afterwards if the downstream model needs it
from sklearn.preprocessing import StandardScaler
engineer = AutoFeatureEngineer(engines=['tabular'])
X_fe = engineer.fit_transform(X, y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_fe.fillna(0))
Feature Engineering¶
Start Simple¶
# Start with tabular engine only
engineer = AutoFeatureEngineer(
    engines=['tabular'],
    max_features=30
)
# Add complexity if needed
engineer = AutoFeatureEngineer(
    engines=['tabular', 'timeseries', 'llm'],
    max_features=50
)
Limit Feature Explosion¶
# Too many features can cause:
# - Overfitting
# - Slow training
# - Memory issues
engineer = AutoFeatureEngineer(
    engines=['tabular'],
    max_features=50,  # Reasonable limit
    correlation_threshold=0.95  # Drop redundant, highly correlated features
)
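After fitting, confirm the cap actually held (get_feature_names is shown under Debugging below):
engineer.fit(X, y)
print(f"Features kept: {len(engineer.get_feature_names())}")  # should be <= 50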
Use Domain Knowledge¶
# LLM features work best with context
engineer = AutoFeatureEngineer(
    engines=['llm'],
    llm_config={
        'domain': 'healthcare',  # Specify domain
        'max_suggestions': 15
    }
)
X_fe = engineer.fit_transform(
    X, y,
    column_descriptions={...},  # Describe columns
    task_description="..."  # Describe task
)
Model Integration¶
Sklearn Pipeline¶
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
# Handle the full pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('features', AutoFeatureEngineer(engines=['tabular'], max_features=30)),
    ('model', RandomForestClassifier())
])
# Note: AutoFeatureEngineer outputs may contain NaN;
# add an imputation step after it if needed, as sketched below
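A sketch of that post-feature imputation (assumes AutoFeatureEngineer follows the sklearn transformer API, as in the pipeline above):
pipeline = Pipeline([
    ('features', AutoFeatureEngineer(engines=['tabular'], max_features=30)),
    ('post_impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('model', RandomForestClassifier())
])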
Train/Test Split¶
# Always split BEFORE feature engineering
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Fit on training data only
engineer = AutoFeatureEngineer(engines=['tabular'])
X_train_fe = engineer.fit_transform(X_train, y_train)
# Transform test data (no y needed)
X_test_fe = engineer.transform(X_test)
# Align columns
common_cols = [c for c in X_train_fe.columns if c in X_test_fe.columns]
X_train_fe = X_train_fe[common_cols]
X_test_fe = X_test_fe[common_cols]
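Equivalently, reindex the test frame against the training columns so the order matches too (plain pandas; fill_value=0 for train-only columns is a choice, not library behavior):
X_test_fe = X_test_fe.reindex(columns=X_train_fe.columns, fill_value=0)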
Cross-Validation¶
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
# Feature engineering inside CV to prevent leakage
def feature_engineer_cv(X, y, cv=5):
    scores = []
    for train_idx, val_idx in KFold(cv).split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        engineer = AutoFeatureEngineer(engines=['tabular'])
        X_train_fe = engineer.fit_transform(X_train, y_train)
        X_val_fe = engineer.transform(X_val)
        # Align columns, then train and evaluate
        # (RandomForest is illustrative; fillna(0) guards against NaN)
        X_val_fe = X_val_fe.reindex(columns=X_train_fe.columns, fill_value=0)
        model = RandomForestClassifier().fit(X_train_fe.fillna(0), y_train)
        scores.append(model.score(X_val_fe.fillna(0), y_val))
    return scores
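A quick usage check of the helper above:
scores = feature_engineer_cv(X, y, cv=5)
print(f"Mean CV score: {sum(scores) / len(scores):.3f}")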
LLM Features¶
Provide Rich Context¶
# ❌ Poor context
engineer.fit_transform(X, y)
# ✅ Rich context
engineer.fit_transform(
    X, y,
    column_descriptions={
        'col1': 'Detailed description...',
        'col2': 'Another description...'
    },
    task_description="""
    Detailed task description including:
    - What we're predicting
    - Business context
    - Important considerations
    """
)
Review Generated Code¶
# Always review before production
for name, code in engineer.get_feature_code().items():
    print(f"# {name}")
    print(code)
# Check for:
# - Division by zero handling
# - Edge cases
# - Correct column references
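For reference, one defensive pattern to look for (hypothetical columns 'a' and 'b'; an illustration, not actual FeatCopilot output):
import numpy as np
ratio = X['a'] / X['b'].replace(0, np.nan)  # yields NaN instead of inf for zero denominators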
Validate Features¶
# Enable validation (default)
llm_config = {'validate_features': True}
# Check for invalid features in verbose mode
engineer = AutoFeatureEngineer(
    engines=['llm'],
    llm_config=llm_config,
    verbose=True  # See validation results
)
Performance¶
Large Datasets¶
# For large datasets, limit features aggressively
engineer = AutoFeatureEngineer(
    engines=['tabular'],
    max_features=30,  # Keep it small
)
# Or fit on a random sample (assumes len(X) > 10,000)
import numpy as np
sample_idx = np.random.choice(len(X), 10000, replace=False)
engineer.fit(X.iloc[sample_idx], y.iloc[sample_idx])
X_fe = engineer.transform(X) # Transform full data
Memory Management¶
# Process in batches for very large data
import pandas as pd
def batch_transform(engineer, X, batch_size=10000):
    results = []
    for i in range(0, len(X), batch_size):
        batch = X.iloc[i:i+batch_size]
        results.append(engineer.transform(batch))
    return pd.concat(results)
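Applied to a fitted engineer and the full frame:
X_fe = batch_transform(engineer, X, batch_size=10000)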
Caching¶
from featcopilot.utils import FeatureCache
# Cache expensive computations
cache = FeatureCache(cache_dir='.feature_cache')
cache_key = 'my_features'
if cache.has(cache_key):
    X_fe = cache.get(cache_key)
else:
    X_fe = engineer.fit_transform(X, y)
    cache.set(cache_key, X_fe)
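A fixed key goes stale when the inputs change; one option is to derive the key from the data itself (plain pandas/hashlib, not a FeatCopilot API):
import hashlib
import pandas as pd
cache_key = hashlib.md5(pd.util.hash_pandas_object(X).values).hexdigest()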
Debugging¶
Verbose Mode¶
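Verbose mode prints progress and validation results during fitting; a minimal example:
engineer = AutoFeatureEngineer(engines=['tabular'], verbose=True)
X_fe = engineer.fit_transform(X, y)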
Check Generated Features¶
# See what was generated
print(f"Features generated: {len(engineer.get_feature_names())}")
print(engineer.get_feature_names()[:10])
# Check for issues
import numpy as np
X_fe = engineer.transform(X)
print(f"NaN count: {X_fe.isna().sum().sum()}")
print(f"Inf count: {np.isinf(X_fe.select_dtypes('number')).sum().sum()}")
Feature Statistics¶
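To sanity-check the distributions of the engineered features (plain pandas, nothing FeatCopilot-specific):
X_fe = engineer.transform(X)
print(X_fe.describe().T[['mean', 'std', 'min', 'max']])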
Common Pitfalls¶
1. Data Leakage¶
# ❌ Wrong: fitting on all data
engineer.fit_transform(X, y)  # Before split!
# ✅ Correct: fit on train only
X_train, X_test, y_train, y_test = train_test_split(X, y)
engineer.fit(X_train, y_train)
2. Feature Explosion¶
# ❌ Too many features
engineer = AutoFeatureEngineer(
    polynomial_degree=4,  # 4th degree polynomials
    max_features=None  # No limit
)
# ✅ Controlled generation
engineer = AutoFeatureEngineer(
    polynomial_degree=2,
    max_features=50
)
3. Ignoring NaN Values¶
# Generated features may have NaN
X_fe = engineer.fit_transform(X, y)
# Always handle before modeling
X_fe = X_fe.fillna(0) # or
X_fe = X_fe.fillna(X_fe.median())