Разработка системы feature engineering для крипто-данных
Feature engineering — процесс создания информативных признаков из сырых данных для ML-моделей. В крипто-трейдинге это критически важный этап: правильные features могут удвоить качество модели. Система автоматизирует создание, валидацию и отбор features.
Категории features
Price-based features:
import pandas as pd
import numpy as np
import talib
def create_price_features(df, symbol='BTC'):
    """Build price-derived features from an OHLC frame.

    Args:
        df: DataFrame with 'close', 'high' and 'low' columns. The ``h``
            suffix in the column names suggests hourly bars -- TODO confirm
            against the data loader.
        symbol: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame indexed like ``df`` containing multi-period simple and log
        returns, rolling moments of 1-bar returns, the close position inside
        the rolling high/low range, and the relative distance to several
        EMAs. Lagged and rolling columns are NaN until enough history exists.
    """
    close = df['close']
    f = pd.DataFrame(index=df.index)

    # Loop-invariant inputs: computed once instead of once per period/window.
    log_close = np.log(close)
    rets = close.pct_change()

    # Multi-period returns
    for period in [1, 2, 4, 8, 12, 24, 48, 72, 168]:
        f[f'ret_{period}h'] = close.pct_change(period)
        f[f'log_ret_{period}h'] = log_close.diff(period)

    # Rolling statistics of 1-bar returns
    for window in [12, 24, 72, 168]:
        rolling_rets = rets.rolling(window)
        f[f'ret_mean_{window}'] = rolling_rets.mean()
        f[f'ret_std_{window}'] = rolling_rets.std()
        f[f'ret_skew_{window}'] = rolling_rets.skew()
        f[f'ret_kurt_{window}'] = rolling_rets.kurt()

    # Price position within the rolling high/low range
    for window in [24, 72, 168]:
        rolling_high = df['high'].rolling(window).max()
        rolling_low = df['low'].rolling(window).min()
        # The epsilon keeps the ratio finite when high == low over the window.
        f[f'price_position_{window}'] = (
            (close - rolling_low) / (rolling_high - rolling_low + 1e-8)
        )

    # Relative distance from exponential moving averages
    for ma_period in [9, 21, 50, 100, 200]:
        ma = close.ewm(span=ma_period).mean()
        f[f'dist_ema_{ma_period}'] = (close - ma) / ma

    return f
Volume features:
def create_volume_features(df):
    """Build volume-derived features from an OHLCV frame.

    Args:
        df: DataFrame with 'high', 'low', 'close' and 'volume' columns.

    Returns:
        DataFrame indexed like ``df`` with volume-to-rolling-mean ratios,
        relative distance of close from a 24-bar rolling VWAP, OBV and its
        12-bar slope, ATR (raw and divided by close) and MFI.

    Note:
        The ratios are not guarded against a zero denominator: a window with
        zero total volume produces inf/NaN in ``vol_ratio_*`` and
        ``vwap_distance``.
    """
    # TA-Lib rejects non-float64 input ("input array type is not double") and
    # integer volume columns are common, so cast every input once up front.
    high = df['high'].astype('float64')
    low = df['low'].astype('float64')
    close = df['close'].astype('float64')
    volume = df['volume'].astype('float64')

    f = pd.DataFrame(index=df.index)

    # Volume relative to its own rolling mean
    for window in [6, 12, 24, 72]:
        f[f'vol_ratio_{window}'] = volume / volume.rolling(window).mean()

    # Volume-price relationship: distance of close from the 24-bar rolling
    # VWAP (close-weighted), relative to close.
    rolling_vwap = (close * volume).rolling(24).sum() / volume.rolling(24).sum()
    f['vwap_distance'] = (close - rolling_vwap) / close

    # On-balance volume (OBV) and its average change per bar over 12 bars
    f['obv'] = talib.OBV(close, volume)
    f['obv_slope'] = f['obv'].diff(12) / 12

    # Average True Range, raw and normalized by price
    f['atr_14'] = talib.ATR(high, low, close, timeperiod=14)
    f['atr_ratio'] = f['atr_14'] / close

    # Money Flow Index
    f['mfi_14'] = talib.MFI(high, low, close, volume, timeperiod=14)

    return f
Technical indicators:
def create_technical_features(df):
    """Build normalized technical-indicator features via TA-Lib.

    Args:
        df: DataFrame with 'high', 'low' and 'close' columns.

    Returns:
        DataFrame indexed like ``df`` with RSI, ADX and Stochastic divided by
        100, MACD histograms divided by close, Bollinger band width and
        position, and a 0/1 ``adx_trend`` flag. Rows inside an indicator's
        warm-up period are NaN, including ``adx_trend``.
    """
    # TA-Lib rejects non-float64 input, so cast once up front.
    high = df['high'].astype('float64')
    low = df['low'].astype('float64')
    close = df['close'].astype('float64')

    f = pd.DataFrame(index=df.index)

    # Momentum oscillators
    for period in [9, 14, 21]:
        f[f'rsi_{period}'] = talib.RSI(close, timeperiod=period) / 100

    # MACD family; only the histogram is kept
    for fast, slow, signal in [(12, 26, 9), (5, 13, 5), (24, 52, 18)]:
        _, _, hist = talib.MACD(
            close, fastperiod=fast, slowperiod=slow, signalperiod=signal
        )
        # Divide by price so the value is comparable across price levels.
        f[f'macd_hist_{fast}_{slow}'] = hist / close

    # Bollinger Bands
    for window, n_dev in [(20, 2), (20, 1), (50, 2)]:
        upper, mid, lower = talib.BBANDS(
            close, timeperiod=window, nbdevup=n_dev, nbdevdn=n_dev
        )
        f[f'bb_width_{window}_{n_dev}'] = (upper - lower) / mid
        # The epsilon keeps the ratio finite when the bands collapse.
        f[f'bb_pos_{window}_{n_dev}'] = (close - lower) / (upper - lower + 1e-8)

    # ADX (trend strength)
    f['adx_14'] = talib.ADX(high, low, close, timeperiod=14) / 100
    # NaN > 0.25 evaluates to False, which would label the warm-up rows as
    # "no trend"; keep them missing instead.
    f['adx_trend'] = (
        (f['adx_14'] > 0.25).astype(float).where(f['adx_14'].notna())
    )

    # Stochastic oscillator with the library-default periods
    slowk, slowd = talib.STOCH(high, low, close)
    f['stoch_k'] = slowk / 100
    f['stoch_d'] = slowd / 100

    return f
Market microstructure features (требуют доступа к bid/ask данным):
def create_microstructure_features(df_ticks, spread_window=100, ofi_window=20):
    """Build bid/ask spread and order-flow-imbalance features.

    Args:
        df_ticks: DataFrame with 'bid', 'ask', 'buy_volume' and
            'sell_volume' columns. A 'mid' column is used when present;
            otherwise the midpoint ``(bid + ask) / 2`` is computed.
        spread_window: Rolling window (in rows) for the mean spread.
        ofi_window: Rolling window (in rows) for the mean order-flow
            imbalance.

    Returns:
        DataFrame indexed like ``df_ticks`` with columns 'spread',
        'spread_ma', 'spread_relative', 'buy_volume', 'sell_volume', 'ofi'
        and 'ofi_ma'.
    """
    bid = df_ticks['bid']
    ask = df_ticks['ask']
    # Not every feed ships a precomputed mid price; derive it when missing.
    if 'mid' in df_ticks.columns:
        mid = df_ticks['mid']
    else:
        mid = (bid + ask) / 2

    f = pd.DataFrame(index=df_ticks.index)

    # Bid-ask spread relative to the mid price, and relative to its own mean
    f['spread'] = (ask - bid) / mid
    f['spread_ma'] = f['spread'].rolling(spread_window).mean()
    f['spread_relative'] = f['spread'] / f['spread_ma']

    # Order flow imbalance in [-1, 1]; the epsilon avoids 0/0 on empty rows
    f['buy_volume'] = df_ticks['buy_volume']
    f['sell_volume'] = df_ticks['sell_volume']
    total_volume = f['buy_volume'] + f['sell_volume']
    f['ofi'] = (f['buy_volume'] - f['sell_volume']) / (total_volume + 1e-8)
    f['ofi_ma'] = f['ofi'].rolling(ofi_window).mean()

    return f
Cross-asset features
def create_cross_asset_features(prices_dict, target_symbol='BTC'):
    """Build lagged-return and rolling-correlation features from other assets.

    Args:
        prices_dict: Mapping from symbol to a price series. Must contain
            ``target_symbol``; any of ETH, BNB, SP500, GOLD and DXY that are
            present are used, the rest are skipped.
        target_symbol: Key of the asset whose index defines the output rows
            and against which correlations are computed.

    Returns:
        DataFrame indexed like the target series with, for each available
        symbol, its 1-bar return lagged by 1, 4 and 24 bars plus a 720-bar
        rolling correlation of its returns with the target's returns.
    """
    target_prices = prices_dict[target_symbol]
    f = pd.DataFrame(index=target_prices.index)

    # Target returns do not depend on the loop symbol; compute them once.
    target_rets = target_prices.pct_change()

    for symbol in ['ETH', 'BNB', 'SP500', 'GOLD', 'DXY']:
        if symbol not in prices_dict:
            continue

        # Returns of the related asset, lagged
        rets = prices_dict[symbol].pct_change()
        for lag in [1, 4, 24]:
            f[f'{symbol}_ret_lag_{lag}'] = rets.shift(lag)

        # Rolling correlation with the target; 720 bars is 30 days if the
        # bars are hourly -- TODO confirm the bar size.
        f[f'corr_{symbol}_30d'] = target_rets.rolling(720).corr(rets)

    return f
Feature validation
Перед обучением модели валидируем features:
class FeatureValidator:
    """Compute per-feature quality diagnostics before model training."""

    # Minimum number of rows where both feature and target are present for
    # the correlation to be computed at all.
    MIN_OVERLAP = 100
    # Thresholds behind the 'recommended' flag.
    MAX_MISSING_PCT = 0.05
    MIN_ABS_CORR = 0.01
    MIN_VARIANCE = 1e-8
    # Significance level for the ADF stationarity test.
    ADF_ALPHA = 0.05

    def validate(self, features_df, target_series):
        """Return a diagnostics table with one row per feature column.

        Look-ahead bias is not checked here; it has to be ruled out by
        construction when the features are built.

        Args:
            features_df: DataFrame whose columns are candidate features.
            target_series: Target values aligned with ``features_df``.

        Returns:
            DataFrame indexed by feature name with columns 'missing_pct',
            'correlation_with_target', 'stationary', 'variance' and
            'recommended'. 'stationary' is None when the ADF test could not
            be computed and does not influence 'recommended'.

        Raises:
            ImportError: If statsmodels is not installed.
        """
        # Imported lazily, but once per call rather than once per column.
        from statsmodels.tsa.stattools import adfuller

        report = {}
        target_present = target_series.notna()

        for col in features_df.columns:
            series = features_df[col]

            # Share of missing values
            missing_pct = series.isna().mean()

            # Correlation with the target on rows where both are present;
            # with too few such rows it is reported as 0.
            valid_mask = series.notna() & target_present
            if valid_mask.sum() > self.MIN_OVERLAP:
                corr = series[valid_mask].corr(target_series[valid_mask])
            else:
                corr = 0

            # Stationarity (ADF test). Best effort: the test can fail on
            # short, constant or non-finite input, which must not abort the
            # whole report. `except Exception` (unlike a bare `except`) still
            # lets KeyboardInterrupt/SystemExit propagate.
            try:
                adf_p = adfuller(series.dropna())[1]
                stationary = adf_p < self.ADF_ALPHA
            except Exception:
                stationary = None

            # Variance, to reject (near-)constant features
            variance = series.var()

            report[col] = {
                'missing_pct': missing_pct,
                'correlation_with_target': corr,
                'stationary': stationary,
                'variance': variance,
                'recommended': (missing_pct < self.MAX_MISSING_PCT
                                and abs(corr) > self.MIN_ABS_CORR
                                and variance > self.MIN_VARIANCE),
            }

        return pd.DataFrame(report).T
Feature selection
После создания 200+ features нужен отбор значимых:
Mutual Information оценивает зависимость (в том числе нелинейную) между feature и target:
# Score every feature by its mutual information with the target and keep the
# names of the 50 highest-scoring ones.
# X_train and y_train are defined outside this snippet. Feature names are read
# from X_train.columns, so X_train is presumably a pandas DataFrame -- confirm.
# mutual_info_classif is the classification variant, i.e. it treats y_train as
# discrete class labels.
# NOTE(review): scikit-learn estimators generally reject NaN input, and rolling
# features start with NaN rows -- confirm those rows are dropped or imputed
# before this call.
from sklearn.feature_selection import mutual_info_classif
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({'feature': X_train.columns, 'mi_score': mi_scores})
top_features = mi_df.nlargest(50, 'mi_score')['feature'].tolist()
SHAP importance: после обучения baseline-модели SHAP values показывают реальный вклад каждого признака:
# Rank features by their mean absolute SHAP value on the validation set,
# largest first. lgb_model and X_val are defined outside this snippet; the
# index is taken from X_val.columns.
# NOTE(review): np.abs(...).mean(0) assumes shap_values is a single 2-D
# (samples x features) array. Some shap versions return one array per class
# for classifiers -- confirm for the installed version and model type.
import shap
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_val)
feature_importance = pd.Series(
np.abs(shap_values).mean(0),
index=X_val.columns
).sort_values(ascending=False)
Correlation filtering: из каждой пары features с абсолютной попарной корреляцией > 0.95 оставляем только один.
Feature store архитектура
Для production: централизованный Feature Store с версионированием:
Raw data sources
  → Feature computation pipelines (scheduled)
    → Feature Store (Feast / Hopsworks / custom PostgreSQL)
      → Online store (Redis) → realtime serving
      → Offline store (Parquet/S3) → batch training
Разрабатываем Feature Engineering систему с 100+ автоматически вычисляемыми признаками, валидацией на look-ahead bias, feature selection, Feature Store для централизованного хранения и versioning.







