205 lines
9.7 KiB
Python
205 lines
9.7 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
from scipy import stats
|
|
from sklearn.preprocessing import RobustScaler
|
|
from ta.momentum import *
|
|
from ta.trend import *
|
|
from ta.volatility import *
|
|
from ta.volume import *
|
|
|
|
|
|
|
|
def trend_intensity(close, window=20):
    """Return the smoothed absolute rolling z-score of ``close``.

    Parameters
    ----------
    close : pandas.Series
        Price series.
    window : int, default 20
        Length of the rolling mean/std used for the z-score and of the
        rolling mean applied to its absolute value.

    Returns
    -------
    pandas.Series
        Rolling mean of ``|close - rolling_mean| / rolling_std``. Leading
        values are NaN until both rolling stages have a full window.
    """
    rolling_close = close.rolling(window=window)
    zscore = (close - rolling_close.mean()) / rolling_close.std()
    return zscore.abs().rolling(window=window).mean()
|
|
|
|
|
|
def calculate_fdi(high, low, close, window=30):
    """Compute the 'fdi' feature from rolling high/low and close ranges.

    Parameters
    ----------
    high, low, close : pandas.Series
        High, low and closing price series sharing one index.
    window : int, default 30
        Rolling look-back length.

    Returns
    -------
    pandas.Series
        ``(2 - n1) * 100`` where ``n1`` is the base-2 logarithm of the ratio
        between the rolling high-low range and the rolling close range.
    """
    high_low_range = (
        high.rolling(window=window).max() - low.rolling(window=window).min()
    )
    close_range = (
        close.rolling(window=window).max() - close.rolling(window=window).min()
    )
    # Difference of natural logs divided by ln(2) == log2 of the range ratio.
    log2_ratio = (np.log(high_low_range) - np.log(close_range)) / np.log(2)
    return (2 - log2_ratio) * 100
|
|
|
|
|
|
def _choppiness_index(high, low, close, window):
    """Return the Choppiness Index of an OHLC series over ``window`` bars.

    CHOP = 100 * log10(sum(true_range, n) / (max(high, n) - min(low, n))) / log10(n)
    """
    prev_close = close.shift(1)
    # Row-wise max skips the NaN produced by the shift, so the first bar's
    # true range falls back to high - low.
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    price_range = (
        high.rolling(window=window).max() - low.rolling(window=window).min()
    )
    tr_sum = true_range.rolling(window=window).sum()
    return 100 * np.log10(tr_sum / price_range) / np.log10(window)


def generate_ta_features(df):
    """Append technical-analysis indicator columns to an OHLCV frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'high'``, ``'low'``, ``'close'`` and ``'volume'``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the indicator columns added. Rows holding a NaN
        in any column are dropped, so at least the first 199 rows (warm-up
        of the 200-bar moving averages) are removed.
    """
    df_features = df.copy()

    high = df['high']
    low = df['low']
    close = df['close']
    volume = df['volume']

    # --- Moving averages; crossover == 1 only on the bar where the 50-bar
    # average moves from at-or-below to above the 200-bar average.
    df_features['sma_50'] = close.rolling(window=50).mean()
    df_features['sma_200'] = close.rolling(window=200).mean()
    df_features['sma_crossover'] = (
        (df_features['sma_50'] > df_features['sma_200'])
        & (df_features['sma_50'].shift(1) <= df_features['sma_200'].shift(1))
    ).astype(int)

    df_features['ema_50'] = EMAIndicator(close=close, window=50).ema_indicator()
    df_features['ema_200'] = EMAIndicator(close=close, window=200).ema_indicator()
    df_features['ema_crossover'] = (
        (df_features['ema_50'] > df_features['ema_200'])
        & (df_features['ema_50'].shift(1) <= df_features['ema_200'].shift(1))
    ).astype(int)

    df_features['wma'] = WMAIndicator(close, window=30).wma()

    # --- Ichimoku, ATR and Bollinger width (library default windows).
    ichimoku = IchimokuIndicator(high=high, low=low)
    df_features['ichimoku_a'] = ichimoku.ichimoku_a()
    df_features['ichimoku_b'] = ichimoku.ichimoku_b()
    df_features['atr'] = AverageTrueRange(
        high=high, low=low, close=close
    ).average_true_range()
    bb = BollingerBands(close=close)
    # Band width expressed relative to the current close.
    df_features['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / close

    # --- Trend / momentum oscillators.
    df_features['macd'] = macd(close)
    df_features['macd_signal'] = macd_signal(close)
    # Histogram is deliberately scaled by 2, as in the original feature set.
    df_features['macd_hist'] = 2 * macd_diff(close)
    df_features['adx'] = adx(high, low, close)
    df_features["adx_pos"] = adx_pos(high, low, close)
    df_features["adx_neg"] = adx_neg(high, low, close)
    df_features['cci'] = CCIIndicator(high=high, low=low, close=close).cci()
    df_features['mfi'] = MFIIndicator(
        high=high, low=low, close=close, volume=volume
    ).money_flow_index()

    # --- Volume-based indicators.
    df_features['nvi'] = NegativeVolumeIndexIndicator(
        close=close, volume=volume
    ).negative_volume_index()
    df_features['obv'] = OnBalanceVolumeIndicator(
        close=close, volume=volume
    ).on_balance_volume()
    df_features['vpt'] = VolumePriceTrendIndicator(
        close=close, volume=volume
    ).volume_price_trend()

    # --- RSI family with 10-bar smoothed variants.
    df_features['rsi'] = rsi(close, window=60)
    df_features['rolling_rsi'] = df_features['rsi'].rolling(window=10).mean()
    df_features['stoch_rsi'] = stochrsi_k(close, window=60, smooth1=3, smooth2=3)
    df_features['rolling_stoch_rsi'] = (
        df_features['stoch_rsi'].rolling(window=10).mean()
    )

    df_features['adi'] = acc_dist_index(high=high, low=low, close=close, volume=volume)
    df_features['cmf'] = chaikin_money_flow(
        high=high, low=low, close=close, volume=volume, window=20
    )
    df_features['emv'] = ease_of_movement(high=high, low=low, volume=volume, window=20)
    df_features['fi'] = force_index(close=close, volume=volume, window=13)

    df_features['williams'] = WilliamsRIndicator(
        high=high, low=low, close=close
    ).williams_r()
    df_features['kama'] = KAMAIndicator(close=close).kama()

    stoch = StochasticOscillator(
        high=high, low=low, close=close, window=60, smooth_window=3
    )
    df_features['stoch_k'] = stoch.stoch()
    df_features['stoch_d'] = stoch.stoch_signal()

    # --- Hand-rolled ratios.
    df_features['rocr'] = close / close.shift(30) - 1  # Rate of Change Ratio (ROCR)
    # PPO is normalised by the slow EMA: (fast - slow) / slow * 100.
    df_features['ppo'] = (
        (df_features['ema_50'] - df_features['ema_200'])
        / df_features['ema_200'] * 100
    )
    # VWAP accumulated from the first row of df (typical price weighted by volume).
    df_features['vwap'] = (
        (volume * (high + low + close) / 3).cumsum() / volume.cumsum()
    )
    df_features['volatility_ratio'] = (
        close.rolling(window=30).std() / close.rolling(window=60).std()
    )

    df_features['fdi'] = calculate_fdi(high, low, close)
    df_features['tii'] = trend_intensity(close)

    # NOTE(review): this is the magnitude spectrum of the *entire* close
    # series written position-by-position onto the time index. Every row
    # therefore depends on future prices (look-ahead) and row i holds
    # frequency bin i, not a value at time i. Kept unchanged; confirm whether
    # a rolling/windowed transform was intended.
    df_features['fft'] = np.abs(np.fft.fft(close))

    don_channel = DonchianChannel(high=high, low=low, close=close, window=60)
    df_features['don_hband'] = don_channel.donchian_channel_hband()
    df_features['don_lband'] = don_channel.donchian_channel_lband()
    df_features['don_mband'] = don_channel.donchian_channel_mband()
    df_features['don_pband'] = don_channel.donchian_channel_pband()
    df_features['don_wband'] = don_channel.donchian_channel_wband()

    aroon = AroonIndicator(high=high, low=low, window=60)
    df_features['aroon_down'] = aroon.aroon_down()
    df_features['aroon_indicator'] = aroon.aroon_indicator()
    df_features['aroon_up'] = aroon.aroon_up()

    df_features['ultimate_oscillator'] = UltimateOscillator(
        high=high, low=low, close=close
    ).ultimate_oscillator()
    # Standard Choppiness Index over one consistent 60-bar window (the same
    # window is used for the true-range sum, the high/low range and the
    # log normaliser).
    df_features['choppiness'] = _choppiness_index(high, low, close, window=60)
    df_features['ulcer'] = UlcerIndex(close, window=60).ulcer_index()
    # NOTE(review): the `*_indicator` helpers presumably return breakout
    # flags rather than band levels -- verify against the ta documentation.
    df_features['keltner_hband'] = keltner_channel_hband_indicator(
        high=high, low=low, close=close, window=60
    )
    df_features['keltner_lband'] = keltner_channel_lband_indicator(
        high=high, low=low, close=close, window=60
    )

    # Drop warm-up rows (and any row with a NaN in the input columns).
    df_features = df_features.dropna()
    return df_features
|
|
|
|
def generate_statistical_features(df, windows=(20, 50), price_col='close',
                                  high_col='high', low_col='low', volume_col='volume'):
    """
    Generate comprehensive statistical features for financial time series data.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the price and volume data. It is not modified.
    windows : iterable of int
        Rolling window sizes to use for feature generation. One set of
        ``*_{window}`` columns is produced per entry.
    price_col : str
        Name of the closing price column
    high_col : str
        Name of the high price column
    low_col : str
        Name of the low price column
    volume_col : str
        Name of the volume column. If it is absent from ``df`` the
        volume/price correlation feature is skipped.

    Returns:
    --------
    pandas.DataFrame
        Copy of ``df`` with additional statistical features. Rows containing
        a NaN in any column (rolling warm-up rows included) are dropped.
    """
    # Create a copy of the dataframe to avoid modifying the original
    df_features = df.copy()

    price = df[price_col]
    high = df[high_col]
    low = df[low_col]

    # One-period log returns do not depend on the window: compute them once.
    log_returns = np.log(price / price.shift(1))

    # Calculate features for each window size
    for window in windows:
        rolling_price = price.rolling(window=window)
        rolling_mean = rolling_price.mean()
        rolling_std = rolling_price.std()
        rolling_log_returns = log_returns.rolling(window=window)

        # Returns
        df_features[f'returns_{window}'] = price.pct_change(periods=window)

        # Log returns and statistics
        df_features[f'log_returns_{window}'] = rolling_log_returns.mean()
        df_features[f'log_returns_std_{window}'] = rolling_log_returns.std()

        # Statistical moments
        df_features[f'std_{window}'] = rolling_std
        df_features[f'var_{window}'] = rolling_price.var()
        df_features[f'skew_{window}'] = rolling_price.skew()
        df_features[f'kurt_{window}'] = rolling_price.kurt()

        # Volatility measures
        # NOTE(review): this annualises the std of overlapping window-period
        # returns with sqrt(252); kept as-is, confirm that is intended.
        df_features[f'realized_vol_{window}'] = (
            df_features[f'returns_{window}'].rolling(window=window).std()
            * np.sqrt(252))
        df_features[f'range_vol_{window}'] = (
            (high.rolling(window=window).max()
             - low.rolling(window=window).min()) / price)

        # Z-scores and normalized prices
        df_features[f'zscore_{window}'] = (price - rolling_mean) / rolling_std
        df_features[f'norm_price_{window}'] = price / rolling_mean - 1

        # Correlation features
        if volume_col in df.columns:
            df_features[f'volume_price_corr_{window}'] = (
                rolling_price.corr(df[volume_col]))
        df_features[f'high_low_corr_{window}'] = (
            high.rolling(window=window).corr(low))

        # Quantile features
        for q in [0.25, 0.75]:
            df_features[f'price_q{int(q*100)}_{window}'] = (
                rolling_price.quantile(q))

    # Price dynamics
    df_features['price_acceleration'] = price.diff().diff()
    df_features['momentum_change'] = price.pct_change().diff()

    # Advanced volatility (per-bar Parkinson estimator from the high/low ratio)
    df_features['parkinson_vol'] = np.sqrt(
        1 / (4 * np.log(2)) * (np.log(high / low) ** 2))

    # Efficiency ratio: net 20-bar move relative to the 20-bar high/low range
    df_features['price_efficiency'] = (
        abs(price - price.shift(20))
        / (high.rolling(20).max() - low.rolling(20).min())
    )

    # Deviation metrics
    # NOTE(review): despite the column name, the reference level is the
    # 20-bar simple moving average of the price, not a volume-weighted price.
    sma_20 = price.rolling(window=20).mean()
    df_features['deviation_from_vwap'] = (price - sma_20) / sma_20

    # Use the configured price column rather than a hard-coded 'close'.
    df_features['stock_return'] = price.pct_change()

    df_features = df_features.dropna()
    return df_features
|