# Stock direction prediction: technical-indicator features + Bidirectional LSTM classifier.
# Standard library
from datetime import datetime, timedelta

# Third-party
import pandas as pd
import yfinance as yf
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import LSTM, BatchNormalization, Bidirectional, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from xgboost import XGBClassifier

# NOTE(review): wildcard imports kept for compatibility with existing code
# (sma_indicator, rsi, stoch, macd, ... come from these); prefer explicit names.
from ta.momentum import *
from ta.trend import *
from ta.utils import *
from ta.volatility import *
from ta.volume import *
|
|
class StockPredictor:
    """Predict whether a ticker's close will be higher ``nth_day`` trading
    days from now, using technical-indicator features and a Bidirectional
    LSTM binary classifier.

    Intended pipeline (see ``__main__``):
        download_data -> generate_features -> preprocess_data ->
        (caller scales + reshapes) -> train_model -> evaluate_model ->
        predict_next_value
    """

    def __init__(self, ticker, start_date, end_date):
        """
        Args:
            ticker: symbol understood by Yahoo Finance, e.g. ``"AAPL"``.
            start_date: first date of history to download (datetime or str).
            end_date: last date of history to download (datetime or str).
        """
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
        self.nth_day = 60               # prediction horizon in trading days
        self.model = None               # set by train_model()
        self.horizons = [3, 5, 10, 15, 20]  # rolling windows for the features
        self.test_size = 0.2            # fraction of samples held out for testing

    def download_data(self):
        """Download daily OHLCV bars from Yahoo Finance.

        Returns:
            DataFrame indexed by (datetime-converted) trading date.
        """
        df_original = yf.download(self.ticker, start=self.start_date,
                                  end=self.end_date, interval="1d")
        df_original.index = pd.to_datetime(df_original.index)
        return df_original

    def preprocess_data(self, df):
        """Attach the binary target and drop rows that cannot be labeled.

        ``Target`` is 1 when the close ``nth_day`` rows ahead is higher than
        the current close.

        BUG FIX: the last ``nth_day`` rows have no future close, and
        ``NaN > x`` evaluates to False, so the original version silently
        labeled them 0 instead of dropping them — polluting the training
        labels. They are now removed explicitly before ``dropna``.

        Returns:
            A new DataFrame with the ``Target`` column and no NaN rows.
        """
        df['Target'] = (df['Close'].shift(-self.nth_day) > df['Close']).astype(int)
        # Remove the unlabelable tail, then any rows left NaN by the
        # rolling-window feature warm-up. .copy() avoids chained-assignment
        # warnings from the inplace dropna on a slice.
        df = df.iloc[:-self.nth_day].copy()
        df.dropna(inplace=True)
        return df

    def generate_features(self, df):
        """Compute technical-indicator columns in place for each horizon.

        Mutates ``df`` and returns the list of predictor column names.
        """
        new_predictors = []
        for horizon in self.horizons:
            rolling_averages = df.rolling(horizon).mean()

            # Price relative to its rolling mean.
            ratio_column = f"Close_Ratio_{horizon}"
            df[ratio_column] = df["Close"] / rolling_averages["Close"]
            new_predictors.append(ratio_column)

            # Momentum: percent change over the horizon.
            trend_column = f"Trend_{horizon}"
            df[trend_column] = df["Close"].pct_change(periods=horizon)
            new_predictors.append(trend_column)

            # Realized volatility: std of daily returns over the horizon.
            volatility_column = f"Volatility_{horizon}"
            df[volatility_column] = df["Close"].pct_change().rolling(horizon).std()
            new_predictors.append(volatility_column)

            # Mean daily return over the horizon.
            volatility_mean_column = f"Volatility_Mean_{horizon}"
            df[volatility_mean_column] = df["Close"].pct_change().rolling(horizon).mean()
            new_predictors.append(volatility_mean_column)

            # BUG FIX: SMA and EMA were computed but never added to the
            # predictor list in the original version; every other indicator
            # here is appended, so include them for consistency.
            sma_column = f"SMA_{horizon}"
            df[sma_column] = sma_indicator(df['Close'], window=horizon)
            new_predictors.append(sma_column)

            ema_column = f"EMA_{horizon}"
            df[ema_column] = ema_indicator(df['Close'], window=horizon)
            new_predictors.append(ema_column)

            rsi_column = f"RSI_{horizon}"
            df[rsi_column] = rsi(df["Close"], window=horizon)
            new_predictors.append(rsi_column)

            stoch_rsi_column = f"STOCH_RSI_{horizon}"
            df[stoch_rsi_column] = stochrsi_k(df['Close'], window=horizon, smooth1=3, smooth2=3)
            new_predictors.append(stoch_rsi_column)

            stoch_column = f"STOCH_{horizon}"
            df[stoch_column] = stoch(df['High'], df['Low'], df['Close'], window=horizon)
            new_predictors.append(stoch_column)

            roc_column = f"ROC_{horizon}"
            df[roc_column] = roc(df['Close'], window=horizon)
            new_predictors.append(roc_column)

            wma_column = f"WMA_{horizon}"
            df[wma_column] = wma_indicator(df['Close'], window=horizon)
            new_predictors.append(wma_column)

            atr_column = f"ATR_{horizon}"
            df[atr_column] = average_true_range(df['High'], df['Low'], df['Close'], window=horizon)
            new_predictors.append(atr_column)

            adx_column = f"ADX_{horizon}"
            df[adx_column] = adx(df['High'], df['Low'], df['Close'], window=horizon)
            new_predictors.append(adx_column)

            # Upper Bollinger band relative to price.
            bb_bands_column = f"BB_{horizon}"
            df[bb_bands_column] = bollinger_hband(df['Close'], window=horizon) / df['Close']
            new_predictors.append(bb_bands_column)

        # MACD family uses its own standard windows, so it sits outside the
        # horizon loop.
        df['macd'] = macd(df['Close'])
        df['macd_signal'] = macd_signal(df['Close'])
        df['macd_hist'] = 2 * macd_diff(df['Close'])
        new_predictors.append('macd')
        new_predictors.append('macd_signal')
        new_predictors.append('macd_hist')
        return new_predictors

    def feature_selection(self, df, predictors, k=5):
        """Return the ``k`` predictor names best ranked by ANOVA F-score.

        Args:
            df: DataFrame containing the predictor columns and ``Target``.
            predictors: candidate column names.
            k: number of features to keep (default 5, matching the original
               hard-coded value).
        """
        X = df[predictors]
        y = df['Target']

        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, y)
        # The original also called selector.transform(X) and discarded the
        # result; only the support mask is needed here.
        support = selector.get_support()
        return [col for col, keep in zip(X.columns, support) if keep]

    def build_lstm_model(self, input_shape):
        """Build a stacked Bidirectional-LSTM binary classifier.

        Args:
            input_shape: (timesteps, n_features) of one sample.

        Returns:
            (compiled keras model, [ReduceLROnPlateau, EarlyStopping]).
        """
        model = Sequential()
        model.add(Bidirectional(LSTM(units=1024, return_sequences=True,
                                     kernel_regularizer=l2(0.01)),
                                input_shape=input_shape))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(units=128, return_sequences=True,
                                     kernel_regularizer=l2(0.01))))
        model.add(BatchNormalization())
        model.add(Dropout(0.25))
        model.add(Bidirectional(LSTM(units=64, kernel_regularizer=l2(0.01))))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))
        # Sigmoid head -> probability of an up move.
        model.add(Dense(units=1, activation='sigmoid'))

        # BUG FIX: min_lr was 0.001 == the initial learning rate, so the
        # scheduler could never actually reduce it; give it real headroom.
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                      patience=5, min_lr=1e-5)
        early_stop = EarlyStopping(monitor='val_loss', patience=10,
                                   restore_best_weights=True)

        # BUG FIX: Adam(lr=...) is a removed/deprecated alias in modern
        # Keras; the supported keyword is learning_rate.
        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss='binary_crossentropy', metrics=['accuracy'])

        return model, [reduce_lr, early_stop]

    def train_model(self, X_train, y_train):
        """Build and fit the LSTM on (samples, timesteps, features) data.

        Stores the fitted model on ``self.model`` and returns the Keras
        History object (the original discarded it).
        """
        self.model, callbacks = self.build_lstm_model((X_train.shape[1], X_train.shape[2]))
        history = self.model.fit(X_train, y_train, epochs=500, batch_size=32,
                                 validation_split=0.1, callbacks=callbacks)
        return history

    def evaluate_model(self, X_test, y_test):
        """Predict on the held-out set and print classification metrics.

        BUG FIX: the original built a DataFrame from a module-global
        ``predictors`` name (NameError outside the ``__main__`` script) and
        then never used it; that dead code is removed.
        """
        probabilities = self.model.predict(X_test).ravel()
        test_predictions = (probabilities > 0.5).astype(int)
        print(test_predictions)

        # zero_division=0 keeps the report printable when one class is
        # never predicted (common with short test windows).
        test_precision = precision_score(y_test, test_predictions, zero_division=0)
        test_accuracy = accuracy_score(y_test, test_predictions)
        test_recall = recall_score(y_test, test_predictions, zero_division=0)
        test_f1 = f1_score(y_test, test_predictions, zero_division=0)
        # BUG FIX: ROC-AUC must be computed from scores/probabilities, not
        # thresholded 0/1 labels, or the curve collapses to one point.
        test_roc_auc = roc_auc_score(y_test, probabilities)

        print("Test Set Metrics:")
        print(f"Precision: {round(test_precision * 100)}%")
        print(f"Accuracy: {round(test_accuracy * 100)}%")
        print(f"Recall: {round(test_recall * 100)}%")
        print(f"F1-Score: {round(test_f1 * 100)}%")
        print(f"ROC-AUC: {round(test_roc_auc * 100)}%")

    def predict_next_value(self, df, predictors=None):
        """Print the model's direction call for the most recent sample.

        BUG FIXES: the original called ``self.model.predict_proba``, which
        Keras models do not provide, fed a raw pandas Series to an LSTM that
        expects 3-D input, and was invoked from ``__main__`` with a single
        argument despite requiring two.

        Two calling conventions are now supported (``predictors`` default
        keeps the signature backward compatible):
          * ``predict_next_value(features)`` — ``features`` is the already
            scaled latest sample of shape (timesteps, n_features), as built
            by the ``__main__`` pipeline (``X[-1]``).
          * ``predict_next_value(df, predictors)`` — extract the last row of
            the raw feature DataFrame.  NOTE(review): this path feeds
            *unscaled* features to a model trained on scaled inputs —
            callers should scale first; confirm before relying on it.
        """
        if predictors is None:
            # ``df`` is actually a prepared feature array for one sample.
            latest = pd.DataFrame(df).to_numpy(dtype=float).reshape(1, 1, -1)
            latest_date_index = None
        else:
            latest = df.iloc[-1][predictors].to_numpy(dtype=float).reshape(1, 1, -1)
            latest_date_index = df.index[-1]

        # Sigmoid output == probability of class 1 (close higher in nth_day days).
        probability_up = float(self.model.predict(latest)[0][0])
        prediction = int(probability_up > 0.5)

        print("Predicted next value:", prediction)
        print("Probability of predicted next value:",
              round((probability_up if prediction == 1 else 1.0 - probability_up) * 100, 2), "%")
        if latest_date_index is not None:
            # Calendar-day offset; NOTE(review): nth_day is in *trading*
            # days, so this date is approximate.
            next_prediction_date = latest_date_index + pd.DateOffset(days=self.nth_day)
            print("Corresponding date for the next prediction:", next_prediction_date)
|
|
|
|
|
if __name__ == "__main__":
    ticker = 'AAPL'
    start_date = datetime(2000, 1, 1)
    end_date = datetime.today()

    predictor = StockPredictor(ticker, start_date, end_date)
    df = predictor.download_data()

    # Generate features BEFORE labeling so the rolling-window warm-up NaNs
    # are dropped together with unlabelable rows in preprocess_data.
    predictors = predictor.generate_features(df)
    df = predictor.preprocess_data(df)

    X = df[predictors].values
    y = df['Target'].values

    # Scale every feature to [0, 1]; LSTMs train poorly on raw price scales.
    # NOTE(review): fitting the scaler on the full series leaks test-set
    # statistics into training — fit on X[:train_size] only to fix.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)

    # Keras LSTMs expect (samples, timesteps, features); one timestep here.
    X = X.reshape((X.shape[0], 1, X.shape[1]))

    # Chronological split (no shuffling) to avoid look-ahead leakage.
    train_size = int(len(X) * (1 - predictor.test_size))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    predictor.train_model(X_train, y_train)
    predictor.evaluate_model(X_test, y_test)
    # BUG FIX: the original passed a single array (X[-1]) to a method whose
    # signature requires (df, predictors), raising TypeError; call it with
    # the arguments it declares.
    predictor.predict_next_value(df, predictors)