update model back to lgm classifier

This commit is contained in:
parent 0bdc818d6b
commit 3f99870301
@@ -190,8 +190,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading, save_data):
 
     # Compute combinations for each group of columns
     #compute_column_ratios(fundamental_columns, df_combined, new_columns)
-    compute_column_ratios(stats_columns, df_combined, new_columns)
-    compute_column_ratios(ta_columns, df_combined, new_columns)
+    #compute_column_ratios(stats_columns, df_combined, new_columns)
+    #compute_column_ratios(ta_columns, df_combined, new_columns)
 
     # Concatenate the new ratio columns with the original DataFrame
     df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
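With this change all three `compute_column_ratios` calls are commented out, so no ratio features are appended before the `pd.concat`. The helper itself is defined outside this diff; a minimal sketch of what it plausibly does, assuming it writes pairwise ratio columns into the `new_columns` dict, would be:

import numpy as np

def compute_column_ratios(columns, df, new_columns):
    # Hypothetical reconstruction; the real helper lives elsewhere in the file
    # and may differ. Builds a ratio feature for each pair of columns.
    for i, a in enumerate(columns):
        for b in columns[i + 1:]:
            denom = df[b].replace(0, np.nan)  # avoid division by zero
            new_columns[f"{a}_to_{b}"] = df[a] / denom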
@@ -242,7 +242,7 @@ async def warm_start_training(tickers, con, skip_downloading, save_data):
     end_date = datetime.today().strftime("%Y-%m-%d")
     test_size = 0.2
 
-    dfs = await chunked_gather(tickers, con, start_date, end_date, skip_downloading, save_data, chunk_size=300)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, skip_downloading, save_data, chunk_size=100)
 
     train_list = []
     test_list = []
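`chunk_size` drops from 300 to 100, so each batch of concurrent downloads is a third the size: slower overall, but with a lower peak in open connections and memory. `chunked_gather` is not part of this diff; a rough sketch, under the assumption that it batches `download_data` coroutines per chunk, might look like:

import asyncio

async def chunked_gather(tickers, con, start_date, end_date,
                         skip_downloading, save_data, chunk_size=100):
    # Hypothetical reconstruction: only chunk_size downloads are in
    # flight at once, which bounds peak memory and connection count.
    results = []
    for i in range(0, len(tickers), chunk_size):
        chunk = tickers[i:i + chunk_size]
        results.extend(await asyncio.gather(
            *(download_data(t, con, start_date, end_date,
                            skip_downloading, save_data) for t in chunk)
        ))
    return results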
@@ -307,6 +307,9 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading, save_data):
     except Exception as e:
         print(f"Error processing {ticker}: {e}")
+    finally:
+        gc.collect()  # Force the garbage collector to release unreferenced memory
+
 
 async def run():
     train_mode = False  # Set this to False for fine-tuning and evaluation
     skip_downloading = False
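The added `finally:` block makes `gc.collect()` run after every ticker, whether the body succeeded or raised, which keeps a long fine-tuning loop from accumulating dead DataFrames. The pattern in isolation:

import gc

def process_one(ticker):
    try:
        ...  # per-ticker fine-tune/evaluate work goes here
    except Exception as e:
        print(f"Error processing {ticker}: {e}")
    finally:
        gc.collect()  # runs on success and on failure alike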
@@ -318,9 +321,10 @@ async def run():
 
     if train_mode:
         # Warm start training
-        warm_start_symbols = list(set(['CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
-        print('Warm Start Training for:', warm_start_symbols)
-        predictor = await warm_start_training(warm_start_symbols, con, skip_downloading, save_data)
+        stock_symbols = cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'") #list(set(['CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
+        stock_symbols = [row[0] for row in cursor.fetchall()]
+        print('Training for:', stock_symbols)
+        predictor = await warm_start_training(stock_symbols, con, skip_downloading, save_data)
     else:
         # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'")
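The warm-start branch now pulls its training universe from the database instead of the hard-coded ticker list. Note that `cursor.execute(...)` returns the cursor itself, so the first `stock_symbols` assignment is immediately overwritten by the `fetchall()` comprehension; only the second one matters. The idiomatic sqlite3 pattern, standalone (database path assumed for illustration):

import sqlite3

con = sqlite3.connect("stocks.db")  # hypothetical path
cursor = con.cursor()
cursor.execute(
    "SELECT DISTINCT symbol FROM stocks "
    "WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'"
)
stock_symbols = [row[0] for row in cursor.fetchall()]  # unpack 1-tuples

The remaining hunks are from the model-definition module itself (file paths are not preserved in this view).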
@@ -1,25 +1,10 @@
-import yfinance as yf
 import pandas as pd
 from datetime import datetime, timedelta
-from sklearn.ensemble import RandomForestClassifier
 import numpy as np
-from xgboost import XGBClassifier
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
-from keras.models import Sequential, Model
-from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
-from keras.optimizers import AdamW
-from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
-from tensorflow.keras.activations import gelu
-from keras.models import load_model
-from sklearn.feature_selection import SelectKBest, f_classif
-from tensorflow.keras.backend import clear_session
-from keras import regularizers
-from keras.layers import Layer
-from tensorflow.keras import backend as K
-import tensorflow as tf
+from sklearn.preprocessing import MinMaxScaler
 from sklearn.decomposition import PCA
+import lightgbm as lgb
 
 from tqdm import tqdm
 from collections import defaultdict
@@ -28,85 +13,39 @@ import aiohttp
 import aiofiles
 import pickle
 import time
+import os
 
-class SelfAttention(Layer):
-    def __init__(self, **kwargs):
-        super(SelfAttention, self).__init__(**kwargs)
-
-    def build(self, input_shape):
-        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
-                                 initializer='random_normal', trainable=True)
-        super(SelfAttention, self).build(input_shape)
-
-    def call(self, x):
-        # Alignment scores. Pass them through tanh function
-        e = K.tanh(K.dot(x, self.W))
-        # Remove dimension of size 1
-        e = K.squeeze(e, axis=-1)
-        # Compute the weights
-        alpha = K.softmax(e)
-        # Reshape to tensor of same shape as x for multiplication
-        alpha = K.expand_dims(alpha, axis=-1)
-        # Compute the context vector
-        context = x * alpha
-        context = K.sum(context, axis=1)
-        return context, alpha
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
-
-
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
-        self.model = None
-        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
-        self.pca = PCA(n_components=3)
-
-    def build_model(self):
-        clear_session()
-
-        inputs = Input(shape=(3,))
-        x = Dense(512, activation=gelu)(inputs) # Using GELU activation
-        x = Dropout(0.5)(x)
-        x = BatchNormalization()(x)
-
-        for units in [64, 32]:
-            x = Dense(units, activation=gelu)(x) # Using GELU activation
-            x = Dropout(0.2)(x)
-            x = BatchNormalization()(x)
-
-        x = Reshape((32, 1))(x)
-        x, _ = SelfAttention()(x)
-        outputs = Dense(2, activation='softmax')(x)
-
-        model = Model(inputs=inputs, outputs=outputs)
-        optimizer = AdamW(learning_rate=0.001, weight_decay=0.01, clipnorm=1.0)
-        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
-
-        return model
+        self.model = lgb.LGBMClassifier(
+            n_estimators=1_000,
+            learning_rate=0.001,
+            max_depth=10,
+            num_leaves=2**10-1,
+            n_jobs=10
+        )
+        self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'
+        #self.pca = PCA(n_components=3)
 
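`build_model` and the custom `SelfAttention` layer are removed entirely; the classifier is now configured once in `__init__`. With `max_depth=10`, a binary tree can hold at most 2**10 = 1024 leaves, so `num_leaves=2**10-1` makes depth the effective constraint, and the very small `learning_rate` of 0.001 over 1,000 rounds makes this a deliberately slow-learning ensemble (the committed configuration, not a recommendation). The same configuration standalone:

import lightgbm as lgb

model = lgb.LGBMClassifier(
    n_estimators=1_000,    # number of boosting rounds
    learning_rate=0.001,   # very small shrinkage per round
    max_depth=10,          # hard depth cap per tree
    num_leaves=2**10 - 1,  # 1023 leaves, just under the 1024 max at depth 10
    n_jobs=10,             # training threads
)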
     def preprocess_train_data(self, X):
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
         X = self.scaler.fit_transform(X)
-        return self.pca.fit_transform(X)
+        return X #self.pca.fit_transform(X)
 
     def preprocess_test_data(self, X):
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
         X = self.scaler.fit_transform(X)
-        return self.pca.fit_transform(X)
+        return X #self.pca.fit_transform(X)
 
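Both preprocessing methods now return the scaled matrix directly, with PCA commented out. One carry-over worth flagging: `preprocess_test_data` still calls `fit_transform`, which refits the `MinMaxScaler` on test data (mild leakage). A leakage-free variant for comparison, assuming the scaler was already fit in `preprocess_train_data`, would be:

def preprocess_test_data(self, X):
    X = np.where(np.isinf(X), np.nan, X)
    X = np.nan_to_num(X)
    # transform() reuses the min/max fit on the training set,
    # instead of refitting on test data as the committed code does
    return self.scaler.transform(X)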
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_train_data(X_train)
-        self.model = self.build_model()
-
-        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.001)
-
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=256, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save(self.warm_start_model_path)
+        self.model.fit(X_train, y_train)
+        pickle.dump(self.model, open(self.warm_start_model_path, 'wb'))
         print("Warm start model saved.")
 
     def fine_tune_model(self, X_train, y_train):
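Training collapses from the Keras checkpoint/early-stopping loop to a single `fit` plus a pickle dump. The committed line passes an unclosed file handle to `pickle.dump`; a context manager is the tidier equivalent of the same save/load round trip (`model` here stands for the fitted `LGBMClassifier`):

import pickle

path = "ml_models/weights/ai-score/stacking_weights.pkl"

with open(path, "wb") as f:   # handle is closed on exit, unlike inline open()
    pickle.dump(model, f)

with open(path, "rb") as f:
    model = pickle.load(f)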
@@ -124,10 +63,10 @@ class ScorePredictor:
     def evaluate_model(self, X_test, y_test):
         X_test = self.preprocess_test_data(X_test)
 
-        with tf.device('/CPU:0'):
-            # Load model and make predictions
-            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
-            test_predictions = self.model.predict(X_test)
+        with open(self.warm_start_model_path, 'rb') as f:
+            self.model = pickle.load(f)
+
+        test_predictions = self.model.predict_proba(X_test)
         class_1_probabilities = test_predictions[:, 1]
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
 
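`predict_proba` returns an array of shape `(n_samples, 2)`; column 1 is the probability of class 1, thresholded at 0.5 to recover hard labels. Given the metrics imported at the top of the file, evaluation presumably proceeds along these lines (`model`, `X_test`, `y_test` assumed in scope):

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

proba = model.predict_proba(X_test)           # shape (n_samples, 2)
class_1_probabilities = proba[:, 1]           # P(y == 1) per sample
binary_predictions = (class_1_probabilities >= 0.5).astype(int)

print("accuracy:", accuracy_score(y_test, binary_predictions))
print("precision:", precision_score(y_test, binary_predictions))
print("recall:", recall_score(y_test, binary_predictions))
print("f1:", f1_score(y_test, binary_predictions))
print("roc_auc:", roc_auc_score(y_test, class_1_probabilities))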