switch from nn to xgb

MuslemRahimi 2024-10-02 16:23:29 +02:00
parent 699594cb64
commit 75b9cdc2b1
3 changed files with 40 additions and 106 deletions

@@ -332,19 +332,39 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
 async def warm_start_training(tickers, con):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    test_size = 0.2
     dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
-    df_train = pd.concat(dfs, ignore_index=True)
-    df_train = df_train.sample(frac=1).reset_index(drop=True)
+    train_list = []
+    test_list = []
+    for df in dfs:
+        try:
+            split_size = int(len(df) * (1 - test_size))
+            train_data = df.iloc[:split_size]
+            test_data = df.iloc[split_size:]
+            # Append to the lists
+            train_list.append(train_data)
+            test_list.append(test_data)
+        except:
+            pass
+    # Concatenate all at once outside the loop
+    df_train = pd.concat(train_list, ignore_index=True)
+    df_test = pd.concat(test_list, ignore_index=True)
     print('======Warm Start Train Set Datapoints======')
+    df_train = df_train.sample(frac=1).reset_index(drop=True)
     print(len(df_train))
     predictor = ScorePredictor()
     selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
     predictor.warm_start_training(df_train[selected_features], df_train['Target'])
-    predictor.evaluate_model(df_train[selected_features], df_train['Target'])
+    predictor.evaluate_model(df_test[selected_features], df_test['Target'])
     return predictor
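
Note on this hunk: because each ticker's rows arrive sorted by date, taking the first 80% per ticker as training data and the last 20% as test data holds the evaluation set out in time rather than sampling it at random, and only the training frame is shuffled afterwards. A minimal sketch of the same idea for a single frame, assuming rows are already sorted by date ascending (the helper name is mine, not from the commit):

    import pandas as pd

    def chronological_split(df: pd.DataFrame, test_size: float = 0.2):
        # Keep the earliest (1 - test_size) share for training and
        # the most recent test_size share for out-of-sample evaluation.
        split = int(len(df) * (1 - test_size))
        return df.iloc[:split], df.iloc[split:]

One caveat in the committed loop: the bare except hides any per-frame error, and if every frame fails, train_list stays empty and pd.concat([]) raises ValueError.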
@@ -369,7 +389,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
     data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
     if len(data) != 0:
-        if data['precision'] >= 50 and data['accuracy'] >= 50:
+        if data['precision'] >= 60 and data['accuracy'] >= 60 and data['accuracy'] < 100 and data['precision'] < 100:
             res = {'score': data['score']}
             await save_json(ticker, res)
             print(f"Saved results for {ticker}")
@@ -389,23 +409,23 @@ async def run():
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
         warm_start_symbols = [row[0] for row in cursor.fetchall()]
         print('Warm Start Training for:', warm_start_symbols)
         predictor = await warm_start_training(warm_start_symbols, con)
     else:
         # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
+        stock_symbols = ['GME'] #[row[0] for row in cursor.fetchall()]
         print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
         tasks = []
         for ticker in tqdm(stock_symbols):
-            tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date)
-        await asyncio.gather(*tasks)
+        #await asyncio.gather(*tasks)
     con.close()
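
With the gather commented out, tickers are now fine-tuned strictly one at a time, which keeps memory and database load predictable at the cost of wall-clock time. If some parallelism is ever wanted back, bounding it with a semaphore is the usual middle ground; a sketch assuming the existing fine_tune_and_evaluate signature, not part of the commit:

    import asyncio

    async def bounded_fine_tune(tickers, con, start_date, end_date, limit=5):
        sem = asyncio.Semaphore(limit)  # at most `limit` tickers in flight

        async def run_one(ticker):
            async with sem:
                await fine_tune_and_evaluate(ticker, con, start_date, end_date)

        await asyncio.gather(*(run_one(t) for t in tickers))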

@@ -5,18 +5,7 @@ from sklearn.ensemble import RandomForestClassifier
 import numpy as np
 from xgboost import XGBClassifier
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
-from keras.models import Sequential, Model
-from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
-from keras.optimizers import Adam
-from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
-from keras.models import load_model
-from sklearn.feature_selection import SelectKBest, f_classif
-from tensorflow.keras.backend import clear_session
-from keras import regularizers
-from keras.layers import Layer
-from tensorflow.keras import backend as K
+from sklearn.preprocessing import MinMaxScaler
 from tqdm import tqdm
 from collections import defaultdict
@@ -26,62 +15,11 @@ import aiofiles
 import pickle
 import time

-class SelfAttention(Layer):
-    def __init__(self, **kwargs):
-        super(SelfAttention, self).__init__(**kwargs)
-
-    def build(self, input_shape):
-        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
-                                 initializer='random_normal', trainable=True)
-        super(SelfAttention, self).build(input_shape)
-
-    def call(self, x):
-        # Alignment scores. Pass them through tanh function
-        e = K.tanh(K.dot(x, self.W))
-        # Remove dimension of size 1
-        e = K.squeeze(e, axis=-1)
-        # Compute the weights
-        alpha = K.softmax(e)
-        # Reshape to tensor of same shape as x for multiplication
-        alpha = K.expand_dims(alpha, axis=-1)
-        # Compute the context vector
-        context = x * alpha
-        context = K.sum(context, axis=1)
-        return context, alpha
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
-
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
         self.model = None
-        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
-
-    def build_model(self):
-        clear_session()
-        inputs = Input(shape=(231,))
-        x = Dense(128, activation='leaky_relu')(inputs)
-        x = BatchNormalization()(x)
-        x = Dropout(0.2)(x)
-        for units in [64,32,16]:
-            x = Dense(units, activation='leaky_relu')(x)
-            x = BatchNormalization()(x)
-            x = Dropout(0.2)(x)
-        x = Reshape((16, 1))(x)
-        x, _ = SelfAttention()(x)
-        outputs = Dense(2, activation='softmax')(x)
-        model = Model(inputs=inputs, outputs=outputs)
-        optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
-        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
-        return model
+        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
+        self.model = XGBClassifier(n_estimators=100, max_depth = 10, min_samples_split=5, random_state=42, n_jobs=10)

     def preprocess_data(self, X):
         X = np.where(np.isinf(X), np.nan, X)
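
One caveat on the new constructor: min_samples_split is a scikit-learn tree parameter, not an XGBoost one. XGBClassifier forwards unrecognised keyword arguments to the booster, which at best warns and ignores them. The closest native knob is min_child_weight; this mapping is my suggestion, not something the commit does:

    from xgboost import XGBClassifier

    # Equivalent intent using XGBoost-native parameters only.
    model = XGBClassifier(
        n_estimators=100,
        max_depth=10,
        min_child_weight=5,   # rough analogue of a minimum-split constraint
        random_state=42,
        n_jobs=10,
    )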
@@ -91,37 +29,24 @@ class ScorePredictor:
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        self.model = self.build_model()
-        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=30, min_lr=0.001)
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save(self.warm_start_model_path)
+        self.model.fit(X_train, y_train)
+        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
         print("Warm start model saved.")

     def fine_tune_model(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        #batch_size = min(64, max(16, len(X_train) // 10))
         if self.model is None:
-            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
+            with open(f'{self.warm_start_model_path}', 'rb') as f:
+                self.model = pickle.load(f)
-        #early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
-        #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.01)
-        self.model.fit(X_train, y_train, epochs=150, batch_size=16, validation_split=0.1)
+        self.model.fit(X_train, y_train)
         print("Model fine-tuned")

     def evaluate_model(self, X_test, y_test):
         X_test = self.preprocess_data(X_test)
         if self.model is None:
             raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")
-        test_predictions = self.model.predict(X_test)
+        test_predictions = self.model.predict_proba(X_test)
         class_1_probabilities = test_predictions[:, 1]
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
         #print(test_predictions)
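
The predict to predict_proba change is forced by the model swap: the old Keras softmax head returned an (n, 2) probability array from predict, whereas XGBClassifier.predict returns hard 0/1 labels, which would break the [:, 1] indexing. predict_proba restores the probability matrix, and thresholding at 0.5 reproduces predict's default labelling:

    # Assuming a fitted binary classifier, e.g. clf = XGBClassifier(...).fit(X, y):
    proba = clf.predict_proba(X_test)          # shape (n_samples, 2); rows sum to 1
    labels = (proba[:, 1] >= 0.5).astype(int)  # matches clf.predict(X_test) at the default threshold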
@@ -147,14 +72,3 @@ class ScorePredictor:
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
                 'score': score}
-
-    def feature_selection(self, X_train, y_train, k=100):
-        print('Feature selection:')
-        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
-        selector = SelectKBest(score_func=f_classif, k=k)
-        selector.fit(X_train, y_train)
-        selector.transform(X_train)
-        selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
-        return selected_features
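
A final note on persistence: pickling the whole XGBClassifier, as warm_start_training now does, ties the weights file to the installed xgboost version, and the open(...) call in the commit is never explicitly closed. XGBoost's native format is the version-stable alternative; a sketch of that option, assuming a fitted estimator named model (this is not what the commit does):

    from xgboost import XGBClassifier

    # Save in XGBoost's own format, portable across xgboost versions.
    model.save_model('warm_start_weights.json')   # path is illustrative

    restored = XGBClassifier()
    restored.load_model('warm_start_weights.json')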