diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 9cd0d77..7d277ef 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -332,19 +332,39 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
 
 async def warm_start_training(tickers, con):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
-
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    test_size = 0.2
+
     dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
-
-    df_train = pd.concat(dfs, ignore_index=True)
-    df_train = df_train.sample(frac=1).reset_index(drop=True)
+
+    train_list = []
+    test_list = []
+
+    for df in dfs:
+        try:
+            split_size = int(len(df) * (1 - test_size))
+            train_data = df.iloc[:split_size]
+            test_data = df.iloc[split_size:]
+
+            # Append to the lists
+            train_list.append(train_data)
+            test_list.append(test_data)
+        except:
+            pass
+
+    # Concatenate all at once outside the loop
+    df_train = pd.concat(train_list, ignore_index=True)
+    df_test = pd.concat(test_list, ignore_index=True)
 
     print('======Warm Start Train Set Datapoints======')
+    df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
     print(len(df_train))
     predictor = ScorePredictor()
     selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
     predictor.warm_start_training(df_train[selected_features], df_train['Target'])
-    predictor.evaluate_model(df_train[selected_features], df_train['Target'])
+    predictor.evaluate_model(df_test[selected_features], df_test['Target'])
 
     return predictor
 
@@ -369,7 +389,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
         data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
         if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50:
+            if data['precision'] >= 60 and data['accuracy'] >= 60 and data['accuracy'] < 100 and data['precision'] < 100:
                 res = {'score': data['score']}
                 await save_json(ticker, res)
                 print(f"Saved results for {ticker}")
@@ -389,23 +409,23 @@ async def run():
 
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
         warm_start_symbols = [row[0] for row in cursor.fetchall()]
        print('Warm Start Training for:', warm_start_symbols)
         predictor = await warm_start_training(warm_start_symbols, con)
     else:
         # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
+        stock_symbols = ['GME'] #[row[0] for row in cursor.fetchall()]
         print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
 
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
 
         tasks = []
         for ticker in tqdm(stock_symbols):
-            tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date)
 
-        await asyncio.gather(*tasks)
+        #await asyncio.gather(*tasks)
 
     con.close()
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 17b8cbc..170f05e 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
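The warm-start rework above stops shuffling one pooled frame and evaluating on the very rows the model was fit on; instead it holds out the most recent 20% of each ticker's history, so evaluate_model now scores the predictor on later, unseen data. A minimal sketch of that per-ticker chronological split, assuming each DataFrame in dfs is already sorted by date (the helper name is illustrative, not part of the codebase):

import pandas as pd

def chronological_split(dfs, test_size=0.2):
    # Hold out the most recent `test_size` fraction of each ticker's rows.
    train_list, test_list = [], []
    for df in dfs:
        split_size = int(len(df) * (1 - test_size))
        train_list.append(df.iloc[:split_size])   # earliest 80% -> train
        test_list.append(df.iloc[split_size:])    # latest 20%  -> test
    return (pd.concat(train_list, ignore_index=True),
            pd.concat(test_list, ignore_index=True))

Shuffling only df_train afterwards is safe because the holdout rows are already separated; shuffling before the split would leak future rows into training.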
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 2e9d1bf..3318a4b 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -5,18 +5,7 @@ from sklearn.ensemble import RandomForestClassifier
 import numpy as np
 from xgboost import XGBClassifier
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
-from keras.models import Sequential, Model
-from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
-from keras.optimizers import Adam
-from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
-from keras.models import load_model
-from sklearn.feature_selection import SelectKBest, f_classif
-from tensorflow.keras.backend import clear_session
-from keras import regularizers
-from keras.layers import Layer
-from tensorflow.keras import backend as K
+from sklearn.preprocessing import MinMaxScaler
 from tqdm import tqdm
 from collections import defaultdict
@@ -26,62 +15,11 @@ import aiofiles
 import pickle
 import time
 
-class SelfAttention(Layer):
-    def __init__(self, **kwargs):
-        super(SelfAttention, self).__init__(**kwargs)
-
-    def build(self, input_shape):
-        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
-                                 initializer='random_normal', trainable=True)
-        super(SelfAttention, self).build(input_shape)
-
-    def call(self, x):
-        # Alignment scores. Pass them through tanh function
-        e = K.tanh(K.dot(x, self.W))
-        # Remove dimension of size 1
-        e = K.squeeze(e, axis=-1)
-        # Compute the weights
-        alpha = K.softmax(e)
-        # Reshape to tensor of same shape as x for multiplication
-        alpha = K.expand_dims(alpha, axis=-1)
-        # Compute the context vector
-        context = x * alpha
-        context = K.sum(context, axis=1)
-        return context, alpha
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
-
-
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
-        self.model = None
-        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
-
-    def build_model(self):
-        clear_session()
-
-        inputs = Input(shape=(231,))
-
-        x = Dense(128, activation='leaky_relu')(inputs)
-        x = BatchNormalization()(x)
-        x = Dropout(0.2)(x)
-
-        for units in [64,32,16]:
-            x = Dense(units, activation='leaky_relu')(x)
-            x = BatchNormalization()(x)
-            x = Dropout(0.2)(x)
-
-        x = Reshape((16, 1))(x)
-        x, _ = SelfAttention()(x)
-        outputs = Dense(2, activation='softmax')(x)
-
-        model = Model(inputs=inputs, outputs=outputs)
-        optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
-        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
-
-        return model
+        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
+        self.model = XGBClassifier(n_estimators=100, max_depth = 10, min_samples_split=5, random_state=42, n_jobs=10)
 
     def preprocess_data(self, X):
         X = np.where(np.isinf(X), np.nan, X)
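One caveat in the new constructor: min_samples_split is a scikit-learn tree parameter (RandomForestClassifier, still imported above, accepts it), not an XGBoost one, so XGBClassifier will not use it, and recent xgboost releases log a warning about unused parameters. If the intent is to stop splitting sparsely populated nodes, min_child_weight is the nearest XGBoost knob; a hedged sketch under that assumption:

from xgboost import XGBClassifier

# Assumption: min_child_weight=5 stands in for the scikit-learn-style
# min_samples_split=5, which XGBoost does not recognise.
model = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    min_child_weight=5,
    random_state=42,
    n_jobs=10,
)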
@@ -91,37 +29,24 @@ class ScorePredictor:
 
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        self.model = self.build_model()
-
-        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=30, min_lr=0.001)
-
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save(self.warm_start_model_path)
+        self.model.fit(X_train, y_train)
+        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
         print("Warm start model saved.")
 
     def fine_tune_model(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        #batch_size = min(64, max(16, len(X_train) // 10))
-        if self.model is None:
-            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
-
-        #early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
-        #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.01)
+        with open(f'{self.warm_start_model_path}', 'rb') as f:
+            self.model = pickle.load(f)
 
-        self.model.fit(X_train, y_train, epochs=150, batch_size=16, validation_split=0.1)
+        self.model.fit(X_train, y_train)
         print("Model fine-tuned")
 
     def evaluate_model(self, X_test, y_test):
         X_test = self.preprocess_data(X_test)
-        if self.model is None:
-            raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")
-
-        test_predictions = self.model.predict(X_test)
+        test_predictions = self.model.predict_proba(X_test)
         class_1_probabilities = test_predictions[:, 1]
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
         #print(test_predictions)
@@ -146,15 +71,4 @@
 
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
-                'score': score}
-
-    def feature_selection(self, X_train, y_train, k=100):
-        print('Feature selection:')
-        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
-        selector = SelectKBest(score_func=f_classif, k=k)
-        selector.fit(X_train, y_train)
-
-        selector.transform(X_train)
-        selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
-
-        return selected_features
\ No newline at end of file
+                'score': score}
\ No newline at end of file
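Note that fine_tune_model as rewritten loads the pickled warm-start model and then calls fit on the per-ticker data, which retrains the booster from scratch; unlike the previous Keras flow, the scikit-learn-style fit does not continue from existing trees by default, so the warm start reuses the stored hyperparameters rather than the learned weights. If continued boosting is actually wanted, xgboost's sklearn wrapper accepts a prior model through the xgb_model argument of fit. A minimal sketch of that variant (the fine_tune helper name is illustrative; the path matches the diff):

import pickle
from xgboost import XGBClassifier

def fine_tune(X_train, y_train, path='ml_models/weights/ai-score/warm_start_weights.pkl'):
    # Load the classifier saved by warm_start_training.
    with open(path, 'rb') as f:
        base = pickle.load(f)
    model = XGBClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=10)
    # xgb_model continues boosting from the existing trees instead of refitting.
    model.fit(X_train, y_train, xgb_model=base.get_booster())
    return model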