From 75b9cdc2b116847f6de9a7007399c8f5f77430e0 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Wed, 2 Oct 2024 16:23:29 +0200
Subject: [PATCH] switch from nn to xgb

---
 app/cron_ai_score.py                          |  40 +++++--
 .../__pycache__/score_model.cpython-310.pyc   | Bin 6697 -> 3079 bytes
 app/ml_models/score_model.py                  | 106 ++----------------
 3 files changed, 40 insertions(+), 106 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 9cd0d77..7d277ef 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -332,19 +332,39 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
 async def warm_start_training(tickers, con):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
-
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    test_size = 0.2
+
     dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
-
-    df_train = pd.concat(dfs, ignore_index=True)
-    df_train = df_train.sample(frac=1).reset_index(drop=True)
+
+    train_list = []
+    test_list = []
+
+    for df in dfs:
+        try:
+            split_size = int(len(df) * (1 - test_size))
+            train_data = df.iloc[:split_size]
+            test_data = df.iloc[split_size:]
+
+            # Append to the lists
+            train_list.append(train_data)
+            test_list.append(test_data)
+        except:
+            pass
+
+    # Concatenate all at once outside the loop
+    df_train = pd.concat(train_list, ignore_index=True)
+    df_test = pd.concat(test_list, ignore_index=True)
 
     print('======Warm Start Train Set Datapoints======')
+    df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
     print(len(df_train))
     predictor = ScorePredictor()
     selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
     predictor.warm_start_training(df_train[selected_features], df_train['Target'])
-    predictor.evaluate_model(df_train[selected_features], df_train['Target'])
+    predictor.evaluate_model(df_test[selected_features], df_test['Target'])
 
     return predictor
@@ -369,7 +389,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
     data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
     if len(data) != 0:
-        if data['precision'] >= 50 and data['accuracy'] >= 50:
+        if data['precision'] >= 60 and data['accuracy'] >= 60 and data['accuracy'] < 100 and data['precision'] < 100:
             res = {'score': data['score']}
             await save_json(ticker, res)
             print(f"Saved results for {ticker}")
@@ -389,23 +409,23 @@ async def run():
 
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
         warm_start_symbols = [row[0] for row in cursor.fetchall()]
         print('Warm Start Training for:', warm_start_symbols)
         predictor = await warm_start_training(warm_start_symbols, con)
     else:
         # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
+        stock_symbols = ['GME'] #[row[0] for row in cursor.fetchall()]
         print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
 
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
 
         tasks = []
         for ticker in tqdm(stock_symbols):
-            tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date)
 
-        await asyncio.gather(*tasks)
+        #await asyncio.gather(*tasks)
 
     con.close()
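
Note on the hunk above: the warm-start path now splits each ticker's frame chronologically (first 80% of rows train, last 20% test) before pooling, so evaluation runs on each ticker's most recent data instead of on the same shuffled rows the model saw in training. A minimal standalone sketch of that split, assuming each frame is sorted by date ascending; an explicit empty-frame guard stands in for the patch's bare `except: pass`:

    import pandas as pd

    def chronological_split(dfs, test_size=0.2):
        """Per-ticker time split: oldest rows train, newest rows test.

        Assumes each frame in `dfs` is sorted by date, oldest first.
        """
        train_list, test_list = [], []
        for df in dfs:
            if df is None or df.empty:
                continue  # skip tickers with no history instead of a bare except
            split_size = int(len(df) * (1 - test_size))
            train_list.append(df.iloc[:split_size])
            test_list.append(df.iloc[split_size:])
        df_train = pd.concat(train_list, ignore_index=True)
        df_test = pd.concat(test_list, ignore_index=True)
        # Shuffle only the training rows; the test rows stay chronological.
        return df_train.sample(frac=1).reset_index(drop=True), df_test
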
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 17b8cbc3b33325400efb1bc4b73354bc9bea3db4..170f05e04154dfac118e03c1b8ac01b23fe4eec2 100644
GIT binary patch
[base85-encoded binary deltas omitted; this is committed compiled bytecode tracking the score_model.py changes below]
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 2e9d1bf..3318a4b 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -5,18 +5,7 @@ from sklearn.ensemble import RandomForestClassifier
 import numpy as np
 from xgboost import XGBClassifier
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
-from keras.models import Sequential, Model
-from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
-from keras.optimizers import Adam
-from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
-from keras.models import load_model
-from sklearn.feature_selection import SelectKBest, f_classif
-from tensorflow.keras.backend import clear_session
-from keras import regularizers
-from keras.layers import Layer
-from tensorflow.keras import backend as K
+from sklearn.preprocessing import MinMaxScaler
 from tqdm import tqdm
 from collections import defaultdict
 
@@ -26,62 +15,11 @@ import aiofiles
 import pickle
 import time
 
-class SelfAttention(Layer):
-    def __init__(self, **kwargs):
-        super(SelfAttention, self).__init__(**kwargs)
-
-    def build(self, input_shape):
-        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
-                                 initializer='random_normal', trainable=True)
-        super(SelfAttention, self).build(input_shape)
-
-    def call(self, x):
-        # Alignment scores. Pass them through tanh function
-        e = K.tanh(K.dot(x, self.W))
-        # Remove dimension of size 1
-        e = K.squeeze(e, axis=-1)
-        # Compute the weights
-        alpha = K.softmax(e)
-        # Reshape to tensor of same shape as x for multiplication
-        alpha = K.expand_dims(alpha, axis=-1)
-        # Compute the context vector
-        context = x * alpha
-        context = K.sum(context, axis=1)
-        return context, alpha
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
-
-
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
-        self.model = None
-        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
-
-    def build_model(self):
-        clear_session()
-
-        inputs = Input(shape=(231,))
-
-        x = Dense(128, activation='leaky_relu')(inputs)
-        x = BatchNormalization()(x)
-        x = Dropout(0.2)(x)
-
-        for units in [64,32,16]:
-            x = Dense(units, activation='leaky_relu')(x)
-            x = BatchNormalization()(x)
-            x = Dropout(0.2)(x)
-
-        x = Reshape((16, 1))(x)
-        x, _ = SelfAttention()(x)
-        outputs = Dense(2, activation='softmax')(x)
-
-        model = Model(inputs=inputs, outputs=outputs)
-        optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
-        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
-
-        return model
+        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
+        self.model = XGBClassifier(n_estimators=100, max_depth = 10, min_samples_split=5, random_state=42, n_jobs=10)
 
     def preprocess_data(self, X):
         X = np.where(np.isinf(X), np.nan, X)
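
One caveat on the new constructor above: `min_samples_split` is a scikit-learn tree parameter, not an XGBoost one, so `XGBClassifier` does not consume it (recent XGBoost releases warn that the parameter is not used). A hedged equivalent using native XGBoost parameters, with `min_child_weight` as the closest, though not identical, analogue:

    from xgboost import XGBClassifier

    # Sketch: the same configuration expressed with parameters XGBoost actually
    # uses. min_child_weight limits the minimum hessian sum per leaf, the nearest
    # XGBoost analogue to sklearn's min_samples_split (an assumption about the
    # author's intent: constraining tree growth on small node populations).
    model = XGBClassifier(
        n_estimators=100,
        max_depth=10,
        min_child_weight=5,  # replaces the unused min_samples_split=5
        random_state=42,
        n_jobs=10,
    )
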
@@ -91,37 +29,24 @@ class ScorePredictor:
 
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        self.model = self.build_model()
-
-        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=30, min_lr=0.001)
-
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save(self.warm_start_model_path)
+        self.model.fit(X_train, y_train)
+        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
         print("Warm start model saved.")
 
     def fine_tune_model(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        #batch_size = min(64, max(16, len(X_train) // 10))
-        if self.model is None:
-            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
-
-        #early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
-        #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.01)
+        with open(f'{self.warm_start_model_path}', 'rb') as f:
+            self.model = pickle.load(f)
 
-        self.model.fit(X_train, y_train, epochs=150, batch_size=16, validation_split=0.1)
+        self.model.fit(X_train, y_train)
         print("Model fine-tuned")
 
     def evaluate_model(self, X_test, y_test):
         X_test = self.preprocess_data(X_test)
-        if self.model is None:
-            raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")
-
-        test_predictions = self.model.predict(X_test)
+        test_predictions = self.model.predict_proba(X_test)
         class_1_probabilities = test_predictions[:, 1]
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
         #print(test_predictions)
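
Worth flagging in the hunk above: unlike the Keras flow it replaces, `XGBClassifier.fit` always trains from scratch, so `fine_tune_model` as written discards the unpickled warm-start weights the moment `fit` runs and refits on the fine-tuning data alone. To actually continue training from the saved booster, the sklearn wrapper's `fit` accepts an `xgb_model` argument; a minimal sketch with placeholder data:

    import pickle
    import numpy as np
    from xgboost import XGBClassifier

    # Placeholder data: 231 feature columns, matching the production frames.
    X_train = np.random.rand(200, 231)
    y_train = np.random.randint(0, 2, 200)

    with open('ml_models/weights/ai-score/warm_start_weights.pkl', 'rb') as f:
        warm_model = pickle.load(f)

    fine_model = XGBClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=10)
    # xgb_model takes the previously fitted booster; training then adds new
    # trees on top of it instead of starting over.
    fine_model.fit(X_train, y_train, xgb_model=warm_model.get_booster())
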
@@ -146,15 +71,4 @@ class ScorePredictor:
 
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
-                'score': score}
-
-    def feature_selection(self, X_train, y_train, k=100):
-        print('Feature selection:')
-        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
-        selector = SelectKBest(score_func=f_classif, k=k)
-        selector.fit(X_train, y_train)
-
-        selector.transform(X_train)
-        selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
-
-        return selected_features
\ No newline at end of file
+                'score': score}
\ No newline at end of file
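
For reference, the end-to-end flow this patch leaves in place, as a rough usage sketch with synthetic data; it assumes `app/` is on `sys.path` so `ml_models.score_model` imports, and stands in 231 random columns for the real engineered features:

    import numpy as np
    import pandas as pd
    from ml_models.score_model import ScorePredictor

    rng = np.random.default_rng(42)
    X = pd.DataFrame(rng.random((500, 231)))                # placeholder feature frame
    y = pd.Series(rng.integers(0, 2, 500), name='Target')   # placeholder binary labels

    predictor = ScorePredictor()
    predictor.warm_start_training(X.iloc[:400], y.iloc[:400])  # fit + pickle the base model
    predictor.fine_tune_model(X.iloc[:400], y.iloc[:400])      # reload the pickle, refit
    metrics = predictor.evaluate_model(X.iloc[400:], y.iloc[400:])
    print(metrics)  # e.g. {'accuracy': ..., 'precision': ..., 'score': ...}
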