From 5a220c85dd6990baac7be7f89cbb57c0c0769591 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Mon, 30 Sep 2024 14:23:09 +0200
Subject: [PATCH] bugfixing ai model

---
 app/cron_ai_score.py                          | 64 ++++++++---------
 .../__pycache__/score_model.cpython-310.pyc   | Bin 4941 -> 5176 bytes
 app/ml_models/score_model.py                  | 65 ++++++++++++------
 3 files changed, 78 insertions(+), 51 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index f039203..55b1986 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -22,7 +22,7 @@ import gc
 gc.enable()
 
 async def save_json(symbol, data):
-    with open(f"json/ai-score/{symbol}.json", 'wb') as file:
+    with open(f"json/ai-score/companies/{symbol}.json", 'wb') as file:
         file.write(orjson.dumps(data))
 
 
@@ -317,23 +317,35 @@ async def process_symbol(ticker, con, start_date, end_date):
         split_size = int(len(df) * (1-test_size))
         test_data = df.iloc[split_size:]
         best_features = [col for col in df.columns if col not in ['date','price','Target']]
-        data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target'])
-
-        print(data)
-        '''
-        output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target}
-                       for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])]
-        '''
-        #print(output_list)
+        data = predictor.evaluate_model(test_data[best_features], test_data['Target'])
 
         if len(data) != 0:
             if data['precision'] >= 50 and data['accuracy'] >= 50:
-                await save_json(ticker, data)
+                res = {'score': data['score']}
+                await save_json(ticker, res)
 
     except Exception as e:
         print(e)
 
+async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
+    # Helper function to divide the tickers into chunks
+    def chunks(lst, size):
+        for i in range(0, len(lst), size):
+            yield lst[i:i+size]
+
+    results = []
+
+    for chunk in chunks(tickers, chunk_size):
+        # Create tasks for each chunk
+        tasks = [download_data(ticker, con, start_date, end_date) for ticker in chunk]
+        # Await the results for the current chunk
+        chunk_results = await asyncio.gather(*tasks)
+        # Accumulate the results
+        results.extend(chunk_results)
+
+    return results
+
 #Train mode
 async def train_process(tickers, con):
     tickers = list(set(tickers))
@@ -345,8 +357,8 @@
     df_train = pd.DataFrame()
     df_test = pd.DataFrame()
 
-    tasks = [download_data(ticker, con, start_date, end_date) for ticker in tickers]
-    dfs = await asyncio.gather(*tasks)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
+
     for df in dfs:
         try:
             split_size = int(len(df) * (1-test_size))
@@ -373,17 +385,6 @@
     predictor.train_model(df_train[selected_features], df_train['Target'])
     predictor.evaluate_model(df_test[best_features], df_test['Target'])
 
-async def test_process(con):
-    test_size = 0.2
-    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-    end_date = datetime.today().strftime("%Y-%m-%d")
-    predictor = ScorePredictor()
-    df = await download_data('GME', con, start_date, end_date)
-    split_size = int(len(df) * (1-test_size))
-    test_data = df.iloc[split_size:]
-    selected_features = [col for col in test_data if col not in ['price','date','Target']]
-    predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-
 
 async def run():
@@ -393,21 +394,22 @@
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'")
-    stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()]
+    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
+    stock_symbols = [row[0] for row in cursor.fetchall()] #['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR']
     stock_symbols = list(set(stock_symbols))
     print('Number of Stocks')
     print(len(stock_symbols))
-    #await train_process(stock_symbols, con)
+    await train_process(stock_symbols, con)
+
+
+
     #Prediction Steps for all stock symbols
-
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
-    stock_symbols = [row[0] for row in cursor.fetchall()]
+    #cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
+    #stock_symbols = [row[0] for row in cursor.fetchall()]
+    total_symbols = stock_symbols
 
-    total_symbols = ['GME'] #stock_symbols
-
     print(f"Total tickers: {len(total_symbols)}")
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")

diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 052553596b726e63a9b16bf2fd8f03c99ca10e8a..bc1b977eed4ca5c2acc2355e0c5003e3837df8a1 100644
GIT binary patch
delta 1248
[base85-encoded binary delta omitted]

diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index ced08bb..6161b9c 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -41,12 +41,12 @@ class ScorePredictor:
         inputs = Input(shape=(2139,))
 
         # First dense layer
-        x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
+        x = Dense(2048, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
         x = Dropout(0.3)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
-        for units in [512,256, 256]:
+        for units in [1024,512, 256, 256]:
             x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
             x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
@@ -64,17 +64,17 @@ class ScorePredictor:
         # Global average pooling
         x = GlobalAveragePooling1D()(x)
 
-        # Output layer
-        outputs = Dense(1, activation='sigmoid')(x)
+        # Output layer (for class probabilities)
+        outputs = Dense(2, activation='softmax')(x)  # Two neurons for class probabilities with softmax
 
         # Create the model
         model = Model(inputs=inputs, outputs=outputs)
 
         # Optimizer with a lower learning rate
-        optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
+        optimizer = Adam(learning_rate=0.001, clipnorm=1.0)
 
         # Compile the model
-        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
+        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
 
         return model
 
@@ -92,38 +92,63 @@ class ScorePredictor:
         X_train = self.preprocess_data(X_train)
         #X_train = self.reshape_for_lstm(X_train)
 
-        checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
+        checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras',
                                      save_best_only=True, save_freq = 1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
+        early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=80, min_lr=0.00001)
 
         self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save('ml_models/weights/fundamental_weights/weights.keras')
+        self.model.save('ml_models/weights/ai-score/weights.keras')
 
     def evaluate_model(self, X_test, y_test):
+        # Preprocess the test data
         X_test = self.preprocess_data(X_test)
-        X_test = self.reshape_for_lstm(X_test)
+        #X_test = self.reshape_for_lstm(X_test)
 
-        self.model = load_model('ml_models/weights/fundamental_weights/weights.keras')
+        # Load the trained model
+        self.model = load_model('ml_models/weights/ai-score/weights.keras')
 
-        test_predictions = self.model.predict(X_test).flatten()
+        # Get the model's predictions
+        test_predictions = self.model.predict(X_test)
+        #print(test_predictions)
+
+        # Extract the probabilities for class 1 (index 1 in the softmax output)
+        class_1_probabilities = test_predictions[:, 1]
 
+        # Convert probabilities to binary predictions using a threshold of 0.5
+        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
 
-        test_predictions[test_predictions >= 0.5] = 1
-        test_predictions[test_predictions < 0.5] = 0
-
-        test_precision = precision_score(y_test, test_predictions)
-        test_accuracy = accuracy_score(y_test, test_predictions)
+        # Calculate precision and accuracy using binary predictions
+        test_precision = precision_score(y_test, binary_predictions)
+        test_accuracy = accuracy_score(y_test, binary_predictions)
 
         print("Test Set Metrics:")
         print(f"Precision: {round(test_precision * 100)}%")
         print(f"Accuracy: {round(test_accuracy * 100)}%")
 
-        next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0
+        # Define thresholds and corresponding scores
+        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
+        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+        # Get the last prediction value (class 1 probability) for scoring
+        last_prediction_prob = class_1_probabilities[-1]
+
+        # Initialize score to 0 (or any default value)
+        score = 0
+        #print(last_prediction_prob)
+        # Determine the score based on the last prediction probability
+        for threshold, value in zip(thresholds, scores):
+            if last_prediction_prob >= threshold:
+                score = value
+                break  # Exit the loop once the score is determined
+
+        # Return the evaluation results
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
-                'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
+                'score': score}
+
+
     def feature_selection(self, X_train, y_train, k=100):
         print('feature selection:')
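
Note on the chunked_gather() change in cron_ai_score.py: a single asyncio.gather over every ticker keeps all downloads in flight at once, which is what the chunking fixes. A self-contained sketch of the same pattern, with a hypothetical fetch() standing in for download_data():

import asyncio

async def fetch(symbol):
    # Hypothetical stand-in for download_data(); the real coroutine does network I/O.
    await asyncio.sleep(0.01)
    return symbol

async def chunked_gather(items, worker, chunk_size=10):
    results = []
    for i in range(0, len(items), chunk_size):
        chunk = items[i:i + chunk_size]
        # At most chunk_size coroutines run concurrently, bounding peak memory.
        results.extend(await asyncio.gather(*(worker(x) for x in chunk)))
    return results

print(asyncio.run(chunked_gather(['AAPL', 'NVDA', 'GME'], fetch, chunk_size=2)))
# -> ['AAPL', 'NVDA', 'GME']

Results come back in input order because asyncio.gather preserves the order of its arguments within each chunk.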
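The head change in score_model.py (Dense(1, sigmoid) with binary_crossentropy to Dense(2, softmax) with sparse_categorical_crossentropy) is an equivalent two-class parameterization: the sparse loss accepts the existing integer 0/1 targets directly, and column 1 of the output is P(class 1). A minimal sketch of the new head, assuming TensorFlow/Keras is installed and using a toy feature width instead of the patch's 2139:

import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

inputs = Input(shape=(4,))                    # toy width; the patch uses shape=(2139,)
x = Dense(8, activation='relu')(inputs)
outputs = Dense(2, activation='softmax')(x)   # two neurons -> per-class probabilities
model = Model(inputs, outputs)
# sparse_categorical_crossentropy consumes integer class labels, so y stays 0/1
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

X = np.random.rand(16, 4).astype('float32')
y = np.random.randint(0, 2, size=16)          # integer class labels, no one-hot needed
model.fit(X, y, epochs=1, verbose=0)
probs = model.predict(X, verbose=0)[:, 1]     # P(class 1), as read in evaluate_model()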
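The new scoring block in evaluate_model() buckets the last class-1 probability into a 1-10 score, defaulting to 0 below 0.2. The same table, factored into a standalone helper so it can be unit-tested in isolation (the name probability_to_score is illustrative, not part of the patch):

def probability_to_score(prob):
    # Threshold table copied from evaluate_model(); the first match wins.
    thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
    scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    for threshold, value in zip(thresholds, scores):
        if prob >= threshold:
            return value
    return 0  # probabilities below 0.2 keep the default score

assert probability_to_score(0.83) == 10
assert probability_to_score(0.55) == 6   # >= 0.5 but < 0.6
assert probability_to_score(0.10) == 0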