From 5a220c85dd6990baac7be7f89cbb57c0c0769591 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Mon, 30 Sep 2024 14:23:09 +0200
Subject: [PATCH] bugfixing ai model

---
 app/cron_ai_score.py                          | 64 ++++++++---------
 .../__pycache__/score_model.cpython-310.pyc   | Bin 4941 -> 5176 bytes
 app/ml_models/score_model.py                  | 65 ++++++++++++------
 3 files changed, 78 insertions(+), 51 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index f039203..55b1986 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -22,7 +22,7 @@ import gc
 gc.enable()
 
 async def save_json(symbol, data):
-    with open(f"json/ai-score/{symbol}.json", 'wb') as file:
+    with open(f"json/ai-score/companies/{symbol}.json", 'wb') as file:
         file.write(orjson.dumps(data))
 
 
@@ -317,23 +317,35 @@ async def process_symbol(ticker, con, start_date, end_date):
         split_size = int(len(df) * (1-test_size))
         test_data = df.iloc[split_size:]
         best_features = [col for col in df.columns if col not in ['date','price','Target']]
-        data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target'])
-
-        print(data)
-        '''
-        output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target}
-                       for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])]
-        '''
-        #print(output_list)
+        data = predictor.evaluate_model(test_data[best_features], test_data['Target'])
 
         if len(data) != 0:
             if data['precision'] >= 50 and data['accuracy'] >= 50:
-                await save_json(ticker, data)
+                res = {'score': data['score']}
+                await save_json(ticker, res)
 
     except Exception as e:
         print(e)
 
+async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
+    # Helper function to divide the tickers into chunks
+    def chunks(lst, size):
+        for i in range(0, len(lst), size):
+            yield lst[i:i+size]
+
+    results = []
+
+    for chunk in chunks(tickers, chunk_size):
+        # Create tasks for each chunk
+        tasks = [download_data(ticker, con, start_date, end_date) for ticker in chunk]
+        # Await the results for the current chunk
+        chunk_results = await asyncio.gather(*tasks)
+        # Accumulate the results
+        results.extend(chunk_results)
+
+    return results
+
 #Train mode
 async def train_process(tickers, con):
     tickers = list(set(tickers))
@@ -345,8 +357,8 @@
     df_train = pd.DataFrame()
     df_test = pd.DataFrame()
 
-    tasks = [download_data(ticker, con, start_date, end_date) for ticker in tickers]
-    dfs = await asyncio.gather(*tasks)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
+
     for df in dfs:
         try:
             split_size = int(len(df) * (1-test_size))
@@ -373,17 +385,6 @@
     predictor.train_model(df_train[selected_features], df_train['Target'])
     predictor.evaluate_model(df_test[best_features], df_test['Target'])
 
-async def test_process(con):
-    test_size = 0.2
-    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-    end_date = datetime.today().strftime("%Y-%m-%d")
-    predictor = ScorePredictor()
-    df = await download_data('GME', con, start_date, end_date)
-    split_size = int(len(df) * (1-test_size))
-    test_data = df.iloc[split_size:]
-    selected_features = [col for col in test_data if col not in ['price','date','Target']]
-    predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-
 
 async def run():
@@ -393,21 +394,22 @@
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'")
-    stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()]
+    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
+    stock_symbols = [row[0] for row in cursor.fetchall()] #['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR']
     stock_symbols = list(set(stock_symbols))
     print('Number of Stocks')
     print(len(stock_symbols))
-    #await train_process(stock_symbols, con)
+    await train_process(stock_symbols, con)
+
+
+
     #Prediction Steps for all stock symbols
-
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
-    stock_symbols = [row[0] for row in cursor.fetchall()]
+    #cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
+    #stock_symbols = [row[0] for row in cursor.fetchall()]
+    total_symbols = stock_symbols
 
-    total_symbols = ['GME'] #stock_symbols
-
     print(f"Total tickers: {len(total_symbols)}")
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")

diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 052553596b726e63a9b16bf2fd8f03c99ca10e8a..bc1b977eed4ca5c2acc2355e0c5003e3837df8a1 100644
GIT binary patch
delta 1248
[base85-encoded binary delta omitted]

diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index ced08bb..6161b9c 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -41,12 +41,12 @@ class ScorePredictor:
         inputs = Input(shape=(2139,))
 
         # First dense layer
-        x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
+        x = Dense(2048, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
         x = Dropout(0.3)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
-        for units in [512,256, 256]:
+        for units in [1024,512, 256, 256]:
             x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
             x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
@@ -64,17 +64,17 @@ class ScorePredictor:
         # Global average pooling
         x = GlobalAveragePooling1D()(x)
 
-        # Output layer
-        outputs = Dense(1, activation='sigmoid')(x)
+        # Output layer (for class probabilities)
+        outputs = Dense(2, activation='softmax')(x)  # Two neurons for class probabilities with softmax
 
         # Create the model
         model = Model(inputs=inputs, outputs=outputs)
 
         # Optimizer with a lower learning rate
-        optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
+        optimizer = Adam(learning_rate=0.001, clipnorm=1.0)
 
         # Compile the model
-        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
+        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
 
         return model
 
@@ -92,38 +92,63 @@ class ScorePredictor:
         X_train = self.preprocess_data(X_train)
         #X_train = self.reshape_for_lstm(X_train)
 
-        checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
+        checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras',
                                      save_best_only=True, save_freq = 1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
+        early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=80, min_lr=0.00001)
 
         self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save('ml_models/weights/fundamental_weights/weights.keras')
+        self.model.save('ml_models/weights/ai-score/weights.keras')
 
     def evaluate_model(self, X_test, y_test):
+        # Preprocess the test data
         X_test = self.preprocess_data(X_test)
-        X_test = self.reshape_for_lstm(X_test)
+        #X_test = self.reshape_for_lstm(X_test)
 
-        self.model = load_model('ml_models/weights/fundamental_weights/weights.keras')
+        # Load the trained model
+        self.model = load_model('ml_models/weights/ai-score/weights.keras')
 
-        test_predictions = self.model.predict(X_test).flatten()
+        # Get the model's predictions
+        test_predictions = self.model.predict(X_test)
+        #print(test_predictions)
+
+        # Extract the probabilities for class 1 (index 1 in the softmax output)
+        class_1_probabilities = test_predictions[:, 1]
 
+        # Convert probabilities to binary predictions using a threshold of 0.5
+        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
 
-        test_predictions[test_predictions >= 0.5] = 1
-        test_predictions[test_predictions < 0.5] = 0
-
-        test_precision = precision_score(y_test, test_predictions)
-        test_accuracy = accuracy_score(y_test, test_predictions)
+        # Calculate precision and accuracy using binary predictions
+        test_precision = precision_score(y_test, binary_predictions)
+        test_accuracy = accuracy_score(y_test, binary_predictions)
 
         print("Test Set Metrics:")
         print(f"Precision: {round(test_precision * 100)}%")
         print(f"Accuracy: {round(test_accuracy * 100)}%")
 
-        next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0
+        # Define thresholds and corresponding scores
+        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
+        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+        # Get the last prediction value (class 1 probability) for scoring
+        last_prediction_prob = class_1_probabilities[-1]
+
+        # Initialize score to 0 (or any default value)
+        score = 0
+        #print(last_prediction_prob)
+        # Determine the score based on the last prediction probability
+        for threshold, value in zip(thresholds, scores):
+            if last_prediction_prob >= threshold:
+                score = value
+                break  # Exit the loop once the score is determined
+
+        # Return the evaluation results
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
-                'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
+                'score': score}
+
+
     def feature_selection(self, X_train, y_train, k=100):
         print('feature selection:')
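
Note on the chunked_gather() change in cron_ai_score.py: a single asyncio.gather over every ticker keeps all downloads in flight at once, which is what the chunking fixes. A self-contained sketch of the same pattern, with a hypothetical fetch() standing in for download_data():

import asyncio

async def fetch(symbol):
    # Hypothetical stand-in for download_data(); the real coroutine does network I/O.
    await asyncio.sleep(0.01)
    return symbol

async def chunked_gather(items, worker, chunk_size=10):
    results = []
    for i in range(0, len(items), chunk_size):
        chunk = items[i:i + chunk_size]
        # At most chunk_size coroutines run concurrently, bounding peak memory.
        results.extend(await asyncio.gather(*(worker(x) for x in chunk)))
    return results

print(asyncio.run(chunked_gather(['AAPL', 'NVDA', 'GME'], fetch, chunk_size=2)))
# -> ['AAPL', 'NVDA', 'GME']

Results come back in input order because asyncio.gather preserves the order of its arguments within each chunk.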
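The head change in score_model.py (Dense(1, sigmoid) with binary_crossentropy to Dense(2, softmax) with sparse_categorical_crossentropy) is an equivalent two-class parameterization: the sparse loss accepts the existing integer 0/1 targets directly, and column 1 of the output is P(class 1). A minimal sketch of the new head, assuming TensorFlow/Keras is installed and using a toy feature width instead of the patch's 2139:

import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

inputs = Input(shape=(4,))                    # toy width; the patch uses shape=(2139,)
x = Dense(8, activation='relu')(inputs)
outputs = Dense(2, activation='softmax')(x)   # two neurons -> per-class probabilities
model = Model(inputs, outputs)
# sparse_categorical_crossentropy consumes integer class labels, so y stays 0/1
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

X = np.random.rand(16, 4).astype('float32')
y = np.random.randint(0, 2, size=16)          # integer class labels, no one-hot needed
model.fit(X, y, epochs=1, verbose=0)
probs = model.predict(X, verbose=0)[:, 1]     # P(class 1), as read in evaluate_model()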
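The new scoring block in evaluate_model() buckets the last class-1 probability into a 1-10 score, defaulting to 0 below 0.2. The same table, factored into a standalone helper so it can be unit-tested in isolation (the name probability_to_score is illustrative, not part of the patch):

def probability_to_score(prob):
    # Threshold table copied from evaluate_model(); the first match wins.
    thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
    scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    for threshold, value in zip(thresholds, scores):
        if prob >= threshold:
            return value
    return 0  # probabilities below 0.2 keep the default score

assert probability_to_score(0.83) == 10
assert probability_to_score(0.55) == 6   # >= 0.5 but < 0.6
assert probability_to_score(0.10) == 0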