bugfixing ai model

2024-09-30 14:23:09 +02:00 · 2024-09-30 14:23:09 +02:00 · 5a220c85dd
commit 5a220c85dd
parent 3b70c93d28
3 changed files with 78 additions and 51 deletions
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -22,7 +22,7 @@ import gc
 gc.enable()

 async def save_json(symbol, data):
-    with open(f"json/ai-score/{symbol}.json", 'wb') as file:
+    with open(f"json/ai-score/companies/{symbol}.json", 'wb') as file:
        file.write(orjson.dumps(data))


@@ -317,23 +317,35 @@ async def process_symbol(ticker, con, start_date, end_date):
        split_size = int(len(df) * (1-test_size))
        test_data = df.iloc[split_size:]
        best_features = [col for col in df.columns if col not in ['date','price','Target']]
-        data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target'])
-        
-        print(data)
-        '''
-        output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target} 
-                                for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])]
-        '''
-        #print(output_list)
+        data = predictor.evaluate_model(test_data[best_features], test_data['Target'])

        if len(data) != 0:
            if data['precision'] >= 50 and data['accuracy'] >= 50:
-                await save_json(ticker, data)
+                res = {'score': data['score']}
+                await save_json(ticker, res)
    
    except Exception as e:
        print(e)


+async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
+    # Downloads run in batches of chunk_size; this helper yields successive slices of lst, the last possibly shorter
+    def chunks(lst, size):
+        for i in range(0, len(lst), size):
+            yield lst[i:i+size]
+    
+    results = []
+    
+    for chunk in chunks(tickers, chunk_size):
+        # One download_data(...) call per ticker in this chunk
+        tasks = [download_data(ticker, con, start_date, end_date) for ticker in chunk]
+        # Run the chunk concurrently and wait for all of it before starting the next chunk; gather keeps input order
+        chunk_results = await asyncio.gather(*tasks)
+        # Appending chunk by chunk keeps results aligned with the order of tickers
+        results.extend(chunk_results)
+    
+    return results
+
 #Train mode
 async def train_process(tickers, con):
    tickers = list(set(tickers))
@@ -345,8 +357,8 @@ async def train_process(tickers, con):
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

-    tasks = [download_data(ticker, con, start_date, end_date) for ticker in tickers]
-    dfs = await asyncio.gather(*tasks)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
+
    for df in dfs:
        try:
            split_size = int(len(df) * (1-test_size))
@@ -373,17 +385,6 @@ async def train_process(tickers, con):
    predictor.train_model(df_train[selected_features], df_train['Target'])
    predictor.evaluate_model(df_test[best_features], df_test['Target'])

-async def test_process(con):
-    test_size = 0.2
-    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-    end_date = datetime.today().strftime("%Y-%m-%d")
-    predictor = ScorePredictor()
-    df = await download_data('GME', con, start_date, end_date)
-    split_size = int(len(df) * (1-test_size))
-    test_data = df.iloc[split_size:]
-    selected_features = [col for col in test_data if col not in ['price','date','Target']]
-    predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-

 async def run():

@@ -393,20 +394,21 @@ async def run():
    
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'")
-    stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()]
+    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
+    stock_symbols = [row[0] for row in cursor.fetchall()] #['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR']
    stock_symbols = list(set(stock_symbols))
    print('Number of Stocks')
    print(len(stock_symbols))
-    #await train_process(stock_symbols, con)
+    await train_process(stock_symbols, con)
+    
+
+


    #Prediction Steps for all stock symbols
-    
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
-    stock_symbols = [row[0] for row in cursor.fetchall()]
-
-    total_symbols = ['GME'] #stock_symbols
+    #cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
+    #stock_symbols = [row[0] for row in cursor.fetchall()]
+    total_symbols = stock_symbols

    print(f"Total tickers: {len(total_symbols)}")
    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
--- a/app/ml_models/__pycache__/score_model.cpython-310.pyc
+++ b/app/ml_models/__pycache__/score_model.cpython-310.pyc
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -41,12 +41,12 @@ class ScorePredictor:
        inputs = Input(shape=(2139,))
        
        # First dense layer
-        x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
+        x = Dense(2048, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        
        # Additional dense layers
-        for units in [512,256, 256]:
+        for units in [1024,512, 256, 256]:
            x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
            x = Dropout(0.2)(x)
            x = BatchNormalization()(x)
@@ -64,17 +64,17 @@ class ScorePredictor:
        # Global average pooling
        x = GlobalAveragePooling1D()(x)
        
-        # Output layer
-        outputs = Dense(1, activation='sigmoid')(x)
+        # Output layer (for class probabilities)
+        outputs = Dense(2, activation='softmax')(x)  # Two neurons for class probabilities with softmax
        
        # Create the model
        model = Model(inputs=inputs, outputs=outputs)
        
        # Optimizer with a lower learning rate
-        optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
+        optimizer = Adam(learning_rate=0.001, clipnorm=1.0)
        
        # Compile the model
-        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
+        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        
        return model

@@ -92,38 +92,63 @@ class ScorePredictor:
        X_train = self.preprocess_data(X_train)
        #X_train = self.reshape_for_lstm(X_train)
        
-        checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras', 
+        checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras', 
                                      save_best_only=True, save_freq = 1,
                                      monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
+        early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=80, min_lr=0.00001)

        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, 
                       validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save('ml_models/weights/fundamental_weights/weights.keras')
+        self.model.save('ml_models/weights/ai-score/weights.keras')

    def evaluate_model(self, X_test, y_test):
+        # Apply the same preprocess_data step that the training data goes through before fit
        X_test = self.preprocess_data(X_test)
-        X_test = self.reshape_for_lstm(X_test)
+        #X_test = self.reshape_for_lstm(X_test)
        
-        self.model = load_model('ml_models/weights/fundamental_weights/weights.keras')
+        # Reload the model from the path the training step saves to, replacing self.model
+        self.model = load_model('ml_models/weights/ai-score/weights.keras')
        
-        test_predictions = self.model.predict(X_test).flatten()
+        # Kept 2-D (no flatten): the column indexing below expects one row per sample with a column per class
+        test_predictions = self.model.predict(X_test)
+        #print(test_predictions)

-        test_predictions[test_predictions >= 0.5] = 1
-        test_predictions[test_predictions < 0.5] = 0
+        # Column 1 of the softmax output is the probability of class 1
+        class_1_probabilities = test_predictions[:, 1]
+        # Label a sample 1 when its class-1 probability is at least 0.5, otherwise 0
+        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
        
-        test_precision = precision_score(y_test, test_predictions)
-        test_accuracy = accuracy_score(y_test, test_predictions)
+        # Precision and accuracy of the thresholded labels against y_test; scaled by 100 for reporting below
+        test_precision = precision_score(y_test, binary_predictions)
+        test_accuracy = accuracy_score(y_test, binary_predictions)
        
        print("Test Set Metrics:")
        print(f"Precision: {round(test_precision * 100)}%")
        print(f"Accuracy: {round(test_accuracy * 100)}%")
        
-        next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0
+        # Descending probability cut-offs, paired by position with scores 10 down to 1
+        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
+        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+        # Only the final row's class-1 probability is scored (presumably the most recent date; confirm with caller)
+        last_prediction_prob = class_1_probabilities[-1]
+
+        # Score stays 0 when the probability is below the lowest threshold (0.2)
+        score = 0
+        #print(last_prediction_prob)
+        # Thresholds are descending, so the first one reached gives the highest applicable score
+        for threshold, value in zip(thresholds, scores):
+            if last_prediction_prob >= threshold:
+                score = value
+                break  # lower thresholds would also match; stop at the first hit
+
+        # accuracy and precision are rounded whole-number percentages; score is an int from 0 to 10
        return {'accuracy': round(test_accuracy * 100), 
                'precision': round(test_precision * 100), 
-                'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
+                'score': score}
+
+

    def feature_selection(self, X_train, y_train, k=100):
        print('feature selection:')