update ai model

parent 96c96254fc
commit 3b70c93d28
@@ -4,7 +4,7 @@ import aiohttp
 import aiofiles
 import sqlite3
 from datetime import datetime
-from ml_models.fundamental_predictor import FundamentalPredictor
+from ml_models.score_model import ScorePredictor
 import yfinance as yf
 from collections import defaultdict
 import pandas as pd
@@ -22,7 +22,7 @@ import gc
 gc.enable()
 
 async def save_json(symbol, data):
-    with open(f"json/fundamental-predictor-analysis/{symbol}.json", 'wb') as file:
+    with open(f"json/ai-score/{symbol}.json", 'wb') as file:
         file.write(orjson.dumps(data))
 
 
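A quick note on save_json: orjson.dumps returns bytes, which is why the file is opened in 'wb' mode. A minimal round-trip sketch (the load_json helper below is hypothetical, not part of the commit):

    import orjson

    def load_json(symbol):
        # Read back what save_json wrote; orjson.loads accepts the raw bytes directly.
        with open(f"json/ai-score/{symbol}.json", "rb") as file:
            return orjson.loads(file.read())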
@@ -31,11 +31,6 @@ def trend_intensity(close, window=20):
     std = close.rolling(window=window).std()
     return ((close - ma) / std).abs().rolling(window=window).mean()
 
-def fisher_transform(high, low, window=10):
-    value = (high + low) / 2
-    norm_value = (2 * ((value - value.rolling(window=window).min()) /
-                  (value.rolling(window=window).max() - value.rolling(window=window).min())) - 1)
-    return 0.5 * np.log((1 + norm_value) / (1 - norm_value))
 
 def calculate_fdi(high, low, close, window=30):
     n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) -
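The surviving indicator helpers (trend_intensity, calculate_fdi) operate on pandas Series of OHLC data; fisher_transform is dropped here along with its df['fisher'] feature further down. A rough usage sketch on toy data, assuming the helpers are imported from this module (in the script the columns come from yfinance history):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "high":  rng.uniform(101, 105, 200),
        "low":   rng.uniform(95, 99, 200),
        "close": rng.uniform(98, 103, 200),
    })

    df["tii"] = trend_intensity(df["close"])                       # rolling trend-intensity score
    df["fdi"] = calculate_fdi(df["high"], df["low"], df["close"])  # fractal dimension index
    print(df[["tii", "fdi"]].dropna().head())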
@@ -185,8 +180,6 @@ async def download_data(ticker, con, start_date, end_date):
         df['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std()
 
         df['fdi'] = calculate_fdi(df['high'], df['low'], df['close'])
-        #df['hurst'] = df['close'].rolling(window=100).apply(hurst_exponent)
-        df['fisher'] = fisher_transform(df['high'], df['low'])
         df['tii'] = trend_intensity(df['close'])
 
 
@@ -196,7 +189,7 @@ async def download_data(ticker, con, start_date, end_date):
             'adi', 'cmf', 'emv', 'fi', 'williams', 'stoch','sma_crossover',
             'volatility','daily_return','cumulative_return', 'roc','avg_volume_30d',
             'rolling_rsi','rolling_stoch_rsi', 'ema_crossover','ichimoku_a','ichimoku_b',
-            'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi','fisher'
+            'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi'
         ]
 
         # Match each combined data entry with the closest available stock price in df
@@ -229,7 +222,6 @@ async def download_data(ticker, con, start_date, end_date):
         combined_data = sorted(combined_data, key=lambda x: x['date'])
         # Convert combined data into a DataFrame
         df_combined = pd.DataFrame(combined_data).dropna()
-
         key_elements = [
             'revenue',
             'costOfRevenue',
@@ -275,37 +267,30 @@ async def download_data(ticker, con, start_date, end_date):
             'propertyPlantEquipmentNet',
             'ownersEarnings',
         ]
 
         # Compute ratios for all combinations of key elements
+        new_columns = {}
 
+        # Loop over combinations of column pairs
         for num, denom in combinations(key_elements, 2):
-            # Compute ratio num/denom
+            # Compute ratio and reverse ratio
+            ratio = df_combined[num] / df_combined[denom]
+            reverse_ratio = df_combined[denom] / df_combined[num]
 
+            # Define column names for both ratios
             column_name = f'{num}_to_{denom}'
-            try:
-                ratio = df_combined[num] / df_combined[denom]
-                # Check for valid ratio
-                df_combined[column_name] = np.where((ratio != 0) &
-                                                    (ratio != np.inf) &
-                                                    (ratio != -np.inf) &
-                                                    (~np.isnan(ratio)),
-                                                    ratio, 0)
-            except Exception as e:
-                print(f"Error calculating {column_name}: {e}")
-                df_combined[column_name] = 0
-
-            # Compute reverse ratio denom/num
             reverse_column_name = f'{denom}_to_{num}'
-            try:
-                reverse_ratio = df_combined[denom] / df_combined[num]
-                # Check for valid reverse ratio
-                df_combined[reverse_column_name] = np.where((reverse_ratio != 0) &
-                                                            (reverse_ratio != np.inf) &
-                                                            (reverse_ratio != -np.inf) &
-                                                            (~np.isnan(reverse_ratio)),
-                                                            reverse_ratio, 0)
-            except Exception as e:
-                print(f"Error calculating {reverse_column_name}: {e}")
-                df_combined[reverse_column_name] = 0
+
+            # Store the new columns in the dictionary, replacing invalid values with 0
+            new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
+            new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)
+
+        # Add all new columns to the original DataFrame at once
+        df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
+
+        # To defragment the DataFrame, make a copy
+        df_combined = df_combined.copy()
 
         # Create 'Target' column based on price change
         df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int)
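What changed here: the old loop wrote every np.where-guarded ratio column straight into df_combined, which is slow and fragments the frame; the new version collects the sanitized ratios in a plain dict and concatenates them once, with np.nan_to_num zeroing out NaN and +/-inf from divisions by zero. A minimal sketch of the same pattern on toy data (column names here are illustrative):

    from itertools import combinations
    import numpy as np
    import pandas as pd

    # Toy stand-in for df_combined; the real frame holds one row of fundamentals per report date.
    df = pd.DataFrame({"revenue": [10.0, 12.0, 0.0], "costOfRevenue": [4.0, 0.0, 3.0]})

    new_columns = {}
    for num, denom in combinations(df.columns, 2):
        ratio = df[num] / df[denom]              # may yield inf (division by zero) or NaN
        reverse_ratio = df[denom] / df[num]
        new_columns[f"{num}_to_{denom}"] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
        new_columns[f"{denom}_to_{num}"] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)

    # A single concat instead of hundreds of per-column inserts keeps the frame defragmented.
    df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1).copy()
    print(df)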
@@ -314,7 +299,7 @@ async def download_data(ticker, con, start_date, end_date):
         df_combined = df_combined.dropna()
         df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0)
         df_copy = df_combined.copy()
-        #print(df_copy[['date','revenue','ownersEarnings','revenuePerShare']])
+
         return df_copy
 
     except Exception as e:
@@ -327,14 +312,14 @@ async def process_symbol(ticker, con, start_date, end_date):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
-    predictor = FundamentalPredictor()
+    predictor = ScorePredictor()
     df = await download_data(ticker, con, start_date, end_date)
     split_size = int(len(df) * (1-test_size))
     test_data = df.iloc[split_size:]
     best_features = [col for col in df.columns if col not in ['date','price','Target']]
     data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target'])
 
-
+    print(data)
     '''
     output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target}
                    for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])]
@@ -380,19 +365,19 @@ async def train_process(tickers, con):
     print('======Train Set Datapoints======')
     print(len(df_train))
 
-    predictor = FundamentalPredictor()
+    predictor = ScorePredictor()
     #print(selected_features)
     selected_features = [col for col in df_train if col not in ['price','date','Target']]
-    best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=100)
-    print(best_features)
-    predictor.train_model(df_train[best_features], df_train['Target'])
+    #best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=5)
+    #print(best_features)
+    predictor.train_model(df_train[selected_features], df_train['Target'])
     predictor.evaluate_model(df_test[best_features], df_test['Target'])
 
 async def test_process(con):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
-    predictor = FundamentalPredictor()
+    predictor = ScorePredictor()
     df = await download_data('GME', con, start_date, end_date)
     split_size = int(len(df) * (1-test_size))
     test_data = df.iloc[split_size:]
@@ -405,25 +390,26 @@ async def run():
     #Train first model
 
     con = sqlite3.connect('stocks.db')
 
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
     cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'")
-    stock_symbols = [row[0] for row in cursor.fetchall()] #['AAPL','GME','LLY','NVDA'] #
+    stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()]
+    stock_symbols = list(set(stock_symbols))
     print('Number of Stocks')
     print(len(stock_symbols))
-    await train_process(stock_symbols, con)
+    #await train_process(stock_symbols, con)
 
 
     #Prediction Steps for all stock symbols
-    '''
+
     cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
     stock_symbols = [row[0] for row in cursor.fetchall()]
 
     total_symbols = ['GME'] #stock_symbols
 
     print(f"Total tickers: {len(total_symbols)}")
-    start_date = datetime(2000, 1, 1).strftime("%Y-%m-%d")
+    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
 
     chunk_size = len(total_symbols)# // 100 # Divide the list into N chunks
@@ -434,7 +420,7 @@ async def run():
             tasks.append(process_symbol(ticker, con, start_date, end_date))
 
         await asyncio.gather(*tasks)
-    '''
+
     con.close()
 
 try:
@@ -3950,7 +3950,7 @@ async def get_fomc_impact(api_key: str = Security(get_api_key)):
         compressed_data = gzip.compress(data)
 
         redis_client.set(cache_key, compressed_data)
-        redis_client.expire(cache_key,3600*3600)
+        redis_client.expire(cache_key,5*60)
 
         return StreamingResponse(
             io.BytesIO(compressed_data),
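The cache TTL for this endpoint drops from 3600*3600 seconds (roughly 150 days) to 5*60 seconds, i.e. five minutes. If the set-and-expire pair should ever be issued atomically, redis-py can do both in one call; a tiny sketch using the same redis_client and cache_key:

    # Equivalent to set() followed by expire(), issued as a single SETEX command (5-minute TTL).
    redis_client.setex(cache_key, 5 * 60, compressed_data)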
BIN  app/ml_models/__pycache__/score_model.cpython-310.pyc  (new file, binary file not shown)
@@ -7,8 +7,8 @@ from xgboost import XGBClassifier
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import MinMaxScaler, StandardScaler
-from keras.models import Sequential
-from keras.layers import LSTM, Dense, Conv1D, Dropout, BatchNormalization, MaxPooling1D, Bidirectional
+from keras.models import Sequential, Model
+from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
 from keras.optimizers import Adam
 from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
 from keras.models import load_model
@@ -31,39 +31,47 @@ import time
 
 class FundamentalPredictor:
     def __init__(self):
-        self.model = self.build_model()
         self.scaler = MinMaxScaler()
+        self.model = self.build_model()
 
     def build_model(self):
         clear_session()
-        model = Sequential()
 
-        model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
-        model.add(Dropout(0.2))
-        model.add(BatchNormalization())
+        # Input layer
+        inputs = Input(shape=(2139,))
 
-        model.add(Dense(2000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
-        model.add(Dropout(0.2))
-        model.add(BatchNormalization())
+        # First dense layer
+        x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
+        x = Dropout(0.3)(x)
+        x = BatchNormalization()(x)
 
-        model.add(Dense(3000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
-        model.add(Dropout(0.2))
-        model.add(BatchNormalization())
+        # Additional dense layers
+        for units in [512,256, 256]:
+            x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
+            x = Dropout(0.2)(x)
+            x = BatchNormalization()(x)
 
-        model.add(Dense(2000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
-        model.add(Dropout(0.2))
-        model.add(BatchNormalization())
+        # Reshape for attention mechanism
+        x = Reshape((256, 1))(x)
 
-        model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
-        model.add(Dropout(0.2))
-        model.add(BatchNormalization())
-        model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
+        # Attention mechanism
+        attention = Dense(256, activation='relu')(x)
+        attention = Dense(1, activation='softmax')(attention)
 
-        # Output layer for binary classification
-        model.add(Dense(1, activation='sigmoid'))
+        # Apply attention
+        x = Multiply()([x, attention])
 
-        # Optimizer with a lower learning rate and scheduler
-        optimizer = Adam(learning_rate=0.1)
+        # Global average pooling
+        x = GlobalAveragePooling1D()(x)
+
+        # Output layer
+        outputs = Dense(1, activation='sigmoid')(x)
+
+        # Create the model
+        model = Model(inputs=inputs, outputs=outputs)
+
+        # Optimizer with a lower learning rate
+        optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
 
         # Compile the model
         model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
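The Sequential stack of wide dense layers is replaced by a functional-API model: the 256-unit embedding is reshaped into a (256, 1) sequence, scored by a small attention head, re-weighted with Multiply, pooled with GlobalAveragePooling1D, and squashed to a single sigmoid output. A shape-only sketch of that wiring (the dense stack is collapsed to one layer here; 2139 is the feature count hard-coded in the diff), with one caveat on the softmax axis:

    from keras.layers import Dense, GlobalAveragePooling1D, Input, Multiply, Reshape
    from keras.models import Model

    inputs = Input(shape=(2139,))
    x = Dense(256, activation='relu')(inputs)               # stand-in for the 1024->512->256->256 stack
    x = Reshape((256, 1))(x)                                # treat the 256 features as "timesteps"

    attention = Dense(256, activation='relu')(x)            # (batch, 256, 256)
    attention = Dense(1, activation='softmax')(attention)   # (batch, 256, 1)
    # Caveat: softmax over a last axis of size 1 always outputs 1.0, so this Multiply is
    # effectively a pass-through; a softmax across the 256 axis would produce real weights.

    x = Multiply()([x, attention])                          # (batch, 256, 1)
    x = GlobalAveragePooling1D()(x)                         # (batch, 1)
    outputs = Dense(1, activation='sigmoid')(x)

    Model(inputs, outputs).summary()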
@@ -87,10 +95,10 @@ class FundamentalPredictor:
         checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
                                      save_best_only=True, save_freq = 1,
                                      monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00001)
+        early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
 
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=64,
+        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
                        validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
         self.model.save('ml_models/weights/fundamental_weights/weights.keras')
 
app/ml_models/score_model.py  (new file, 137 lines)
@@ -0,0 +1,137 @@
+import yfinance as yf
+import pandas as pd
+from datetime import datetime, timedelta
+from sklearn.ensemble import RandomForestClassifier
+import numpy as np
+from xgboost import XGBClassifier
+from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from keras.models import Sequential, Model
+from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
+from keras.optimizers import Adam
+from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
+from keras.models import load_model
+from sklearn.feature_selection import SelectKBest, f_classif
+from tensorflow.keras.backend import clear_session
+from keras import regularizers
+from keras.layers import Layer
+
+
+from tqdm import tqdm
+from collections import defaultdict
+import asyncio
+import aiohttp
+import aiofiles
+import pickle
+import time
+
+# Based on the paper: https://arxiv.org/pdf/1603.00751
+
+
+class ScorePredictor:
+    def __init__(self):
+        self.scaler = MinMaxScaler()
+        self.model = self.build_model()
+
+    def build_model(self):
+        clear_session()
+
+        # Input layer
+        inputs = Input(shape=(2139,))
+
+        # First dense layer
+        x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
+        x = Dropout(0.3)(x)
+        x = BatchNormalization()(x)
+
+        # Additional dense layers
+        for units in [512,256, 256]:
+            x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
+            x = Dropout(0.2)(x)
+            x = BatchNormalization()(x)
+
+        # Reshape for attention mechanism
+        x = Reshape((256, 1))(x)
+
+        # Attention mechanism
+        attention = Dense(256, activation='relu')(x)
+        attention = Dense(1, activation='softmax')(attention)
+
+        # Apply attention
+        x = Multiply()([x, attention])
+
+        # Global average pooling
+        x = GlobalAveragePooling1D()(x)
+
+        # Output layer
+        outputs = Dense(1, activation='sigmoid')(x)
+
+        # Create the model
+        model = Model(inputs=inputs, outputs=outputs)
+
+        # Optimizer with a lower learning rate
+        optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
+
+        # Compile the model
+        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
+
+        return model
+
+    def preprocess_data(self, X):
+        # X = X.applymap(lambda x: 9999 if x == 0 else x) # Replace 0 with 9999 as suggested in the paper
+        X = np.where(np.isinf(X), np.nan, X)
+        X = np.nan_to_num(X)
+        X = self.scaler.fit_transform(X)
+        return X
+
+    def reshape_for_lstm(self, X):
+        return X.reshape((X.shape[0], X.shape[1], 1))
+
+    def train_model(self, X_train, y_train):
+        X_train = self.preprocess_data(X_train)
+        #X_train = self.reshape_for_lstm(X_train)
+
+        checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
+                                     save_best_only=True, save_freq = 1,
+                                     monitor='val_loss', mode='min')
+        early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
+
+        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
+                       validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
+        self.model.save('ml_models/weights/fundamental_weights/weights.keras')
+
+    def evaluate_model(self, X_test, y_test):
+        X_test = self.preprocess_data(X_test)
+        X_test = self.reshape_for_lstm(X_test)
+
+        self.model = load_model('ml_models/weights/fundamental_weights/weights.keras')
+
+        test_predictions = self.model.predict(X_test).flatten()
+
+        test_predictions[test_predictions >= 0.5] = 1
+        test_predictions[test_predictions < 0.5] = 0
+
+        test_precision = precision_score(y_test, test_predictions)
+        test_accuracy = accuracy_score(y_test, test_predictions)
+
+        print("Test Set Metrics:")
+        print(f"Precision: {round(test_precision * 100)}%")
+        print(f"Accuracy: {round(test_accuracy * 100)}%")
+
+        next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0
+        return {'accuracy': round(test_accuracy * 100),
+                'precision': round(test_precision * 100),
+                'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
+
+    def feature_selection(self, X_train, y_train, k=100):
+        print('feature selection:')
+        print(X_train.shape, y_train.shape)
+        selector = SelectKBest(score_func=f_classif, k=k)
+        selector.fit(X_train, y_train)
+
+        selector.transform(X_train)
+        selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
+
+        return selected_features
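For orientation, a rough end-to-end sketch of how the training script drives ScorePredictor (mirroring train_process/test_process; df is assumed to be the frame returned by download_data, the checkpoint directory is assumed to exist, and the hard-coded Input(shape=(2139,)) means the frame must supply exactly 2139 feature columns):

    predictor = ScorePredictor()

    features = [col for col in df.columns if col not in ['price', 'date', 'Target']]
    split = int(len(df) * 0.8)
    df_train, df_test = df.iloc[:split], df.iloc[split:]

    predictor.train_model(df_train[features], df_train['Target'])
    metrics, predictions = predictor.evaluate_model(df_test[features], df_test['Target'])
    print(metrics)  # e.g. {'accuracy': ..., 'precision': ..., 'sentiment': 'Bullish' or 'Bearish'}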