update ai model

MuslemRahimi 2024-09-30 11:52:07 +02:00
parent 96c96254fc
commit 3b70c93d28
6 changed files with 218 additions and 87 deletions

View File

@@ -4,7 +4,7 @@ import aiohttp
import aiofiles
import sqlite3
from datetime import datetime
from ml_models.fundamental_predictor import FundamentalPredictor
from ml_models.score_model import ScorePredictor
import yfinance as yf
from collections import defaultdict
import pandas as pd
@@ -22,7 +22,7 @@ import gc
gc.enable()
async def save_json(symbol, data):
with open(f"json/fundamental-predictor-analysis/{symbol}.json", 'wb') as file:
with open(f"json/ai-score/{symbol}.json", 'wb') as file:
file.write(orjson.dumps(data))
@@ -31,11 +31,6 @@ def trend_intensity(close, window=20):
std = close.rolling(window=window).std()
return ((close - ma) / std).abs().rolling(window=window).mean()
def fisher_transform(high, low, window=10):
value = (high + low) / 2
norm_value = (2 * ((value - value.rolling(window=window).min()) /
(value.rolling(window=window).max() - value.rolling(window=window).min())) - 1)
return 0.5 * np.log((1 + norm_value) / (1 - norm_value))
def calculate_fdi(high, low, close, window=30):
n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) -
@@ -185,8 +180,6 @@ async def download_data(ticker, con, start_date, end_date):
df['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std()
df['fdi'] = calculate_fdi(df['high'], df['low'], df['close'])
#df['hurst'] = df['close'].rolling(window=100).apply(hurst_exponent)
df['fisher'] = fisher_transform(df['high'], df['low'])
df['tii'] = trend_intensity(df['close'])
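A minimal sketch of the two indicators this hunk keeps, run on a synthetic price series; the data is made up, and the rolling mean inside trend_intensity is an assumption because the hunk context cuts that line off.
import numpy as np
import pandas as pd

def trend_intensity(close, window=20):
    ma = close.rolling(window=window).mean()   # assumption: mirrors the line just above this hunk
    std = close.rolling(window=window).std()
    return ((close - ma) / std).abs().rolling(window=window).mean()

rng = np.random.default_rng(0)
df = pd.DataFrame({'close': 100 + rng.normal(0, 1, 200).cumsum()})
df['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std()
df['tii'] = trend_intensity(df['close'])
print(df[['volatility_ratio', 'tii']].dropna().tail())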
@@ -196,7 +189,7 @@ async def download_data(ticker, con, start_date, end_date):
'adi', 'cmf', 'emv', 'fi', 'williams', 'stoch','sma_crossover',
'volatility','daily_return','cumulative_return', 'roc','avg_volume_30d',
'rolling_rsi','rolling_stoch_rsi', 'ema_crossover','ichimoku_a','ichimoku_b',
'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi','fisher'
'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi'
]
# Match each combined data entry with the closest available stock price in df
@@ -229,7 +222,6 @@ async def download_data(ticker, con, start_date, end_date):
combined_data = sorted(combined_data, key=lambda x: x['date'])
# Convert combined data into a DataFrame
df_combined = pd.DataFrame(combined_data).dropna()
key_elements = [
'revenue',
'costOfRevenue',
@@ -275,37 +267,30 @@ async def download_data(ticker, con, start_date, end_date):
'propertyPlantEquipmentNet',
'ownersEarnings',
]
# Compute ratios for all combinations of key elements
for num, denom in combinations(key_elements, 2):
# Compute ratio num/denom
column_name = f'{num}_to_{denom}'
try:
ratio = df_combined[num] / df_combined[denom]
# Check for valid ratio
df_combined[column_name] = np.where((ratio != 0) &
(ratio != np.inf) &
(ratio != -np.inf) &
(~np.isnan(ratio)),
ratio, 0)
except Exception as e:
print(f"Error calculating {column_name}: {e}")
df_combined[column_name] = 0
new_columns = {}
# Compute reverse ratio denom/num
# Loop over combinations of column pairs
for num, denom in combinations(key_elements, 2):
# Compute ratio and reverse ratio
ratio = df_combined[num] / df_combined[denom]
reverse_ratio = df_combined[denom] / df_combined[num]
# Define column names for both ratios
column_name = f'{num}_to_{denom}'
reverse_column_name = f'{denom}_to_{num}'
try:
reverse_ratio = df_combined[denom] / df_combined[num]
# Check for valid reverse ratio
df_combined[reverse_column_name] = np.where((reverse_ratio != 0) &
(reverse_ratio != np.inf) &
(reverse_ratio != -np.inf) &
(~np.isnan(reverse_ratio)),
reverse_ratio, 0)
except Exception as e:
print(f"Error calculating {reverse_column_name}: {e}")
df_combined[reverse_column_name] = 0
# Store the new columns in the dictionary, replacing invalid values with 0
new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)
# Add all new columns to the original DataFrame at once
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
# To defragment the DataFrame, make a copy
df_combined = df_combined.copy()
# Create 'Target' column based on price change
df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int)
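Below is a minimal sketch of the pattern this hunk settles on for the ratio features: compute each pairwise ratio and its reverse, sanitize NaN/inf values with np.nan_to_num, collect everything in a dict, and attach all new columns with a single pd.concat so the DataFrame is not fragmented by per-column assignment. The two-column frame and its column names are made-up stand-ins for the financial key_elements.
from itertools import combinations
import numpy as np
import pandas as pd

# Hypothetical stand-in for df_combined with two of the key elements
df = pd.DataFrame({'revenue': [10.0, 12.0, 0.0], 'netIncome': [2.0, 0.0, 1.0]})

new_columns = {}
for num, denom in combinations(df.columns, 2):
    ratio = df[num] / df[denom]
    reverse_ratio = df[denom] / df[num]
    # Replace NaN and +/-inf (e.g. from division by zero) with 0, as in the hunk
    new_columns[f'{num}_to_{denom}'] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
    new_columns[f'{denom}_to_{num}'] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)

# One concat for all new columns, then copy() to defragment
df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1).copy()
print(df)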
@@ -314,7 +299,7 @@ async def download_data(ticker, con, start_date, end_date):
df_combined = df_combined.dropna()
df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0)
df_copy = df_combined.copy()
#print(df_copy[['date','revenue','ownersEarnings','revenuePerShare']])
return df_copy
except Exception as e:
@@ -327,14 +312,14 @@ async def process_symbol(ticker, con, start_date, end_date):
test_size = 0.2
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")
predictor = FundamentalPredictor()
predictor = ScorePredictor()
df = await download_data(ticker, con, start_date, end_date)
split_size = int(len(df) * (1-test_size))
test_data = df.iloc[split_size:]
best_features = [col for col in df.columns if col not in ['date','price','Target']]
data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target'])
print(data)
'''
output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target}
for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])]
@@ -380,19 +365,19 @@ async def train_process(tickers, con):
print('======Train Set Datapoints======')
print(len(df_train))
predictor = FundamentalPredictor()
predictor = ScorePredictor()
#print(selected_features)
selected_features = [col for col in df_train if col not in ['price','date','Target']]
best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=100)
print(best_features)
predictor.train_model(df_train[best_features], df_train['Target'])
#best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=5)
#print(best_features)
predictor.train_model(df_train[selected_features], df_train['Target'])
predictor.evaluate_model(df_test[best_features], df_test['Target'])
async def test_process(con):
test_size = 0.2
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")
predictor = FundamentalPredictor()
predictor = ScorePredictor()
df = await download_data('GME', con, start_date, end_date)
split_size = int(len(df) * (1-test_size))
test_data = df.iloc[split_size:]
@@ -409,21 +394,22 @@ async def run():
cursor = con.cursor()
cursor.execute("PRAGMA journal_mode = wal")
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'")
stock_symbols = [row[0] for row in cursor.fetchall()] #['AAPL','GME','LLY','NVDA'] #
stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()]
stock_symbols = list(set(stock_symbols))
print('Number of Stocks')
print(len(stock_symbols))
await train_process(stock_symbols, con)
#await train_process(stock_symbols, con)
#Prediction Steps for all stock symbols
'''
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
stock_symbols = [row[0] for row in cursor.fetchall()]
total_symbols = ['GME'] #stock_symbols
print(f"Total tickers: {len(total_symbols)}")
start_date = datetime(2000, 1, 1).strftime("%Y-%m-%d")
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")
chunk_size = len(total_symbols)# // 100 # Divide the list into N chunks
@@ -434,7 +420,7 @@ async def run():
tasks.append(process_symbol(ticker, con, start_date, end_date))
await asyncio.gather(*tasks)
'''
con.close()
try:
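For reference, a minimal sketch of the chunk-and-gather pattern the commented-out prediction block in run() is built around; the stub coroutine, the symbol list, and the chunk size are placeholders, not values from the source.
import asyncio

async def process_symbol(ticker):
    # stand-in for the real coroutine, which downloads data and writes json/ai-score/{ticker}.json
    await asyncio.sleep(0)
    return ticker

async def run_chunks(symbols, chunk_size):
    for i in range(0, len(symbols), chunk_size):
        chunk = symbols[i:i + chunk_size]
        results = await asyncio.gather(*(process_symbol(t) for t in chunk))
        print(results)

asyncio.run(run_chunks(['AAPL', 'GME', 'NVDA', 'PLTR'], chunk_size=2))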

View File

@@ -3950,7 +3950,7 @@ async def get_fomc_impact(api_key: str = Security(get_api_key)):
compressed_data = gzip.compress(data)
redis_client.set(cache_key, compressed_data)
redis_client.expire(cache_key,3600*3600)
redis_client.expire(cache_key,5*60)
return StreamingResponse(
io.BytesIO(compressed_data),
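For context, a minimal, self-contained sketch of the caching pattern in this hunk with the new five-minute TTL; the Redis connection details and the cache key name are assumptions, not taken from the source.
import gzip
import orjson
import redis

redis_client = redis.Redis(host='localhost', port=6379)   # assumption: default local instance
cache_key = 'fomc-impact-example'                          # hypothetical key name
data = orjson.dumps({'example': True})

compressed_data = gzip.compress(data)
redis_client.set(cache_key, compressed_data)
redis_client.expire(cache_key, 5 * 60)                     # expire after five minutes, as in the new code
Note that redis-py's set() also accepts ex=5 * 60, which writes the value and the TTL in a single call.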

Binary file not shown.

View File

@@ -7,8 +7,8 @@ from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Conv1D, Dropout, BatchNormalization, MaxPooling1D, Bidirectional
from keras.models import Sequential, Model
from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model
@@ -31,39 +31,47 @@ import time
class FundamentalPredictor:
def __init__(self):
self.model = self.build_model()
self.scaler = MinMaxScaler()
self.model = self.build_model()
def build_model(self):
clear_session()
model = Sequential()
model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# Input layer
inputs = Input(shape=(2139,))
model.add(Dense(2000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# First dense layer
x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
model.add(Dense(3000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# Additional dense layers
for units in [512,256, 256]:
x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
model.add(Dense(2000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# Reshape for attention mechanism
x = Reshape((256, 1))(x)
model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# Attention mechanism
attention = Dense(256, activation='relu')(x)
attention = Dense(1, activation='softmax')(attention)
# Output layer for binary classification
model.add(Dense(1, activation='sigmoid'))
# Apply attention
x = Multiply()([x, attention])
# Optimizer with a lower learning rate and scheduler
optimizer = Adam(learning_rate=0.1)
# Global average pooling
x = GlobalAveragePooling1D()(x)
# Output layer
outputs = Dense(1, activation='sigmoid')(x)
# Create the model
model = Model(inputs=inputs, outputs=outputs)
# Optimizer with a lower learning rate
optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
@@ -87,10 +95,10 @@ class FundamentalPredictor:
checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
save_best_only=True, save_freq = 1,
monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
self.model.fit(X_train, y_train, epochs=100_000, batch_size=64,
self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
self.model.save('ml_models/weights/fundamental_weights/weights.keras')
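A standalone sketch of the attention-style weighting block that build_model now uses, reduced to a single dense stem so the intermediate shapes are easy to follow; the 2139-wide input and the 256-unit embedding mirror the diff, the rest is simplified.
from keras.models import Model
from keras.layers import Input, Dense, Reshape, Multiply, GlobalAveragePooling1D

inputs = Input(shape=(2139,))
x = Dense(256, activation='relu')(inputs)            # compressed embedding, shape (None, 256)
x = Reshape((256, 1))(x)                             # treat each unit as a step of length 1
scores = Dense(256, activation='relu')(x)            # per-step hidden scores, shape (None, 256, 256)
weights = Dense(1, activation='softmax')(scores)     # per-step weight, shape (None, 256, 1); softmax acts on the last axis
x = Multiply()([x, weights])                         # element-wise weighting of the reshaped embedding
x = GlobalAveragePooling1D()(x)                      # average over the 256 steps, shape (None, 1)
outputs = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=outputs)
model.summary()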

View File

@@ -0,0 +1,137 @@
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential, Model
from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model
from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow.keras.backend import clear_session
from keras import regularizers
from keras.layers import Layer
from tqdm import tqdm
from collections import defaultdict
import asyncio
import aiohttp
import aiofiles
import pickle
import time
# Based on the paper: https://arxiv.org/pdf/1603.00751
class ScorePredictor:
def __init__(self):
self.scaler = MinMaxScaler()
self.model = self.build_model()
def build_model(self):
clear_session()
# Input layer
inputs = Input(shape=(2139,))
# First dense layer
x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
# Additional dense layers
for units in [512,256, 256]:
x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
# Reshape for attention mechanism
x = Reshape((256, 1))(x)
# Attention mechanism
attention = Dense(256, activation='relu')(x)
attention = Dense(1, activation='softmax')(attention)
# Apply attention
x = Multiply()([x, attention])
# Global average pooling
x = GlobalAveragePooling1D()(x)
# Output layer
outputs = Dense(1, activation='sigmoid')(x)
# Create the model
model = Model(inputs=inputs, outputs=outputs)
# Optimizer with a lower learning rate
optimizer = Adam(learning_rate=0.1, clipnorm = 1.0)
# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
return model
def preprocess_data(self, X):
# X = X.applymap(lambda x: 9999 if x == 0 else x) # Replace 0 with 9999 as suggested in the paper
X = np.where(np.isinf(X), np.nan, X)
X = np.nan_to_num(X)
X = self.scaler.fit_transform(X)
return X
def reshape_for_lstm(self, X):
return X.reshape((X.shape[0], X.shape[1], 1))
def train_model(self, X_train, y_train):
X_train = self.preprocess_data(X_train)
#X_train = self.reshape_for_lstm(X_train)
checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
save_best_only=True, save_freq = 1,
monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
self.model.save('ml_models/weights/fundamental_weights/weights.keras')
def evaluate_model(self, X_test, y_test):
X_test = self.preprocess_data(X_test)
X_test = self.reshape_for_lstm(X_test)
self.model = load_model('ml_models/weights/fundamental_weights/weights.keras')
test_predictions = self.model.predict(X_test).flatten()
test_predictions[test_predictions >= 0.5] = 1
test_predictions[test_predictions < 0.5] = 0
test_precision = precision_score(y_test, test_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Set Metrics:")
print(f"Precision: {round(test_precision * 100)}%")
print(f"Accuracy: {round(test_accuracy * 100)}%")
next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0
return {'accuracy': round(test_accuracy * 100),
'precision': round(test_precision * 100),
'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
def feature_selection(self, X_train, y_train, k=100):
print('feature selection:')
print(X_train.shape, y_train.shape)
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_train, y_train)
selector.transform(X_train)
selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
return selected_features
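A minimal, self-contained sketch of the SelectKBest flow used in feature_selection above; the synthetic frame, its column names, and k=2 are made up for illustration.
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f'f{i}' for i in range(5)])
y = (X['f0'] + 0.1 * rng.normal(size=200) > 0).astype(int)   # target correlated with f0

selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X, y)
selected = [col for col, keep in zip(X.columns, selector.get_support()) if keep]
print(selected)   # f0 should be among the survivors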