diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py index 41914f2..f039203 100644 --- a/app/cron_ai_score.py +++ b/app/cron_ai_score.py @@ -4,7 +4,7 @@ import aiohttp import aiofiles import sqlite3 from datetime import datetime -from ml_models.fundamental_predictor import FundamentalPredictor +from ml_models.score_model import ScorePredictor import yfinance as yf from collections import defaultdict import pandas as pd @@ -22,7 +22,7 @@ import gc gc.enable() async def save_json(symbol, data): - with open(f"json/fundamental-predictor-analysis/{symbol}.json", 'wb') as file: + with open(f"json/ai-score/{symbol}.json", 'wb') as file: file.write(orjson.dumps(data)) @@ -31,11 +31,6 @@ def trend_intensity(close, window=20): std = close.rolling(window=window).std() return ((close - ma) / std).abs().rolling(window=window).mean() -def fisher_transform(high, low, window=10): - value = (high + low) / 2 - norm_value = (2 * ((value - value.rolling(window=window).min()) / - (value.rolling(window=window).max() - value.rolling(window=window).min())) - 1) - return 0.5 * np.log((1 + norm_value) / (1 - norm_value)) def calculate_fdi(high, low, close, window=30): n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) - @@ -185,8 +180,6 @@ async def download_data(ticker, con, start_date, end_date): df['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std() df['fdi'] = calculate_fdi(df['high'], df['low'], df['close']) - #df['hurst'] = df['close'].rolling(window=100).apply(hurst_exponent) - df['fisher'] = fisher_transform(df['high'], df['low']) df['tii'] = trend_intensity(df['close']) @@ -196,7 +189,7 @@ async def download_data(ticker, con, start_date, end_date): 'adi', 'cmf', 'emv', 'fi', 'williams', 'stoch','sma_crossover', 'volatility','daily_return','cumulative_return', 'roc','avg_volume_30d', 'rolling_rsi','rolling_stoch_rsi', 'ema_crossover','ichimoku_a','ichimoku_b', - 'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi','fisher' + 'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi' ] # Match each combined data entry with the closest available stock price in df @@ -229,7 +222,6 @@ async def download_data(ticker, con, start_date, end_date): combined_data = sorted(combined_data, key=lambda x: x['date']) # Convert combined data into a DataFrame df_combined = pd.DataFrame(combined_data).dropna() - key_elements = [ 'revenue', 'costOfRevenue', @@ -275,37 +267,30 @@ async def download_data(ticker, con, start_date, end_date): 'propertyPlantEquipmentNet', 'ownersEarnings', ] - # Compute ratios for all combinations of key elements - + + new_columns = {} + + # Loop over combinations of column pairs for num, denom in combinations(key_elements, 2): - # Compute ratio num/denom + # Compute ratio and reverse ratio + ratio = df_combined[num] / df_combined[denom] + reverse_ratio = df_combined[denom] / df_combined[num] + + # Define column names for both ratios column_name = f'{num}_to_{denom}' - try: - ratio = df_combined[num] / df_combined[denom] - # Check for valid ratio - df_combined[column_name] = np.where((ratio != 0) & - (ratio != np.inf) & - (ratio != -np.inf) & - (~np.isnan(ratio)), - ratio, 0) - except Exception as e: - print(f"Error calculating {column_name}: {e}") - df_combined[column_name] = 0 - - # Compute reverse ratio denom/num reverse_column_name = f'{denom}_to_{num}' - try: - reverse_ratio = df_combined[denom] / df_combined[num] - # Check for valid reverse ratio - df_combined[reverse_column_name] = 
np.where((reverse_ratio != 0) & - (reverse_ratio != np.inf) & - (reverse_ratio != -np.inf) & - (~np.isnan(reverse_ratio)), - reverse_ratio, 0) - except Exception as e: - print(f"Error calculating {reverse_column_name}: {e}") - df_combined[reverse_column_name] = 0 + + # Store the new columns in the dictionary, replacing invalid values with 0 + new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0) + new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0) + + # Add all new columns to the original DataFrame at once + df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1) + + # To defragment the DataFrame, make a copy + df_combined = df_combined.copy() + # Create 'Target' column based on price change df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int) @@ -314,7 +299,7 @@ async def download_data(ticker, con, start_date, end_date): df_combined = df_combined.dropna() df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0) df_copy = df_combined.copy() - #print(df_copy[['date','revenue','ownersEarnings','revenuePerShare']]) + return df_copy except Exception as e: @@ -327,14 +312,14 @@ async def process_symbol(ticker, con, start_date, end_date): test_size = 0.2 start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") - predictor = FundamentalPredictor() + predictor = ScorePredictor() df = await download_data(ticker, con, start_date, end_date) split_size = int(len(df) * (1-test_size)) test_data = df.iloc[split_size:] best_features = [col for col in df.columns if col not in ['date','price','Target']] data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target']) - + print(data) ''' output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target} for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])] @@ -380,19 +365,19 @@ async def train_process(tickers, con): print('======Train Set Datapoints======') print(len(df_train)) - predictor = FundamentalPredictor() + predictor = ScorePredictor() #print(selected_features) selected_features = [col for col in df_train if col not in ['price','date','Target']] - best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=100) - print(best_features) - predictor.train_model(df_train[best_features], df_train['Target']) + #best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=5) + #print(best_features) + predictor.train_model(df_train[selected_features], df_train['Target']) predictor.evaluate_model(df_test[best_features], df_test['Target']) async def test_process(con): test_size = 0.2 start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") - predictor = FundamentalPredictor() + predictor = ScorePredictor() df = await download_data('GME', con, start_date, end_date) split_size = int(len(df) * (1-test_size)) test_data = df.iloc[split_size:] @@ -405,25 +390,26 @@ async def run(): #Train first model con = sqlite3.connect('stocks.db') - + cursor = con.cursor() cursor.execute("PRAGMA journal_mode = wal") cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'") - stock_symbols = [row[0] for row in cursor.fetchall()] #['AAPL','GME','LLY','NVDA'] # + stock_symbols = 
['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()] + stock_symbols = list(set(stock_symbols)) print('Number of Stocks') print(len(stock_symbols)) - await train_process(stock_symbols, con) - + #await train_process(stock_symbols, con) + #Prediction Steps for all stock symbols - ''' + cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9") stock_symbols = [row[0] for row in cursor.fetchall()] total_symbols = ['GME'] #stock_symbols print(f"Total tickers: {len(total_symbols)}") - start_date = datetime(2000, 1, 1).strftime("%Y-%m-%d") + start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") chunk_size = len(total_symbols)# // 100 # Divide the list into N chunks @@ -434,7 +420,7 @@ async def run(): tasks.append(process_symbol(ticker, con, start_date, end_date)) await asyncio.gather(*tasks) - ''' + con.close() try: diff --git a/app/main.py b/app/main.py index 3a81efd..952050e 100755 --- a/app/main.py +++ b/app/main.py @@ -3950,7 +3950,7 @@ async def get_fomc_impact(api_key: str = Security(get_api_key)): compressed_data = gzip.compress(data) redis_client.set(cache_key, compressed_data) - redis_client.expire(cache_key,3600*3600) + redis_client.expire(cache_key,5*60) return StreamingResponse( io.BytesIO(compressed_data), diff --git a/app/ml_models/__pycache__/fundamental_predictor.cpython-310.pyc b/app/ml_models/__pycache__/fundamental_predictor.cpython-310.pyc index 64451da..ab98be1 100644 Binary files a/app/ml_models/__pycache__/fundamental_predictor.cpython-310.pyc and b/app/ml_models/__pycache__/fundamental_predictor.cpython-310.pyc differ diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc new file mode 100644 index 0000000..0525535 Binary files /dev/null and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ diff --git a/app/ml_models/fundamental_predictor.py b/app/ml_models/fundamental_predictor.py index 671b394..3bb6db0 100755 --- a/app/ml_models/fundamental_predictor.py +++ b/app/ml_models/fundamental_predictor.py @@ -7,8 +7,8 @@ from xgboost import XGBClassifier from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler, StandardScaler -from keras.models import Sequential -from keras.layers import LSTM, Dense, Conv1D, Dropout, BatchNormalization, MaxPooling1D, Bidirectional +from keras.models import Sequential, Model +from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional from keras.optimizers import Adam from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau from keras.models import load_model @@ -31,39 +31,47 @@ import time class FundamentalPredictor: def __init__(self): - self.model = self.build_model() self.scaler = MinMaxScaler() + self.model = self.build_model() def build_model(self): clear_session() - model = Sequential() - model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01))) - model.add(Dropout(0.2)) - model.add(BatchNormalization()) - - model.add(Dense(2000, activation='relu', 
kernel_regularizer=regularizers.l2(0.01))) - model.add(Dropout(0.2)) - model.add(BatchNormalization()) - - model.add(Dense(3000, activation='relu', kernel_regularizer=regularizers.l2(0.01))) - model.add(Dropout(0.2)) - model.add(BatchNormalization()) - - model.add(Dense(2000, activation='relu', kernel_regularizer=regularizers.l2(0.01))) - model.add(Dropout(0.2)) - model.add(BatchNormalization()) - - model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01))) - model.add(Dropout(0.2)) - model.add(BatchNormalization()) - model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01))) - - # Output layer for binary classification - model.add(Dense(1, activation='sigmoid')) - - # Optimizer with a lower learning rate and scheduler - optimizer = Adam(learning_rate=0.1) + # Input layer + inputs = Input(shape=(2139,)) + + # First dense layer + x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs) + x = Dropout(0.3)(x) + x = BatchNormalization()(x) + + # Additional dense layers + for units in [512,256, 256]: + x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x) + x = Dropout(0.2)(x) + x = BatchNormalization()(x) + + # Reshape for attention mechanism + x = Reshape((256, 1))(x) + + # Attention mechanism + attention = Dense(256, activation='relu')(x) + attention = Dense(1, activation='softmax')(attention) + + # Apply attention + x = Multiply()([x, attention]) + + # Global average pooling + x = GlobalAveragePooling1D()(x) + + # Output layer + outputs = Dense(1, activation='sigmoid')(x) + + # Create the model + model = Model(inputs=inputs, outputs=outputs) + + # Optimizer with a lower learning rate + optimizer = Adam(learning_rate=0.1, clipnorm = 1.0) # Compile the model model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy']) @@ -87,10 +95,10 @@ class FundamentalPredictor: checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras', save_best_only=True, save_freq = 1, monitor='val_loss', mode='min') - early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True) - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00001) + early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True) + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001) - self.model.fit(X_train, y_train, epochs=100_000, batch_size=64, + self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr]) self.model.save('ml_models/weights/fundamental_weights/weights.keras') diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py new file mode 100644 index 0000000..ced08bb --- /dev/null +++ b/app/ml_models/score_model.py @@ -0,0 +1,137 @@ +import yfinance as yf +import pandas as pd +from datetime import datetime, timedelta +from sklearn.ensemble import RandomForestClassifier +import numpy as np +from xgboost import XGBClassifier +from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MinMaxScaler, StandardScaler +from keras.models import Sequential, Model +from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Conv1D, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional +from keras.optimizers import Adam 
+from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau +from keras.models import load_model +from sklearn.feature_selection import SelectKBest, f_classif +from tensorflow.keras.backend import clear_session +from keras import regularizers +from keras.layers import Layer + + +from tqdm import tqdm +from collections import defaultdict +import asyncio +import aiohttp +import aiofiles +import pickle +import time + +# Based on the paper: https://arxiv.org/pdf/1603.00751 + + +class ScorePredictor: + def __init__(self): + self.scaler = MinMaxScaler() + self.model = self.build_model() + + def build_model(self): + clear_session() + + # Input layer + inputs = Input(shape=(2139,)) + + # First dense layer + x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs) + x = Dropout(0.3)(x) + x = BatchNormalization()(x) + + # Additional dense layers + for units in [512,256, 256]: + x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x) + x = Dropout(0.2)(x) + x = BatchNormalization()(x) + + # Reshape for attention mechanism + x = Reshape((256, 1))(x) + + # Attention mechanism + attention = Dense(256, activation='relu')(x) + attention = Dense(1, activation='softmax')(attention) + + # Apply attention + x = Multiply()([x, attention]) + + # Global average pooling + x = GlobalAveragePooling1D()(x) + + # Output layer + outputs = Dense(1, activation='sigmoid')(x) + + # Create the model + model = Model(inputs=inputs, outputs=outputs) + + # Optimizer with a lower learning rate + optimizer = Adam(learning_rate=0.1, clipnorm = 1.0) + + # Compile the model + model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy']) + + return model + + def preprocess_data(self, X): + # X = X.applymap(lambda x: 9999 if x == 0 else x) # Replace 0 with 9999 as suggested in the paper + X = np.where(np.isinf(X), np.nan, X) + X = np.nan_to_num(X) + X = self.scaler.fit_transform(X) + return X + + def reshape_for_lstm(self, X): + return X.reshape((X.shape[0], X.shape[1], 1)) + + def train_model(self, X_train, y_train): + X_train = self.preprocess_data(X_train) + #X_train = self.reshape_for_lstm(X_train) + + checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras', + save_best_only=True, save_freq = 1, + monitor='val_loss', mode='min') + early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True) + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001) + + self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, + validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr]) + self.model.save('ml_models/weights/fundamental_weights/weights.keras') + + def evaluate_model(self, X_test, y_test): + X_test = self.preprocess_data(X_test) + X_test = self.reshape_for_lstm(X_test) + + self.model = load_model('ml_models/weights/fundamental_weights/weights.keras') + + test_predictions = self.model.predict(X_test).flatten() + + test_predictions[test_predictions >= 0.5] = 1 + test_predictions[test_predictions < 0.5] = 0 + + test_precision = precision_score(y_test, test_predictions) + test_accuracy = accuracy_score(y_test, test_predictions) + + print("Test Set Metrics:") + print(f"Precision: {round(test_precision * 100)}%") + print(f"Accuracy: {round(test_accuracy * 100)}%") + + next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0 + return {'accuracy': round(test_accuracy * 100), + 'precision': round(test_precision * 100), + 'sentiment': 
'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
+    def feature_selection(self, X_train, y_train, k=100):
+        print('feature selection:')
+        print(X_train.shape, y_train.shape)
+        # Fit SelectKBest (ANOVA F-test) on the training data and keep the names of the k best columns
+        selector = SelectKBest(score_func=f_classif, k=k)
+        selector.fit(X_train, y_train)
+
+        selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
+
+        return selected_features
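
For illustration, a minimal standalone sketch of the vectorized ratio-feature construction now used in download_data; the three columns below stand in for the full key_elements list and the values are made up:

import numpy as np
import pandas as pd
from itertools import combinations

# Toy stand-in for df_combined (the merged fundamentals in cron_ai_score.py)
df_combined = pd.DataFrame({
    'revenue': [100.0, 120.0, 0.0],
    'costOfRevenue': [60.0, 0.0, 70.0],
    'ownersEarnings': [10.0, 15.0, -5.0],
})
key_elements = ['revenue', 'costOfRevenue', 'ownersEarnings']

new_columns = {}
for num, denom in combinations(key_elements, 2):
    # Both directions of each ratio; divisions by zero produce inf/nan here ...
    ratio = df_combined[num] / df_combined[denom]
    reverse_ratio = df_combined[denom] / df_combined[num]
    # ... and are zeroed out in one pass instead of a per-column try/except
    new_columns[f'{num}_to_{denom}'] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
    new_columns[f'{denom}_to_{num}'] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)

# One concat (plus a copy) keeps the frame defragmented instead of inserting columns one by one
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1).copy()
print(df_combined.filter(like='_to_'))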
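
A shape walk-through of the attention/pooling tail in ScorePredictor.build_model may also help. This sketch makes one assumption: the softmax is taken across the 256 reshaped positions (axis=1), the usual attention formulation, since a softmax over an axis of length 1 is constant; the demo model and layer wiring are illustrative only:

import numpy as np
from keras.layers import Input, Dense, Reshape, Multiply, Softmax, GlobalAveragePooling1D
from keras.models import Model

features = Input(shape=(256,))               # output of the last dense block
x = Reshape((256, 1))(features)              # each feature becomes one "position" with 1 channel

scores = Dense(1)(x)                         # per-position score, shape (None, 256, 1)
weights = Softmax(axis=1)(scores)            # assumed axis: normalise over the 256 positions
weighted = Multiply()([x, weights])          # re-weighted features, shape (None, 256, 1)
pooled = GlobalAveragePooling1D()(weighted)  # (None, 1) summary that feeds the sigmoid output

demo = Model(features, pooled)
print(demo.predict(np.random.rand(4, 256), verbose=0).shape)  # -> (4, 1)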