diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 565c358..d8007b6 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -80,7 +80,7 @@ def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75)
             selected_features.append(feature)
     return selected_features
 
-async def download_data(ticker, con, start_date, end_date, skip_downloading):
+async def download_data(ticker, con, start_date, end_date, skip_downloading, save_data):
 
     file_path = f"ml_models/training_data/ai-score/{ticker}.json"
@@ -200,6 +200,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
             'operatingCashFlow','cashAndCashEquivalents', 'totalEquity','otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
             'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets',
         ]
+
         # Function to compute combinations within a group
         def compute_column_ratios(columns, df, new_columns):
@@ -240,7 +241,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         df_copy = df_combined.copy().map(lambda x: round(x, 2) if isinstance(x, float) else x)
 
         # Save to a file if there are rows in the DataFrame
-        if not df_copy.empty:
+        if not df_copy.empty and save_data == True:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
@@ -251,7 +252,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         pass
 
 
-async def chunked_gather(tickers, con, skip_downloading, chunk_size):
+async def chunked_gather(tickers, con, skip_downloading, save_data, chunk_size):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
@@ -267,7 +268,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
         print(f"chunk size: {len(chunk)}")
-        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
+        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading, save_data) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
@@ -309,18 +310,18 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     print(f'Overall Evaluation Metrics: {data}')
 
 
-async def warm_start_training(tickers, con, skip_downloading):
+async def warm_start_training(tickers, con, skip_downloading, save_data):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
+    dfs = await chunked_gather(tickers, con, skip_downloading, save_data, chunk_size=100)
 
 
-async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data):
     try:
         df_train = pd.DataFrame()
         df_test_dict = {} # Store test data for each ticker
         all_test_data = [] # Store all test data for overall evaluation
-        df = await download_data(ticker, con, start_date, end_date, skip_downloading)
+        df = await download_data(ticker, con, start_date, end_date, skip_downloading, save_data)
         split_size = int(len(df) * (1 - test_size))
         df_train = df.iloc[:split_size]
         df_test = df.iloc[split_size:]
@@ -345,22 +346,24 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, s
         # Save the evaluation data to a JSON file
         await save_json(ticker, data)
         print(f"Saved results for {ticker}")
-    except:
+    except Exception as e:
+        print(e)
         pass
 
 
 async def run():
     train_mode = False # Set this to False for fine-tuning and evaluation
     skip_downloading = False
+    save_data = train_mode
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
 
     if train_mode:
         # Warm start training
-        warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
+        warm_start_symbols = list(set(['CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
         print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
-        await warm_start_training(warm_start_symbols, con, skip_downloading)
+        await warm_start_training(warm_start_symbols, con, skip_downloading, save_data)
     else:
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
@@ -374,7 +377,7 @@ async def run():
         """)
         stock_symbols = [row[0] for row in cursor.fetchall()]
         for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading)
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data)
 
     con.close()
diff --git a/app/cron_options_gex.py b/app/cron_options_gex.py
index 4383e8f..f44274e 100644
--- a/app/cron_options_gex.py
+++ b/app/cron_options_gex.py
@@ -367,7 +367,7 @@ etf_cursor.execute("PRAGMA journal_mode = wal")
 etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
 etf_symbols = [row[0] for row in etf_cursor.fetchall()]
 
-total_symbols = ['SPY'] #stock_symbols + etf_symbols
+total_symbols = stock_symbols + etf_symbols
 
 query_template = """
     SELECT date, close,change_percent
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index f11a41b..4bca0ec 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index d90bc67..50731ce 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -1,14 +1,8 @@
 import pandas as pd
 from datetime import datetime, timedelta
 import numpy as np
-from xgboost import XGBClassifier
-from sklearn.ensemble import StackingClassifier, RandomForestClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.svm import SVC
-from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.preprocessing import MinMaxScaler
 from sklearn.decomposition import PCA
 import lightgbm as lgb
 
@@ -24,52 +18,19 @@ import os
 class ScorePredictor:
     def __init__(self):
-        self.scaler = StandardScaler()
+        self.scaler = MinMaxScaler()
         self.pca = PCA(n_components=0.95)
 
         # Define base models
-        self.xgb_model = XGBClassifier(
-            n_estimators=100,
-            max_depth=10,
-            learning_rate=0.001,
-            random_state=42,
-            n_jobs=10,
-            tree_method='gpu_hist',
-        )
-        '''
         self.lgb_model = lgb.LGBMClassifier(
-            n_estimators=100,
+            n_estimators=2000,
             learning_rate=0.001,
-            max_depth=10,
+            max_depth=5,
+            num_leaves=2**5-1,
             n_jobs=10
         )
-        '''
-        self.rf_model = RandomForestClassifier(
-            n_estimators=100,
-            max_depth=10,
-            random_state=42,
-            n_jobs=10
-        )
-
-        self.svc_model = SVC(probability=True, kernel='rbf')
-        self.knn_model = KNeighborsClassifier(n_neighbors=5)
-        self.nb_model = GaussianNB()
-        # Stacking ensemble (XGBoost + LightGBM) with Logistic Regression as meta-learner
-        self.model = StackingClassifier(
-            estimators=[
-                ('xgb', self.xgb_model),
-                #('lgb', self.lgb_model),
-                ('rf', self.rf_model),
-                ('svc', self.svc_model),
-                ('knn', self.knn_model),
-                ('nb', self.nb_model)
-            ],
-            final_estimator=LogisticRegression(),
-            n_jobs=10
-        )
-
         self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'
 
     def preprocess_train_data(self, X):
@@ -87,8 +48,8 @@ class ScorePredictor:
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_train_data(X_train)
         if os.path.exists(self.warm_start_model_path):
-            with open(self.warm_start_model_path, 'rb') as f:
-                self.model = pickle.load(f)
+            os.remove(self.warm_start_model_path)
+
         self.model.fit(X_train, y_train)
         pickle.dump(self.model, open(self.warm_start_model_path, 'wb'))
         print("Warm start model saved.")
@@ -121,6 +82,7 @@ class ScorePredictor:
         print(f"ROC AUC: {round(test_roc_auc_score * 100)}%")
 
         last_prediction_prob = class_1_probabilities[-1]
+        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
         print(f"Last prediction probability: {last_prediction_prob}")
 
         thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
diff --git a/app/utils/__pycache__/feature_engineering.cpython-310.pyc b/app/utils/__pycache__/feature_engineering.cpython-310.pyc
index c18cfcc..7bd3eea 100644
Binary files a/app/utils/__pycache__/feature_engineering.cpython-310.pyc and b/app/utils/__pycache__/feature_engineering.cpython-310.pyc differ
diff --git a/app/utils/feature_engineering.py b/app/utils/feature_engineering.py
index e8f20b5..c3eea7a 100644
--- a/app/utils/feature_engineering.py
+++ b/app/utils/feature_engineering.py
@@ -177,33 +177,6 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close',
         df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew()
         df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt()
 
-        # Price-volume correlations
-        df_features[f'price_volume_corr_{window}'] = (
-            df[price_col].rolling(window=window)
-            .corr(df[volume_col]))
-
-        # Higher-order moments of returns
-        returns = df[price_col].pct_change()
-        df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew()
-        df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt()
-
-    # Cross-sectional statistics
-    df_features['price_acceleration'] = df[price_col].diff().diff()
-    df_features['returns_acceleration'] = df[price_col].pct_change().diff()
-
-    # Advanced volatility estimators
-    df_features['parkinson_vol'] = np.sqrt(
-        1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))
-
-    df_features['garman_klass_vol'] = np.sqrt(
-        0.5 * np.log(df[high_col]/df[low_col])**2 -
-        (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2
-    )
-
-    # Dispersion measures
-    df_features['price_range'] = df[high_col] - df[low_col]
-    df_features['price_range_pct'] = df_features['price_range'] / df[price_col]
-
     # Clean up any NaN values
     df_features = df_features.dropna()