diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py index c961cb5..7a1f28a 100644 --- a/app/cron_ai_score.py +++ b/app/cron_ai_score.py @@ -119,7 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): #Threshold of enough datapoints needed! if len(ratios) < 50: - print('Not enough data points') + print(f'Not enough data points for {ticker}') return @@ -225,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): # Compute combinations for each group of columns compute_column_ratios(fundamental_columns, df_combined, new_columns) compute_column_ratios(stats_columns, df_combined, new_columns) - #compute_column_ratios(ta_columns, df_combined, new_columns) + compute_column_ratios(ta_columns, df_combined, new_columns) # Concatenate the new ratio columns with the original DataFrame df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1) @@ -272,6 +272,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size): chunk_results = await asyncio.gather(*tasks) train_list = [] + test_list = [] for ticker, df in zip(chunk, chunk_results): try: @@ -280,24 +281,19 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size): train_data = df.iloc[:split_size] test_data = df.iloc[split_size:] - # Store test data for this ticker in a dictionary - df_test_dict[ticker] = test_data - # Append train data for combined training train_list.append(train_data) - - # Collect all test data for overall evaluation - all_test_data.append(test_data) - + test_list.append(test_data) except: pass # Concatenate all train data together - if train_list: - df_train = pd.concat(train_list, ignore_index=True) + df_train = pd.concat(train_list, ignore_index=True) + df_test = pd.concat(test_list, ignore_index=True) # Shuffle the combined training data df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True) + df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True) print('====== Start Training Model on Combined Data ======') predictor = ScorePredictor() @@ -308,40 +304,50 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size): print(f'Training complete on {len(df_train)} samples.') # Evaluate the model on the overall test dataset - if all_test_data: - overall_test_data = pd.concat(all_test_data, ignore_index=True) - print('====== Evaluating on Overall Test Dataset ======') - overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target']) - print(f'Overall Evaluation Metrics: {overall_evaluation_data}') - - # Evaluate the model for each ticker separately - for ticker, test_data in df_test_dict.items(): - try: - print(f"Fine-tuning the model for {ticker}") - predictor.fine_tune_model(df_train[selected_features], df_train['Target']) - - print(f"Evaluating model for {ticker}") - data = predictor.evaluate_model(test_data[selected_features], test_data['Target']) - - # Check if the evaluation data meets the criteria - - if (data['precision'] >= 50 and data['accuracy'] >= 50 and - data['accuracy'] < 100 and data['precision'] < 100 and - data['f1_score'] >= 50 and data['recall_score'] >= 50 and - data['roc_auc_score'] >= 50): - # Save the evaluation data to a JSON file - await save_json(ticker, data) - print(f"Saved results for {ticker}") - except Exception as e: - print(e) - pass - + print('====== Evaluating on Overall Test Dataset ======') + data = predictor.evaluate_model(df_test[selected_features], df_test['Target']) + print(f'Overall Evaluation Metrics: {data}') + async def warm_start_training(tickers, con, skip_downloading): dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100) +async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading): + try: + df_train = pd.DataFrame() + df_test_dict = {} # Store test data for each ticker + all_test_data = [] # Store all test data for overall evaluation + + df = await download_data(ticker, con, start_date, end_date, skip_downloading) + split_size = int(len(df) * (1 - test_size)) + df_train = df.iloc[:split_size] + df_test = df.iloc[split_size:] + + # Shuffle the combined training data + df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True) + + print('====== Start Fine-tuning Model ======') + predictor = ScorePredictor() + selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] + + # Train the model on the combined training data + predictor.fine_tune_model(df_train[selected_features], df_train['Target']) + print(f'Training complete on {len(df_train)} samples.') + print(f"Evaluating model for {ticker}") + data = predictor.evaluate_model(df_test[selected_features], df_test['Target']) + print(f'Overall Evaluation Metrics: {data}') + if (data['precision'] >= 50 and data['accuracy'] >= 50 and + data['accuracy'] < 100 and data['precision'] < 100 and + data['f1_score'] >= 50 and data['recall_score'] >= 50 and + data['roc_auc_score'] >= 50): + # Save the evaluation data to a JSON file + await save_json(ticker, data) + print(f"Saved results for {ticker}") + except: + pass + async def run(): train_mode = True # Set this to False for fine-tuning and evaluation skip_downloading = False @@ -351,6 +357,14 @@ async def run(): if train_mode: # Warm start training + warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO'])) + + print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}') + await warm_start_training(warm_start_symbols, con, skip_downloading) + else: + start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") + end_date = datetime.today().strftime("%Y-%m-%d") + test_size = 0.2 cursor.execute(""" SELECT DISTINCT symbol FROM stocks @@ -358,11 +372,11 @@ async def run(): AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%' """) - warm_start_symbols = ['PEP'] #[row[0] for row in cursor.fetchall()] + stock_symbols = [row[0] for row in cursor.fetchall()] + for ticker in tqdm(stock_symbols): + await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading) + - print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}') - await warm_start_training(warm_start_symbols, con, skip_downloading) - con.close() if __name__ == "__main__": diff --git a/app/cron_options_bubble.py b/app/cron_options_bubble.py index 10bd09c..262b287 100755 --- a/app/cron_options_bubble.py +++ b/app/cron_options_bubble.py @@ -42,11 +42,13 @@ def options_bubble_data(chunk): start_date_str = start_date.strftime('%Y-%m-%d') res_list = [] - for page in range(0, 5000): + page = 0 + while True: try: data = fin.options_activity(company_tickers=company_tickers, page=page, pagesize=1000, date_from=start_date_str, date_to=end_date_str) data = ujson.loads(fin.output(data))['option_activity'] res_list += data + page +=1 except: break @@ -54,33 +56,39 @@ def options_bubble_data(chunk): for option_type in ['CALL', 'PUT']: for item in res_filtered: - if item['put_call'].upper() == option_type: - item['dte'] = calculate_dte(item['date_expiration']) - if item['ticker'] in ['BRK.A', 'BRK.B']: - item['ticker'] = f"BRK-{item['ticker'][-1]}" + try: + if item['put_call'].upper() == option_type: + item['dte'] = calculate_dte(item['date_expiration']) + if item['ticker'] in ['BRK.A', 'BRK.B']: + item['ticker'] = f"BRK-{item['ticker'][-1]}" + except: + pass #Save raw data for each ticker for options page stack bar chart for ticker in chunk: - ticker_filtered_data = [entry for entry in res_filtered if entry['ticker'] == ticker] - if len(ticker_filtered_data) != 0: - #sum up calls and puts for each day for the plot - summed_data = {} - for entry in ticker_filtered_data: - volume = int(entry['volume']) - open_interest = int(entry['open_interest']) - put_call = entry['put_call'] - - if entry['date'] not in summed_data: - summed_data[entry['date']] = {'CALL': {'volume': 0, 'open_interest': 0}, 'PUT': {'volume': 0, 'open_interest': 0}} - - summed_data[entry['date']][put_call]['volume'] += volume - summed_data[entry['date']][put_call]['open_interest'] += open_interest + try: + ticker_filtered_data = [entry for entry in res_filtered if entry['ticker'] == ticker] + if len(ticker_filtered_data) != 0: + #sum up calls and puts for each day for the plot + summed_data = {} + for entry in ticker_filtered_data: + volume = int(entry['volume']) + open_interest = int(entry['open_interest']) + put_call = entry['put_call'] + + if entry['date'] not in summed_data: + summed_data[entry['date']] = {'CALL': {'volume': 0, 'open_interest': 0}, 'PUT': {'volume': 0, 'open_interest': 0}} + + summed_data[entry['date']][put_call]['volume'] += volume + summed_data[entry['date']][put_call]['open_interest'] += open_interest - result_list = [{'date': date, 'CALL': summed_data[date]['CALL'], 'PUT': summed_data[date]['PUT']} for date in summed_data] - #reverse the list - result_list = result_list[::-1] - with open(f"json/options-flow/company/{ticker}.json", 'w') as file: - ujson.dump(result_list, file) + result_list = [{'date': date, 'CALL': summed_data[date]['CALL'], 'PUT': summed_data[date]['PUT']} for date in summed_data] + #reverse the list + result_list = result_list[::-1] + with open(f"json/options-flow/company/{ticker}.json", 'w') as file: + ujson.dump(result_list, file) + except: + pass #Save bubble data for each ticker for overview page for ticker in chunk: @@ -131,7 +139,7 @@ async def main(): chunk_size = len(total_symbols) // 2000 # Divide the list into N chunks chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)] - print(chunks) + loop = asyncio.get_running_loop() with ThreadPoolExecutor(max_workers=4) as executor: tasks = [loop.run_in_executor(executor, options_bubble_data, chunk) for chunk in chunks] diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc index 5e76303..f11a41b 100644 Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ diff --git a/app/ml_models/lstm.py b/app/ml_models/lstm.py index 095276e..753dc4e 100755 --- a/app/ml_models/lstm.py +++ b/app/ml_models/lstm.py @@ -23,7 +23,7 @@ class StockPredictor: self.ticker = ticker self.start_date = start_date self.end_date = end_date - self.nth_day = 60 + self.nth_day = 10 self.model = None #RandomForestClassifier(n_estimators=3500, min_samples_split=100, random_state=42, n_jobs=-1) #XGBClassifier(n_estimators=200, max_depth=2, learning_rate=1, objective='binary:logistic') self.horizons = [3,5,10, 15, 20] self.test_size = 0.2 @@ -134,19 +134,19 @@ class StockPredictor: model.add(Dropout(0.2)) model.add(Dense(units=1, activation='sigmoid')) - # Learning rate scheduler - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001) - # Early stopping - early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) - model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy']) + model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) - return model, [reduce_lr, early_stop] + return model def train_model(self, X_train, y_train): - self.model, callbacks = self.build_lstm_model((X_train.shape[1], X_train.shape[2])) - history = self.model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.1, callbacks=callbacks) + # Learning rate scheduler + #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001) + # Early stopping + early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) + self.model = self.build_lstm_model((X_train.shape[1], X_train.shape[2])) + history = self.model.fit(X_train, y_train, epochs=500, batch_size=1024, validation_split=0.1, callbacks=[early_stop]) def evaluate_model(self, X_test, y_test): # Reshape X_test to remove the extra dimension @@ -202,7 +202,7 @@ if __name__ == "__main__": X = df[predictors].values y = df['Target'].values - + print(df) # Normalize features scaler = MinMaxScaler(feature_range=(0, 1)) X = scaler.fit_transform(X) diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py index ba52263..d90bc67 100644 --- a/app/ml_models/score_model.py +++ b/app/ml_models/score_model.py @@ -2,8 +2,13 @@ import pandas as pd from datetime import datetime, timedelta import numpy as np from xgboost import XGBClassifier +from sklearn.ensemble import StackingClassifier, RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.svm import SVC +from sklearn.linear_model import LogisticRegression from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score -from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import MinMaxScaler, StandardScaler from sklearn.decomposition import PCA import lightgbm as lgb @@ -19,97 +24,119 @@ import os class ScorePredictor: def __init__(self): - self.scaler = MinMaxScaler() - self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance - self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl' - self.model = lgb.LGBMClassifier( - n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time - learning_rate=0.001, # Smaller learning rate for better generalization - max_depth=6, # Controlled depth to prevent overfitting - num_leaves=2**6-1, # 2^max_depth, prevents overfitting while maintaining model complexity - colsample_bytree=0.1, - n_jobs=10, # Use N CPU cores - verbose=0, # Reduce output noise + self.scaler = StandardScaler() + self.pca = PCA(n_components=0.95) + + # Define base models + self.xgb_model = XGBClassifier( + n_estimators=100, + max_depth=10, + learning_rate=0.001, + random_state=42, + n_jobs=10, + tree_method='gpu_hist', ) ''' - XGBClassifier( - n_estimators=200, - max_depth=5, - learning_rate=0.1, - random_state=42, + self.lgb_model = lgb.LGBMClassifier( + n_estimators=100, + learning_rate=0.001, + max_depth=10, n_jobs=10 ) ''' + self.rf_model = RandomForestClassifier( + n_estimators=100, + max_depth=10, + random_state=42, + n_jobs=10 + ) + + self.svc_model = SVC(probability=True, kernel='rbf') + self.knn_model = KNeighborsClassifier(n_neighbors=5) + self.nb_model = GaussianNB() + + + # Stacking ensemble (XGBoost + LightGBM) with Logistic Regression as meta-learner + self.model = StackingClassifier( + estimators=[ + ('xgb', self.xgb_model), + #('lgb', self.lgb_model), + ('rf', self.rf_model), + ('svc', self.svc_model), + ('knn', self.knn_model), + ('nb', self.nb_model) + ], + final_estimator=LogisticRegression(), + n_jobs=10 + ) + + self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl' def preprocess_train_data(self, X): - """Preprocess training data by scaling and applying PCA.""" X = np.where(np.isinf(X), np.nan, X) X = np.nan_to_num(X) - X = self.scaler.fit_transform(X) # Transform using the fitted scaler - return self.pca.fit_transform(X) # Fit PCA and transform + X = self.scaler.fit_transform(X) + return self.pca.fit_transform(X) def preprocess_test_data(self, X): - """Preprocess test data by scaling and applying PCA.""" X = np.where(np.isinf(X), np.nan, X) X = np.nan_to_num(X) - X = self.scaler.transform(X) # Transform using the fitted scaler - return self.pca.transform(X) # Transform using the fitted PCA + X = self.scaler.transform(X) + return self.pca.transform(X) def warm_start_training(self, X_train, y_train): X_train = self.preprocess_train_data(X_train) if os.path.exists(self.warm_start_model_path): - with open(f'{self.warm_start_model_path}', 'rb') as f: + with open(self.warm_start_model_path, 'rb') as f: self.model = pickle.load(f) self.model.fit(X_train, y_train) - pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb')) + pickle.dump(self.model, open(self.warm_start_model_path, 'wb')) print("Warm start model saved.") - def fine_tune_model(self, X_train, y_train): X_train = self.preprocess_train_data(X_train) - with open(f'{self.warm_start_model_path}', 'rb') as f: + with open(self.warm_start_model_path, 'rb') as f: self.model = pickle.load(f) self.model.fit(X_train, y_train) print("Model fine-tuned") - def evaluate_model(self, X_test, y_test): X_test = self.preprocess_test_data(X_test) - test_predictions = self.model.predict_proba(X_test) class_1_probabilities = test_predictions[:, 1] binary_predictions = (class_1_probabilities >= 0.5).astype(int) - #print(test_predictions) + + # Calculate and print metrics test_precision = precision_score(y_test, binary_predictions) test_accuracy = accuracy_score(y_test, binary_predictions) test_f1_score = f1_score(y_test, binary_predictions) test_recall_score = recall_score(y_test, binary_predictions) test_roc_auc_score = roc_auc_score(y_test, binary_predictions) - - print("Test Set Metrics:") - print(f"Precision: {round(test_precision * 100)}%") - print(f"Accuracy: {round(test_accuracy * 100)}%") + + print(f"Test Precision: {round(test_precision * 100)}%") + print(f"Test Accuracy: {round(test_accuracy * 100)}%") print(f"F1 Score: {round(test_f1_score * 100)}%") - print(f"Recall Score: {round(test_recall_score * 100)}%") - print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%") - - print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions})) + print(f"Recall: {round(test_recall_score * 100)}%") + print(f"ROC AUC: {round(test_roc_auc_score * 100)}%") + + last_prediction_prob = class_1_probabilities[-1] + print(f"Last prediction probability: {last_prediction_prob}") + thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0] scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - last_prediction_prob = class_1_probabilities[-1] - score = None - print(f"Last prediction probability: {last_prediction_prob}") - + score = None for threshold, value in zip(thresholds, scores): if last_prediction_prob >= threshold: score = value break - return {'accuracy': round(test_accuracy * 100), - 'precision': round(test_precision * 100), - 'f1_score': round(test_f1_score * 100), - 'recall_score': round(test_recall_score * 100), - 'roc_auc_score': round(test_roc_auc_score * 100), - 'score': score} \ No newline at end of file + return { + 'accuracy': round(test_accuracy * 100), + 'precision': round(test_precision * 100), + 'f1_score': round(test_f1_score * 100), + 'recall_score': round(test_recall_score * 100), + 'roc_auc_score': round(test_roc_auc_score * 100), + 'score': score + } \ No newline at end of file diff --git a/app/primary_cron_job.py b/app/primary_cron_job.py index 6a2dcc1..697a294 100755 --- a/app/primary_cron_job.py +++ b/app/primary_cron_job.py @@ -283,10 +283,7 @@ def run_executive(): def run_options_bubble_ticker(): week = datetime.today().weekday() - current_time = datetime.now().time() - start_time = datetime_time(15, 30) - end_time = datetime_time(22, 30) - if week <= 4 and start_time <= current_time < end_time: + if week <= 4: run_command(["python3", "cron_options_bubble.py"]) command = ["sudo", "rsync", "-avz", "-e", "ssh", "/root/backend/app/json/options-bubble", f"root@{useast_ip_address}:/root/backend/app/json"] diff --git a/app/utils/__pycache__/feature_engineering.cpython-310.pyc b/app/utils/__pycache__/feature_engineering.cpython-310.pyc index 1929f3d..578a0ae 100644 Binary files a/app/utils/__pycache__/feature_engineering.cpython-310.pyc and b/app/utils/__pycache__/feature_engineering.cpython-310.pyc differ diff --git a/app/utils/feature_engineering.py b/app/utils/feature_engineering.py index aa38e2a..e8f20b5 100644 --- a/app/utils/feature_engineering.py +++ b/app/utils/feature_engineering.py @@ -94,19 +94,20 @@ def generate_ta_features(df): df_features['aroon_indicator'] = aroon.aroon_indicator() df_features['aroon_up'] = aroon.aroon_up() - df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator() - df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14) + #df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator() + #df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14) df_features['ulcer'] = UlcerIndex(df['close'],window=60).ulcer_index() - df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60) - df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60) + #df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60) + #df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60) df_features = df_features.dropna() return df_features -def generate_statistical_features(df, windows=[50,200], price_col='close', +def generate_statistical_features(df, windows=[20,50,200], price_col='close', high_col='high', low_col='low', volume_col='volume'): """ Generate comprehensive statistical features for financial time series data. + Focuses purely on statistical measures without technical indicators. Parameters: ----------- @@ -132,7 +133,6 @@ def generate_statistical_features(df, windows=[50,200], price_col='close', # Create a copy of the dataframe to avoid modifying the original df_features = df.copy() - # Calculate features for each window size for window in windows: # Returns @@ -144,11 +144,18 @@ def generate_statistical_features(df, windows=[50,200], price_col='close', df_features[f'log_returns_std_{window}'] = log_returns.rolling(window=window).std() # Statistical moments + df_features[f'mean_{window}'] = df[price_col].rolling(window=window).mean() df_features[f'std_{window}'] = df[price_col].rolling(window=window).std() df_features[f'var_{window}'] = df[price_col].rolling(window=window).var() df_features[f'skew_{window}'] = df[price_col].rolling(window=window).skew() df_features[f'kurt_{window}'] = df[price_col].rolling(window=window).kurt() + # Quantile measures + df_features[f'quantile_25_{window}'] = df[price_col].rolling(window=window).quantile(0.25) + df_features[f'quantile_75_{window}'] = df[price_col].rolling(window=window).quantile(0.75) + df_features[f'iqr_{window}'] = ( + df_features[f'quantile_75_{window}'] - df_features[f'quantile_25_{window}']) + # Volatility measures df_features[f'realized_vol_{window}'] = ( df_features[f'returns_{window}'].rolling(window=window).std() * np.sqrt(252)) @@ -156,33 +163,48 @@ def generate_statistical_features(df, windows=[50,200], price_col='close', (df[high_col].rolling(window=window).max() - df[low_col].rolling(window=window).min()) / df[price_col]) - # Z-scores and normalized prices + # Z-scores and normalized values df_features[f'zscore_{window}'] = ( (df[price_col] - df[price_col].rolling(window=window).mean()) / df[price_col].rolling(window=window).std()) - - - # Price dynamics + + # Volume statistics + df_features[f'volume_mean_{window}'] = df[volume_col].rolling(window=window).mean() + df_features[f'volume_std_{window}'] = df[volume_col].rolling(window=window).std() + df_features[f'volume_zscore_{window}'] = ( + (df[volume_col] - df[volume_col].rolling(window=window).mean()) / + df[volume_col].rolling(window=window).std()) + df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew() + df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt() + + # Price-volume correlations + df_features[f'price_volume_corr_{window}'] = ( + df[price_col].rolling(window=window) + .corr(df[volume_col])) + + # Higher-order moments of returns + returns = df[price_col].pct_change() + df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew() + df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt() + + # Cross-sectional statistics df_features['price_acceleration'] = df[price_col].diff().diff() - df_features['momentum_change'] = df[price_col].pct_change().diff() + df_features['returns_acceleration'] = df[price_col].pct_change().diff() - # Advanced volatility + # Advanced volatility estimators df_features['parkinson_vol'] = np.sqrt( 1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2)) - # Efficiency ratio - df_features['price_efficiency'] = ( - abs(df[price_col] - df[price_col].shift(20)) / - (df[high_col].rolling(20).max() - df[low_col].rolling(20).min()) + df_features['garman_klass_vol'] = np.sqrt( + 0.5 * np.log(df[high_col]/df[low_col])**2 - + (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2 ) - # Deviation metrics - df_features['deviation_from_vwap'] = ( - (df[price_col] - df[price_col].rolling(window=20).mean()) / - df[price_col].rolling(window=20).mean() - ) - - df_features['stock_return'] = df['close'].pct_change() + # Dispersion measures + df_features['price_range'] = df[high_col] - df[low_col] + df_features['price_range_pct'] = df_features['price_range'] / df[price_col] + # Clean up any NaN values df_features = df_features.dropna() - return df_features + + return df_features \ No newline at end of file