bugfixing options bubble data

MuslemRahimi 2024-10-07 15:13:33 +02:00
parent 1ee1e10e72
commit 0513fced3d
8 changed files with 222 additions and 154 deletions

View File

@@ -119,7 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     #Threshold of enough datapoints needed!
     if len(ratios) < 50:
-        print('Not enough data points')
+        print(f'Not enough data points for {ticker}')
         return
@@ -225,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     # Compute combinations for each group of columns
    compute_column_ratios(fundamental_columns, df_combined, new_columns)
    compute_column_ratios(stats_columns, df_combined, new_columns)
-   #compute_column_ratios(ta_columns, df_combined, new_columns)
+   compute_column_ratios(ta_columns, df_combined, new_columns)

    # Concatenate the new ratio columns with the original DataFrame
    df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
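Note: the body of compute_column_ratios is not part of this diff. As a rough illustration only, a helper of this shape could build pairwise ratio features for a group of columns; the name, signature, and behavior below are assumptions, not the repository's implementation.

import numpy as np

def compute_column_ratios(columns, df, new_columns):
    # Hypothetical sketch: derive pairwise ratio features between the given columns
    # and stash them in the new_columns dict; the real helper is not shown in this diff.
    for i, col_a in enumerate(columns):
        for col_b in columns[i + 1:]:
            denom = df[col_b].replace(0, np.nan)
            new_columns[f'{col_a}_to_{col_b}'] = (df[col_a] / denom).replace([np.inf, -np.inf], np.nan)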
@@ -272,6 +272,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
        chunk_results = await asyncio.gather(*tasks)

        train_list = []
+       test_list = []
        for ticker, df in zip(chunk, chunk_results):
            try:
@@ -280,24 +281,19 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
                train_data = df.iloc[:split_size]
                test_data = df.iloc[split_size:]
-               # Store test data for this ticker in a dictionary
-               df_test_dict[ticker] = test_data
                # Append train data for combined training
                train_list.append(train_data)
+               test_list.append(test_data)
-               # Collect all test data for overall evaluation
-               all_test_data.append(test_data)
            except:
                pass

    # Concatenate all train data together
-   if train_list:
-       df_train = pd.concat(train_list, ignore_index=True)
+   df_train = pd.concat(train_list, ignore_index=True)
+   df_test = pd.concat(test_list, ignore_index=True)

    # Shuffle the combined training data
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+   df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

    print('====== Start Training Model on Combined Data ======')
    predictor = ScorePredictor()
@@ -308,33 +304,9 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
    print(f'Training complete on {len(df_train)} samples.')

    # Evaluate the model on the overall test dataset
-   if all_test_data:
-       overall_test_data = pd.concat(all_test_data, ignore_index=True)
-       print('====== Evaluating on Overall Test Dataset ======')
-       overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target'])
-       print(f'Overall Evaluation Metrics: {overall_evaluation_data}')
+   print('====== Evaluating on Overall Test Dataset ======')
+   data = predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+   print(f'Overall Evaluation Metrics: {data}')

-   # Evaluate the model for each ticker separately
-   for ticker, test_data in df_test_dict.items():
-       try:
-           print(f"Fine-tuning the model for {ticker}")
-           predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
-           print(f"Evaluating model for {ticker}")
-           data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-           # Check if the evaluation data meets the criteria
-           if (data['precision'] >= 50 and data['accuracy'] >= 50 and
-               data['accuracy'] < 100 and data['precision'] < 100 and
-               data['f1_score'] >= 50 and data['recall_score'] >= 50 and
-               data['roc_auc_score'] >= 50):
-               # Save the evaluation data to a JSON file
-               await save_json(ticker, data)
-               print(f"Saved results for {ticker}")
-       except Exception as e:
-           print(e)
-           pass

 async def warm_start_training(tickers, con, skip_downloading):
@@ -342,6 +314,40 @@ async def warm_start_training(tickers, con, skip_downloading):
    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)

+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
+    try:
+        df_train = pd.DataFrame()
+        df_test_dict = {}  # Store test data for each ticker
+        all_test_data = []  # Store all test data for overall evaluation
+        df = await download_data(ticker, con, start_date, end_date, skip_downloading)
+        split_size = int(len(df) * (1 - test_size))
+        df_train = df.iloc[:split_size]
+        df_test = df.iloc[split_size:]
+        # Shuffle the combined training data
+        df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+        print('====== Start Fine-tuning Model ======')
+        predictor = ScorePredictor()
+        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+        # Train the model on the combined training data
+        predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
+        print(f'Training complete on {len(df_train)} samples.')
+        print(f"Evaluating model for {ticker}")
+        data = predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+        print(f'Overall Evaluation Metrics: {data}')
+        if (data['precision'] >= 50 and data['accuracy'] >= 50 and
+            data['accuracy'] < 100 and data['precision'] < 100 and
+            data['f1_score'] >= 50 and data['recall_score'] >= 50 and
+            data['roc_auc_score'] >= 50):
+            # Save the evaluation data to a JSON file
+            await save_json(ticker, data)
+            print(f"Saved results for {ticker}")
+    except:
+        pass

 async def run():
    train_mode = True  # Set this to False for fine-tuning and evaluation
    skip_downloading = False
@@ -351,6 +357,14 @@ async def run():
    if train_mode:
        # Warm start training
+       warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
+       print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
+       await warm_start_training(warm_start_symbols, con, skip_downloading)
+   else:
+       start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
+       end_date = datetime.today().strftime("%Y-%m-%d")
+       test_size = 0.2
        cursor.execute("""
            SELECT DISTINCT symbol
            FROM stocks
@@ -358,10 +372,10 @@ async def run():
            AND symbol NOT LIKE '%.%'
            AND symbol NOT LIKE '%-%'
        """)
-       warm_start_symbols = ['PEP'] #[row[0] for row in cursor.fetchall()]
-       print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
-       await warm_start_training(warm_start_symbols, con, skip_downloading)
+       stock_symbols = [row[0] for row in cursor.fetchall()]
+       for ticker in tqdm(stock_symbols):
+           await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading)

    con.close()
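For reference, a minimal sketch of the split-then-pool pattern this file settles on: each ticker's frame is split chronologically, the slices are concatenated across tickers, and both pooled sets are shuffled. The helper name and signature below are illustrative, not from the repository.

import pandas as pd

def pool_train_test(frames, test_size=0.2, seed=42):
    # Split each per-ticker frame chronologically, then pool across tickers.
    train_list, test_list = [], []
    for df in frames:
        split = int(len(df) * (1 - test_size))
        train_list.append(df.iloc[:split])
        test_list.append(df.iloc[split:])
    df_train = pd.concat(train_list, ignore_index=True)
    df_test = pd.concat(test_list, ignore_index=True)
    # Shuffle the pooled sets with a fixed seed for reproducibility.
    df_train = df_train.sample(frac=1, random_state=seed).reset_index(drop=True)
    df_test = df_test.sample(frac=1, random_state=seed).reset_index(drop=True)
    return df_train, df_test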

View File

@@ -42,11 +42,13 @@ def options_bubble_data(chunk):
    start_date_str = start_date.strftime('%Y-%m-%d')
    res_list = []
-   for page in range(0, 5000):
+   page = 0
+   while True:
        try:
            data = fin.options_activity(company_tickers=company_tickers, page=page, pagesize=1000, date_from=start_date_str, date_to=end_date_str)
            data = ujson.loads(fin.output(data))['option_activity']
            res_list += data
+           page += 1
        except:
            break
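A minimal standalone sketch of the open-ended pagination introduced here: pages are requested until the call fails or returns nothing. The fetch_page callable is an assumed stand-in for the fin.options_activity wrapper, not an API from the repository.

def fetch_all_pages(fetch_page):
    # Keep requesting pages until the source is exhausted or a request fails.
    results, page = [], 0
    while True:
        try:
            batch = fetch_page(page)
            if not batch:
                break
            results += batch
            page += 1
        except Exception:
            break
    return results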
@@ -54,33 +56,39 @@ def options_bubble_data(chunk):
    for option_type in ['CALL', 'PUT']:
        for item in res_filtered:
-           if item['put_call'].upper() == option_type:
-               item['dte'] = calculate_dte(item['date_expiration'])
-               if item['ticker'] in ['BRK.A', 'BRK.B']:
-                   item['ticker'] = f"BRK-{item['ticker'][-1]}"
+           try:
+               if item['put_call'].upper() == option_type:
+                   item['dte'] = calculate_dte(item['date_expiration'])
+                   if item['ticker'] in ['BRK.A', 'BRK.B']:
+                       item['ticker'] = f"BRK-{item['ticker'][-1]}"
+           except:
+               pass

    #Save raw data for each ticker for options page stack bar chart
    for ticker in chunk:
-       ticker_filtered_data = [entry for entry in res_filtered if entry['ticker'] == ticker]
-       if len(ticker_filtered_data) != 0:
-           #sum up calls and puts for each day for the plot
-           summed_data = {}
-           for entry in ticker_filtered_data:
-               volume = int(entry['volume'])
-               open_interest = int(entry['open_interest'])
-               put_call = entry['put_call']
+       try:
+           ticker_filtered_data = [entry for entry in res_filtered if entry['ticker'] == ticker]
+           if len(ticker_filtered_data) != 0:
+               #sum up calls and puts for each day for the plot
+               summed_data = {}
+               for entry in ticker_filtered_data:
+                   volume = int(entry['volume'])
+                   open_interest = int(entry['open_interest'])
+                   put_call = entry['put_call']

                   if entry['date'] not in summed_data:
                       summed_data[entry['date']] = {'CALL': {'volume': 0, 'open_interest': 0}, 'PUT': {'volume': 0, 'open_interest': 0}}
                   summed_data[entry['date']][put_call]['volume'] += volume
                   summed_data[entry['date']][put_call]['open_interest'] += open_interest

               result_list = [{'date': date, 'CALL': summed_data[date]['CALL'], 'PUT': summed_data[date]['PUT']} for date in summed_data]
               #reverse the list
               result_list = result_list[::-1]
               with open(f"json/options-flow/company/{ticker}.json", 'w') as file:
                   ujson.dump(result_list, file)
+       except:
+           pass

    #Save bubble data for each ticker for overview page
    for ticker in chunk:
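A minimal sketch of the per-day CALL/PUT aggregation the loop above performs, assuming each entry carries 'date', 'put_call', 'volume', and 'open_interest' keys; the function name is illustrative.

def sum_daily_option_activity(entries):
    # Accumulate volume and open interest per day, separately for calls and puts.
    summed = {}
    for entry in entries:
        day = summed.setdefault(entry['date'], {
            'CALL': {'volume': 0, 'open_interest': 0},
            'PUT': {'volume': 0, 'open_interest': 0},
        })
        side = entry['put_call']
        day[side]['volume'] += int(entry['volume'])
        day[side]['open_interest'] += int(entry['open_interest'])
    return [{'date': d, 'CALL': v['CALL'], 'PUT': v['PUT']} for d, v in summed.items()]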
@@ -131,7 +139,7 @@ async def main():
    chunk_size = len(total_symbols) // 2000  # Divide the list into N chunks
    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
-   print(chunks)

    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=4) as executor:
        tasks = [loop.run_in_executor(executor, options_bubble_data, chunk) for chunk in chunks]

View File

@@ -23,7 +23,7 @@ class StockPredictor:
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
-       self.nth_day = 60
+       self.nth_day = 10
        self.model = None  #RandomForestClassifier(n_estimators=3500, min_samples_split=100, random_state=42, n_jobs=-1) #XGBClassifier(n_estimators=200, max_depth=2, learning_rate=1, objective='binary:logistic')
        self.horizons = [3, 5, 10, 15, 20]
        self.test_size = 0.2
@@ -134,19 +134,19 @@ class StockPredictor:
        model.add(Dropout(0.2))
        model.add(Dense(units=1, activation='sigmoid'))

-       # Learning rate scheduler
-       reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
-       # Early stopping
-       early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
-       model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
-       return model, [reduce_lr, early_stop]
+       model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
+       return model

    def train_model(self, X_train, y_train):
-       self.model, callbacks = self.build_lstm_model((X_train.shape[1], X_train.shape[2]))
-       history = self.model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.1, callbacks=callbacks)
+       # Learning rate scheduler
+       #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
+       # Early stopping
+       early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
+       self.model = self.build_lstm_model((X_train.shape[1], X_train.shape[2]))
+       history = self.model.fit(X_train, y_train, epochs=500, batch_size=1024, validation_split=0.1, callbacks=[early_stop])

    def evaluate_model(self, X_test, y_test):
        # Reshape X_test to remove the extra dimension
@@ -202,7 +202,7 @@ if __name__ == "__main__":
    X = df[predictors].values
    y = df['Target'].values
-   print(df)

    # Normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
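For context, a minimal sketch of the revised training call using the standard Keras callbacks API: early stopping is now created inside train_model, the ReduceLROnPlateau scheduler is dropped, and the batch size follows the diff. The helper name is illustrative.

from tensorflow.keras.callbacks import EarlyStopping

def train_with_early_stopping(model, X_train, y_train):
    # Stop training when validation loss stalls and restore the best weights.
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    return model.fit(X_train, y_train, epochs=500, batch_size=1024,
                     validation_split=0.1, callbacks=[early_stop])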

View File

@@ -2,8 +2,13 @@ import pandas as pd
 from datetime import datetime, timedelta
 import numpy as np
 from xgboost import XGBClassifier
+from sklearn.ensemble import StackingClassifier, RandomForestClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 from sklearn.decomposition import PCA
 import lightgbm as lgb
@@ -19,97 +24,119 @@ import os
 class ScorePredictor:
    def __init__(self):
-       self.scaler = MinMaxScaler()
-       self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
-       self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
-       self.model = lgb.LGBMClassifier(
-           n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
-           learning_rate=0.001, # Smaller learning rate for better generalization
-           max_depth=6, # Controlled depth to prevent overfitting
-           num_leaves=2**6-1, # 2^max_depth, prevents overfitting while maintaining model complexity
-           colsample_bytree=0.1,
-           n_jobs=10, # Use N CPU cores
-           verbose=0, # Reduce output noise
+       self.scaler = StandardScaler()
+       self.pca = PCA(n_components=0.95)
+       # Define base models
+       self.xgb_model = XGBClassifier(
+           n_estimators=100,
+           max_depth=10,
+           learning_rate=0.001,
+           random_state=42,
+           n_jobs=10,
+           tree_method='gpu_hist',
        )
        '''
-       XGBClassifier(
-           n_estimators=200,
-           max_depth=5,
-           learning_rate=0.1,
+       self.lgb_model = lgb.LGBMClassifier(
+           n_estimators=100,
+           learning_rate=0.001,
+           max_depth=10,
+           random_state=42,
            n_jobs=10
        )
        '''
+       self.rf_model = RandomForestClassifier(
+           n_estimators=100,
+           max_depth=10,
+           random_state=42,
+           n_jobs=10
+       )
+       self.svc_model = SVC(probability=True, kernel='rbf')
+       self.knn_model = KNeighborsClassifier(n_neighbors=5)
+       self.nb_model = GaussianNB()
+       # Stacking ensemble (XGBoost + LightGBM) with Logistic Regression as meta-learner
+       self.model = StackingClassifier(
+           estimators=[
+               ('xgb', self.xgb_model),
+               #('lgb', self.lgb_model),
+               ('rf', self.rf_model),
+               ('svc', self.svc_model),
+               ('knn', self.knn_model),
+               ('nb', self.nb_model)
+           ],
+           final_estimator=LogisticRegression(),
+           n_jobs=10
+       )
+       self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'
    def preprocess_train_data(self, X):
-       """Preprocess training data by scaling and applying PCA."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
-       X = self.scaler.fit_transform(X) # Transform using the fitted scaler
-       return self.pca.fit_transform(X) # Fit PCA and transform
+       X = self.scaler.fit_transform(X)
+       return self.pca.fit_transform(X)

    def preprocess_test_data(self, X):
-       """Preprocess test data by scaling and applying PCA."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
-       X = self.scaler.transform(X) # Transform using the fitted scaler
-       return self.pca.transform(X) # Transform using the fitted PCA
+       X = self.scaler.transform(X)
+       return self.pca.transform(X)

    def warm_start_training(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
        if os.path.exists(self.warm_start_model_path):
-           with open(f'{self.warm_start_model_path}', 'rb') as f:
+           with open(self.warm_start_model_path, 'rb') as f:
                self.model = pickle.load(f)
        self.model.fit(X_train, y_train)
-       pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
+       pickle.dump(self.model, open(self.warm_start_model_path, 'wb'))
        print("Warm start model saved.")

    def fine_tune_model(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
-       with open(f'{self.warm_start_model_path}', 'rb') as f:
+       with open(self.warm_start_model_path, 'rb') as f:
            self.model = pickle.load(f)
        self.model.fit(X_train, y_train)
        print("Model fine-tuned")
    def evaluate_model(self, X_test, y_test):
        X_test = self.preprocess_test_data(X_test)
        test_predictions = self.model.predict_proba(X_test)
        class_1_probabilities = test_predictions[:, 1]
        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
-       #print(test_predictions)
-       # Calculate and print metrics
        test_precision = precision_score(y_test, binary_predictions)
        test_accuracy = accuracy_score(y_test, binary_predictions)
        test_f1_score = f1_score(y_test, binary_predictions)
        test_recall_score = recall_score(y_test, binary_predictions)
        test_roc_auc_score = roc_auc_score(y_test, binary_predictions)

-       print("Test Set Metrics:")
-       print(f"Precision: {round(test_precision * 100)}%")
-       print(f"Accuracy: {round(test_accuracy * 100)}%")
+       print(f"Test Precision: {round(test_precision * 100)}%")
+       print(f"Test Accuracy: {round(test_accuracy * 100)}%")
        print(f"F1 Score: {round(test_f1_score * 100)}%")
-       print(f"Recall Score: {round(test_recall_score * 100)}%")
-       print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%")
+       print(f"Recall: {round(test_recall_score * 100)}%")
+       print(f"ROC AUC: {round(test_roc_auc_score * 100)}%")
+
+       last_prediction_prob = class_1_probabilities[-1]
+       print(f"Last prediction probability: {last_prediction_prob}")
+       print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))

        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
-       last_prediction_prob = class_1_probabilities[-1]
        score = None
-       print(f"Last prediction probability: {last_prediction_prob}")
        for threshold, value in zip(thresholds, scores):
            if last_prediction_prob >= threshold:
                score = value
                break

-       return {'accuracy': round(test_accuracy * 100),
-               'precision': round(test_precision * 100),
-               'f1_score': round(test_f1_score * 100),
-               'recall_score': round(test_recall_score * 100),
-               'roc_auc_score': round(test_roc_auc_score * 100),
-               'score': score}
+       return {
+           'accuracy': round(test_accuracy * 100),
+           'precision': round(test_precision * 100),
+           'f1_score': round(test_f1_score * 100),
+           'recall_score': round(test_recall_score * 100),
+           'roc_auc_score': round(test_roc_auc_score * 100),
+           'score': score
+       }

View File

@@ -283,10 +283,7 @@ def run_executive():

 def run_options_bubble_ticker():
    week = datetime.today().weekday()
-   current_time = datetime.now().time()
-   start_time = datetime_time(15, 30)
-   end_time = datetime_time(22, 30)
-   if week <= 4 and start_time <= current_time < end_time:
+   if week <= 4:
        run_command(["python3", "cron_options_bubble.py"])
        command = ["sudo", "rsync", "-avz", "-e", "ssh", "/root/backend/app/json/options-bubble", f"root@{useast_ip_address}:/root/backend/app/json"]

View File

@@ -94,19 +94,20 @@ def generate_ta_features(df):
    df_features['aroon_indicator'] = aroon.aroon_indicator()
    df_features['aroon_up'] = aroon.aroon_up()
-   df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator()
-   df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14)
+   #df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator()
+   #df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14)
    df_features['ulcer'] = UlcerIndex(df['close'], window=60).ulcer_index()
-   df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'], low=df['low'], close=df['close'], window=60)
-   df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'], low=df['low'], close=df['close'], window=60)
+   #df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'], low=df['low'], close=df['close'], window=60)
+   #df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'], low=df['low'], close=df['close'], window=60)

    df_features = df_features.dropna()
    return df_features

-def generate_statistical_features(df, windows=[50,200], price_col='close',
+def generate_statistical_features(df, windows=[20,50,200], price_col='close',
                                  high_col='high', low_col='low', volume_col='volume'):
    """
    Generate comprehensive statistical features for financial time series data.
-   Focuses purely on statistical measures without technical indicators.

    Parameters:
    -----------
@@ -132,7 +133,6 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close',
    # Create a copy of the dataframe to avoid modifying the original
    df_features = df.copy()

-   # Calculate features for each window size
    for window in windows:
        # Returns
@@ -144,11 +144,18 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close',
        df_features[f'log_returns_std_{window}'] = log_returns.rolling(window=window).std()

        # Statistical moments
+       df_features[f'mean_{window}'] = df[price_col].rolling(window=window).mean()
        df_features[f'std_{window}'] = df[price_col].rolling(window=window).std()
        df_features[f'var_{window}'] = df[price_col].rolling(window=window).var()
        df_features[f'skew_{window}'] = df[price_col].rolling(window=window).skew()
        df_features[f'kurt_{window}'] = df[price_col].rolling(window=window).kurt()

+       # Quantile measures
+       df_features[f'quantile_25_{window}'] = df[price_col].rolling(window=window).quantile(0.25)
+       df_features[f'quantile_75_{window}'] = df[price_col].rolling(window=window).quantile(0.75)
+       df_features[f'iqr_{window}'] = (
+           df_features[f'quantile_75_{window}'] - df_features[f'quantile_25_{window}'])
+
        # Volatility measures
        df_features[f'realized_vol_{window}'] = (
            df_features[f'returns_{window}'].rolling(window=window).std() * np.sqrt(252))
@@ -156,33 +163,48 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close',
            (df[high_col].rolling(window=window).max() -
             df[low_col].rolling(window=window).min()) / df[price_col])

-       # Z-scores and normalized prices
+       # Z-scores and normalized values
        df_features[f'zscore_{window}'] = (
            (df[price_col] - df[price_col].rolling(window=window).mean()) /
            df[price_col].rolling(window=window).std())

+       # Volume statistics
+       df_features[f'volume_mean_{window}'] = df[volume_col].rolling(window=window).mean()
+       df_features[f'volume_std_{window}'] = df[volume_col].rolling(window=window).std()
+       df_features[f'volume_zscore_{window}'] = (
+           (df[volume_col] - df[volume_col].rolling(window=window).mean()) /
+           df[volume_col].rolling(window=window).std())
+       df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew()
+       df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt()
+
-   # Price dynamics
+       # Price-volume correlations
+       df_features[f'price_volume_corr_{window}'] = (
+           df[price_col].rolling(window=window)
+           .corr(df[volume_col]))
+
+       # Higher-order moments of returns
+       returns = df[price_col].pct_change()
+       df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew()
+       df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt()

+   # Cross-sectional statistics
    df_features['price_acceleration'] = df[price_col].diff().diff()
-   df_features['momentum_change'] = df[price_col].pct_change().diff()
+   df_features['returns_acceleration'] = df[price_col].pct_change().diff()

-   # Advanced volatility
+   # Advanced volatility estimators
    df_features['parkinson_vol'] = np.sqrt(
        1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))

-   # Efficiency ratio
-   df_features['price_efficiency'] = (
-       abs(df[price_col] - df[price_col].shift(20)) /
-       (df[high_col].rolling(20).max() - df[low_col].rolling(20).min())
+   df_features['garman_klass_vol'] = np.sqrt(
+       0.5 * np.log(df[high_col]/df[low_col])**2 -
+       (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2
    )

-   # Deviation metrics
-   df_features['deviation_from_vwap'] = (
-       (df[price_col] - df[price_col].rolling(window=20).mean()) /
-       df[price_col].rolling(window=20).mean()
-   )
+   # Dispersion measures
+   df_features['price_range'] = df[high_col] - df[low_col]
+   df_features['price_range_pct'] = df_features['price_range'] / df[price_col]
+
+   df_features['stock_return'] = df['close'].pct_change()
+
+   # Clean up any NaN values
    df_features = df_features.dropna()
    return df_features
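A minimal sketch of a few of the rolling statistics and volatility estimators added above (price z-score, realized volatility, volume z-score, Parkinson and Garman-Klass), assuming 'open', 'high', 'low', 'close', and 'volume' columns; the helper name is illustrative, not from the repository.

import numpy as np
import pandas as pd

def rolling_stat_features(df, window=50):
    # Compute a handful of rolling statistical features on OHLCV data.
    out = pd.DataFrame(index=df.index)
    close = df['close']
    out[f'zscore_{window}'] = (close - close.rolling(window).mean()) / close.rolling(window).std()
    returns = close.pct_change()
    # Annualized realized volatility from daily returns.
    out[f'realized_vol_{window}'] = returns.rolling(window).std() * np.sqrt(252)
    vol = df['volume']
    out[f'volume_zscore_{window}'] = (vol - vol.rolling(window).mean()) / vol.rolling(window).std()
    # Range-based volatility estimators.
    out['parkinson_vol'] = np.sqrt(np.log(df['high'] / df['low']) ** 2 / (4 * np.log(2)))
    out['garman_klass_vol'] = np.sqrt(
        0.5 * np.log(df['high'] / df['low']) ** 2
        - (2 * np.log(2) - 1) * np.log(df['close'] / df['open']) ** 2
    )
    return out.dropna()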