bugfixing options bubble data

parent 1ee1e10e72
commit 0513fced3d
@@ -119,7 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
     #Threshold of enough datapoints needed!
     if len(ratios) < 50:
-        print('Not enough data points')
+        print(f'Not enough data points for {ticker}')
         return
 
 
@@ -225,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     # Compute combinations for each group of columns
     compute_column_ratios(fundamental_columns, df_combined, new_columns)
     compute_column_ratios(stats_columns, df_combined, new_columns)
-    #compute_column_ratios(ta_columns, df_combined, new_columns)
+    compute_column_ratios(ta_columns, df_combined, new_columns)
 
     # Concatenate the new ratio columns with the original DataFrame
     df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
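
Note: compute_column_ratios itself is not part of this diff, so its body is unknown; the following is only a hypothetical sketch of a pairwise-ratio helper of that shape, to make the calling convention above concrete (the real implementation may differ):

    from itertools import combinations
    import numpy as np

    def compute_column_ratios(columns, df, new_columns):
        # For every pair of columns, store their ratio as a candidate feature.
        # new_columns is a plain dict that the caller later concatenates onto df.
        for col_a, col_b in combinations(columns, 2):
            denominator = df[col_b].replace(0, np.nan)   # avoid divide-by-zero blowups
            new_columns[f'{col_a}_to_{col_b}'] = df[col_a] / denominator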
@@ -272,6 +272,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     chunk_results = await asyncio.gather(*tasks)
 
     train_list = []
+    test_list = []
 
     for ticker, df in zip(chunk, chunk_results):
         try:
@@ -280,24 +281,19 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
             train_data = df.iloc[:split_size]
             test_data = df.iloc[split_size:]
 
-            # Store test data for this ticker in a dictionary
-            df_test_dict[ticker] = test_data
-
             # Append train data for combined training
             train_list.append(train_data)
-            # Collect all test data for overall evaluation
-            all_test_data.append(test_data)
+            test_list.append(test_data)
 
         except:
             pass
 
     # Concatenate all train data together
-    if train_list:
-        df_train = pd.concat(train_list, ignore_index=True)
+    df_train = pd.concat(train_list, ignore_index=True)
+    df_test = pd.concat(test_list, ignore_index=True)
 
     # Shuffle the combined training data
     df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+    df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
 
     print('====== Start Training Model on Combined Data ======')
     predictor = ScorePredictor()
@@ -308,40 +304,50 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     print(f'Training complete on {len(df_train)} samples.')
 
     # Evaluate the model on the overall test dataset
-    if all_test_data:
-        overall_test_data = pd.concat(all_test_data, ignore_index=True)
-        print('====== Evaluating on Overall Test Dataset ======')
-        overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target'])
-        print(f'Overall Evaluation Metrics: {overall_evaluation_data}')
-
-    # Evaluate the model for each ticker separately
-    for ticker, test_data in df_test_dict.items():
-        try:
-            print(f"Fine-tuning the model for {ticker}")
-            predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
-
-            print(f"Evaluating model for {ticker}")
-            data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-
-            # Check if the evaluation data meets the criteria
-            if (data['precision'] >= 50 and data['accuracy'] >= 50 and
-                data['accuracy'] < 100 and data['precision'] < 100 and
-                data['f1_score'] >= 50 and data['recall_score'] >= 50 and
-                data['roc_auc_score'] >= 50):
-                # Save the evaluation data to a JSON file
-                await save_json(ticker, data)
-                print(f"Saved results for {ticker}")
-        except Exception as e:
-            print(e)
-            pass
+    print('====== Evaluating on Overall Test Dataset ======')
+    data = predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+    print(f'Overall Evaluation Metrics: {data}')
 
 
 async def warm_start_training(tickers, con, skip_downloading):
 
     dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
+    try:
+        df_train = pd.DataFrame()
+        df_test_dict = {} # Store test data for each ticker
+        all_test_data = [] # Store all test data for overall evaluation
+
+        df = await download_data(ticker, con, start_date, end_date, skip_downloading)
+        split_size = int(len(df) * (1 - test_size))
+        df_train = df.iloc[:split_size]
+        df_test = df.iloc[split_size:]
+
+        # Shuffle the combined training data
+        df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+
+        print('====== Start Fine-tuning Model ======')
+        predictor = ScorePredictor()
+        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+
+        # Train the model on the combined training data
+        predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
+        print(f'Training complete on {len(df_train)} samples.')
+        print(f"Evaluating model for {ticker}")
+        data = predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+        print(f'Overall Evaluation Metrics: {data}')
+        if (data['precision'] >= 50 and data['accuracy'] >= 50 and
+            data['accuracy'] < 100 and data['precision'] < 100 and
+            data['f1_score'] >= 50 and data['recall_score'] >= 50 and
+            data['roc_auc_score'] >= 50):
+            # Save the evaluation data to a JSON file
+            await save_json(ticker, data)
+            print(f"Saved results for {ticker}")
+    except:
+        pass
+
 
 async def run():
     train_mode = True # Set this to False for fine-tuning and evaluation
     skip_downloading = False
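
Note: the same precision/accuracy/F1/recall/ROC-AUC gate now appears in both chunked_gather and the new fine_tune_and_evaluate. A hypothetical helper like the one below (not part of this commit, names are assumptions) would keep the two checks in sync:

    def meets_criteria(data, lower=50, upper=100):
        # Keep a result only if every metric clears the floor and the run is not
        # suspiciously perfect (100% usually indicates a degenerate test set).
        keys = ['precision', 'accuracy', 'f1_score', 'recall_score', 'roc_auc_score']
        return (all(data[k] >= lower for k in keys)
                and data['precision'] < upper and data['accuracy'] < upper)

    # usage sketch:
    # if meets_criteria(data):
    #     await save_json(ticker, data)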
@@ -351,6 +357,14 @@ async def run():
 
     if train_mode:
         # Warm start training
+        warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
+
+        print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
+        await warm_start_training(warm_start_symbols, con, skip_downloading)
+    else:
+        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
+        end_date = datetime.today().strftime("%Y-%m-%d")
+        test_size = 0.2
         cursor.execute("""
             SELECT DISTINCT symbol
             FROM stocks
@@ -358,11 +372,11 @@ async def run():
             AND symbol NOT LIKE '%.%'
             AND symbol NOT LIKE '%-%'
         """)
-        warm_start_symbols = ['PEP'] #[row[0] for row in cursor.fetchall()]
+        stock_symbols = [row[0] for row in cursor.fetchall()]
+        for ticker in tqdm(stock_symbols):
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading)
 
-        print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
-        await warm_start_training(warm_start_symbols, con, skip_downloading)
 
     con.close()
 
 if __name__ == "__main__":
@@ -42,11 +42,13 @@ def options_bubble_data(chunk):
     start_date_str = start_date.strftime('%Y-%m-%d')
 
     res_list = []
-    for page in range(0, 5000):
+    page = 0
+    while True:
         try:
             data = fin.options_activity(company_tickers=company_tickers, page=page, pagesize=1000, date_from=start_date_str, date_to=end_date_str)
             data = ujson.loads(fin.output(data))['option_activity']
             res_list += data
+            page +=1
         except:
             break
 
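
Note: the new loop only advances page after a successful request and relies on an exception to end the pagination. A generic sketch of the same pattern with an explicit empty-page stop is shown below; fetch_page is a hypothetical wrapper standing in for the fin.options_activity call above, and treating an empty page as the end is an assumption:

    def fetch_all_pages(fetch_page):
        # fetch_page(page) returns a list of rows for that page and may raise
        # once the page is out of range; an empty list is also treated as the end.
        results = []
        page = 0
        while True:
            try:
                rows = fetch_page(page)
            except Exception:
                break
            if not rows:
                break
            results += rows
            page += 1
        return results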
@@ -54,33 +56,39 @@ def options_bubble_data(chunk):
 
     for option_type in ['CALL', 'PUT']:
         for item in res_filtered:
-            if item['put_call'].upper() == option_type:
-                item['dte'] = calculate_dte(item['date_expiration'])
-                if item['ticker'] in ['BRK.A', 'BRK.B']:
-                    item['ticker'] = f"BRK-{item['ticker'][-1]}"
+            try:
+                if item['put_call'].upper() == option_type:
+                    item['dte'] = calculate_dte(item['date_expiration'])
+                    if item['ticker'] in ['BRK.A', 'BRK.B']:
+                        item['ticker'] = f"BRK-{item['ticker'][-1]}"
+            except:
+                pass
 
     #Save raw data for each ticker for options page stack bar chart
     for ticker in chunk:
-        ticker_filtered_data = [entry for entry in res_filtered if entry['ticker'] == ticker]
-        if len(ticker_filtered_data) != 0:
-            #sum up calls and puts for each day for the plot
-            summed_data = {}
-            for entry in ticker_filtered_data:
-                volume = int(entry['volume'])
-                open_interest = int(entry['open_interest'])
-                put_call = entry['put_call']
-
-                if entry['date'] not in summed_data:
-                    summed_data[entry['date']] = {'CALL': {'volume': 0, 'open_interest': 0}, 'PUT': {'volume': 0, 'open_interest': 0}}
-
-                summed_data[entry['date']][put_call]['volume'] += volume
-                summed_data[entry['date']][put_call]['open_interest'] += open_interest
-
-            result_list = [{'date': date, 'CALL': summed_data[date]['CALL'], 'PUT': summed_data[date]['PUT']} for date in summed_data]
-            #reverse the list
-            result_list = result_list[::-1]
-            with open(f"json/options-flow/company/{ticker}.json", 'w') as file:
-                ujson.dump(result_list, file)
+        try:
+            ticker_filtered_data = [entry for entry in res_filtered if entry['ticker'] == ticker]
+            if len(ticker_filtered_data) != 0:
+                #sum up calls and puts for each day for the plot
+                summed_data = {}
+                for entry in ticker_filtered_data:
+                    volume = int(entry['volume'])
+                    open_interest = int(entry['open_interest'])
+                    put_call = entry['put_call']
+
+                    if entry['date'] not in summed_data:
+                        summed_data[entry['date']] = {'CALL': {'volume': 0, 'open_interest': 0}, 'PUT': {'volume': 0, 'open_interest': 0}}
+
+                    summed_data[entry['date']][put_call]['volume'] += volume
+                    summed_data[entry['date']][put_call]['open_interest'] += open_interest
+
+                result_list = [{'date': date, 'CALL': summed_data[date]['CALL'], 'PUT': summed_data[date]['PUT']} for date in summed_data]
+                #reverse the list
+                result_list = result_list[::-1]
+                with open(f"json/options-flow/company/{ticker}.json", 'w') as file:
+                    ujson.dump(result_list, file)
+        except:
+            pass
 
     #Save bubble data for each ticker for overview page
     for ticker in chunk:
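
Note: an equivalent sketch of the per-day CALL/PUT aggregation using collections.defaultdict, shown only to make the data shape explicit; summarise_by_day is a hypothetical helper and the field names mirror the entries used above:

    from collections import defaultdict

    def summarise_by_day(entries):
        # entries: option-activity dicts with 'date', 'put_call', 'volume', 'open_interest'
        summed = defaultdict(lambda: {'CALL': {'volume': 0, 'open_interest': 0},
                                      'PUT': {'volume': 0, 'open_interest': 0}})
        for entry in entries:
            side = entry['put_call']
            summed[entry['date']][side]['volume'] += int(entry['volume'])
            summed[entry['date']][side]['open_interest'] += int(entry['open_interest'])
        # Same shape as result_list above: one dict per date with CALL/PUT totals.
        return [{'date': day, **sides} for day, sides in summed.items()]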
@@ -131,7 +139,7 @@ async def main():
 
     chunk_size = len(total_symbols) // 2000 # Divide the list into N chunks
     chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
-    print(chunks)
+
     loop = asyncio.get_running_loop()
     with ThreadPoolExecutor(max_workers=4) as executor:
         tasks = [loop.run_in_executor(executor, options_bubble_data, chunk) for chunk in chunks]
Binary file not shown.
@@ -23,7 +23,7 @@ class StockPredictor:
         self.ticker = ticker
         self.start_date = start_date
         self.end_date = end_date
-        self.nth_day = 60
+        self.nth_day = 10
         self.model = None #RandomForestClassifier(n_estimators=3500, min_samples_split=100, random_state=42, n_jobs=-1) #XGBClassifier(n_estimators=200, max_depth=2, learning_rate=1, objective='binary:logistic')
         self.horizons = [3,5,10, 15, 20]
         self.test_size = 0.2
@@ -134,19 +134,19 @@ class StockPredictor:
         model.add(Dropout(0.2))
         model.add(Dense(units=1, activation='sigmoid'))
 
-        # Learning rate scheduler
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
-        # Early stopping
-        early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
-
-        model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
+        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
 
-        return model, [reduce_lr, early_stop]
+        return model
 
 
     def train_model(self, X_train, y_train):
-        self.model, callbacks = self.build_lstm_model((X_train.shape[1], X_train.shape[2]))
-        history = self.model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.1, callbacks=callbacks)
+        # Learning rate scheduler
+        #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
+        # Early stopping
+        early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
+        self.model = self.build_lstm_model((X_train.shape[1], X_train.shape[2]))
+        history = self.model.fit(X_train, y_train, epochs=500, batch_size=1024, validation_split=0.1, callbacks=[early_stop])
 
     def evaluate_model(self, X_test, y_test):
         # Reshape X_test to remove the extra dimension
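
Note: with restore_best_weights=True, EarlyStopping hands back the weights from the best validation epoch, so epochs=500 is effectively an upper bound rather than a fixed training length. A minimal standalone sketch of the pattern, using toy shapes that are purely an assumption:

    import numpy as np
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense
    from tensorflow.keras.callbacks import EarlyStopping

    # Toy data: (samples, timesteps, features) with binary targets.
    X = np.random.rand(256, 30, 8).astype('float32')
    y = np.random.randint(0, 2, size=(256,))

    model = Sequential([LSTM(32, input_shape=(30, 8)), Dense(1, activation='sigmoid')])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    # Training stops once val_loss has not improved for 10 epochs; best weights are restored.
    model.fit(X, y, epochs=500, batch_size=64, validation_split=0.1, callbacks=[early_stop])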
@@ -202,7 +202,7 @@ if __name__ == "__main__":
 
     X = df[predictors].values
     y = df['Target'].values
-
+    print(df)
     # Normalize features
     scaler = MinMaxScaler(feature_range=(0, 1))
     X = scaler.fit_transform(X)
@@ -2,8 +2,13 @@ import pandas as pd
 from datetime import datetime, timedelta
 import numpy as np
 from xgboost import XGBClassifier
+from sklearn.ensemble import StackingClassifier, RandomForestClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 from sklearn.decomposition import PCA
 import lightgbm as lgb
 
@@ -19,97 +24,119 @@ import os
 
 class ScorePredictor:
     def __init__(self):
-        self.scaler = MinMaxScaler()
-        self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
-        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
-        self.model = lgb.LGBMClassifier(
-            n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
-            learning_rate=0.001, # Smaller learning rate for better generalization
-            max_depth=6, # Controlled depth to prevent overfitting
-            num_leaves=2**6-1, # 2^max_depth, prevents overfitting while maintaining model complexity
-            colsample_bytree=0.1,
-            n_jobs=10, # Use N CPU cores
-            verbose=0, # Reduce output noise
+        self.scaler = StandardScaler()
+        self.pca = PCA(n_components=0.95)
+        # Define base models
+        self.xgb_model = XGBClassifier(
+            n_estimators=100,
+            max_depth=10,
+            learning_rate=0.001,
+            random_state=42,
+            n_jobs=10,
+            tree_method='gpu_hist',
         )
         '''
-        XGBClassifier(
-            n_estimators=200,
-            max_depth=5,
-            learning_rate=0.1,
-            random_state=42,
+        self.lgb_model = lgb.LGBMClassifier(
+            n_estimators=100,
+            learning_rate=0.001,
+            max_depth=10,
             n_jobs=10
         )
         '''
+        self.rf_model = RandomForestClassifier(
+            n_estimators=100,
+            max_depth=10,
+            random_state=42,
+            n_jobs=10
+        )
+
+        self.svc_model = SVC(probability=True, kernel='rbf')
+        self.knn_model = KNeighborsClassifier(n_neighbors=5)
+        self.nb_model = GaussianNB()
+
+        # Stacking ensemble (XGBoost + LightGBM) with Logistic Regression as meta-learner
+        self.model = StackingClassifier(
+            estimators=[
+                ('xgb', self.xgb_model),
+                #('lgb', self.lgb_model),
+                ('rf', self.rf_model),
+                ('svc', self.svc_model),
+                ('knn', self.knn_model),
+                ('nb', self.nb_model)
+            ],
+            final_estimator=LogisticRegression(),
+            n_jobs=10
+        )
+
+        self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'
 
     def preprocess_train_data(self, X):
-        """Preprocess training data by scaling and applying PCA."""
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
-        X = self.scaler.fit_transform(X) # Transform using the fitted scaler
-        return self.pca.fit_transform(X) # Fit PCA and transform
+        X = self.scaler.fit_transform(X)
+        return self.pca.fit_transform(X)
 
     def preprocess_test_data(self, X):
-        """Preprocess test data by scaling and applying PCA."""
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
-        X = self.scaler.transform(X) # Transform using the fitted scaler
-        return self.pca.transform(X) # Transform using the fitted PCA
+        X = self.scaler.transform(X)
+        return self.pca.transform(X)
 
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_train_data(X_train)
         if os.path.exists(self.warm_start_model_path):
-            with open(f'{self.warm_start_model_path}', 'rb') as f:
+            with open(self.warm_start_model_path, 'rb') as f:
                 self.model = pickle.load(f)
         self.model.fit(X_train, y_train)
-        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
+        pickle.dump(self.model, open(self.warm_start_model_path, 'wb'))
         print("Warm start model saved.")
 
     def fine_tune_model(self, X_train, y_train):
         X_train = self.preprocess_train_data(X_train)
-        with open(f'{self.warm_start_model_path}', 'rb') as f:
+        with open(self.warm_start_model_path, 'rb') as f:
             self.model = pickle.load(f)
 
         self.model.fit(X_train, y_train)
         print("Model fine-tuned")
 
     def evaluate_model(self, X_test, y_test):
         X_test = self.preprocess_test_data(X_test)
 
         test_predictions = self.model.predict_proba(X_test)
         class_1_probabilities = test_predictions[:, 1]
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
-        #print(test_predictions)
+        # Calculate and print metrics
         test_precision = precision_score(y_test, binary_predictions)
         test_accuracy = accuracy_score(y_test, binary_predictions)
         test_f1_score = f1_score(y_test, binary_predictions)
         test_recall_score = recall_score(y_test, binary_predictions)
         test_roc_auc_score = roc_auc_score(y_test, binary_predictions)
 
-        print("Test Set Metrics:")
-        print(f"Precision: {round(test_precision * 100)}%")
-        print(f"Accuracy: {round(test_accuracy * 100)}%")
+        print(f"Test Precision: {round(test_precision * 100)}%")
+        print(f"Test Accuracy: {round(test_accuracy * 100)}%")
         print(f"F1 Score: {round(test_f1_score * 100)}%")
-        print(f"Recall Score: {round(test_recall_score * 100)}%")
-        print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%")
+        print(f"Recall: {round(test_recall_score * 100)}%")
+        print(f"ROC AUC: {round(test_roc_auc_score * 100)}%")
 
-        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
+        last_prediction_prob = class_1_probabilities[-1]
+        print(f"Last prediction probability: {last_prediction_prob}")
 
         thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
         scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
 
-        last_prediction_prob = class_1_probabilities[-1]
         score = None
-        print(f"Last prediction probability: {last_prediction_prob}")
 
         for threshold, value in zip(thresholds, scores):
             if last_prediction_prob >= threshold:
                 score = value
                 break
 
-        return {'accuracy': round(test_accuracy * 100),
-            'precision': round(test_precision * 100),
-            'f1_score': round(test_f1_score * 100),
-            'recall_score': round(test_recall_score * 100),
-            'roc_auc_score': round(test_roc_auc_score * 100),
-            'score': score}
+        return {
+            'accuracy': round(test_accuracy * 100),
+            'precision': round(test_precision * 100),
+            'f1_score': round(test_f1_score * 100),
+            'recall_score': round(test_recall_score * 100),
+            'roc_auc_score': round(test_roc_auc_score * 100),
+            'score': score
+        }
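
Note: scikit-learn's StackingClassifier refits its base estimators on every call to fit, so loading the pickled ensemble and calling fit again retrains from the saved hyperparameters rather than updating the old fit incrementally. A minimal, self-contained sketch of the ensemble on synthetic data (dataset and hyperparameters here are assumptions, not the project's values):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import StackingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier

    X, y = make_classification(n_samples=500, n_features=20, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    stack = StackingClassifier(
        estimators=[
            ('xgb', XGBClassifier(n_estimators=50, max_depth=4)),
            ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
            ('nb', GaussianNB()),
        ],
        final_estimator=LogisticRegression(),   # meta-learner stacked on base predictions
        n_jobs=-1,
    )
    stack.fit(X_train, y_train)
    proba = stack.predict_proba(X_test)[:, 1]   # class-1 probabilities, as evaluate_model uses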
@@ -283,10 +283,7 @@ def run_executive():
 
 def run_options_bubble_ticker():
     week = datetime.today().weekday()
-    current_time = datetime.now().time()
-    start_time = datetime_time(15, 30)
-    end_time = datetime_time(22, 30)
-    if week <= 4 and start_time <= current_time < end_time:
+    if week <= 4:
         run_command(["python3", "cron_options_bubble.py"])
 
         command = ["sudo", "rsync", "-avz", "-e", "ssh", "/root/backend/app/json/options-bubble", f"root@{useast_ip_address}:/root/backend/app/json"]
Binary file not shown.
@@ -94,19 +94,20 @@ def generate_ta_features(df):
     df_features['aroon_indicator'] = aroon.aroon_indicator()
     df_features['aroon_up'] = aroon.aroon_up()
 
-    df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator()
-    df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14)
+    #df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator()
+    #df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14)
     df_features['ulcer'] = UlcerIndex(df['close'],window=60).ulcer_index()
-    df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
-    df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
+    #df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
+    #df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
 
     df_features = df_features.dropna()
     return df_features
 
-def generate_statistical_features(df, windows=[50,200], price_col='close',
+def generate_statistical_features(df, windows=[20,50,200], price_col='close',
                                   high_col='high', low_col='low', volume_col='volume'):
     """
     Generate comprehensive statistical features for financial time series data.
+    Focuses purely on statistical measures without technical indicators.
 
     Parameters:
     -----------
@@ -132,7 +133,6 @@ def generate_statistical_features(df, windows=[50,200], price_col='close',
     # Create a copy of the dataframe to avoid modifying the original
     df_features = df.copy()
 
-
     # Calculate features for each window size
     for window in windows:
         # Returns
@@ -144,11 +144,18 @@ def generate_statistical_features(df, windows=[50,200], price_col='close',
         df_features[f'log_returns_std_{window}'] = log_returns.rolling(window=window).std()
 
         # Statistical moments
+        df_features[f'mean_{window}'] = df[price_col].rolling(window=window).mean()
         df_features[f'std_{window}'] = df[price_col].rolling(window=window).std()
         df_features[f'var_{window}'] = df[price_col].rolling(window=window).var()
         df_features[f'skew_{window}'] = df[price_col].rolling(window=window).skew()
         df_features[f'kurt_{window}'] = df[price_col].rolling(window=window).kurt()
 
+        # Quantile measures
+        df_features[f'quantile_25_{window}'] = df[price_col].rolling(window=window).quantile(0.25)
+        df_features[f'quantile_75_{window}'] = df[price_col].rolling(window=window).quantile(0.75)
+        df_features[f'iqr_{window}'] = (
+            df_features[f'quantile_75_{window}'] - df_features[f'quantile_25_{window}'])
+
         # Volatility measures
         df_features[f'realized_vol_{window}'] = (
             df_features[f'returns_{window}'].rolling(window=window).std() * np.sqrt(252))
@@ -156,33 +163,48 @@ def generate_statistical_features(df, windows=[50,200], price_col='close',
             (df[high_col].rolling(window=window).max() -
              df[low_col].rolling(window=window).min()) / df[price_col])
 
-        # Z-scores and normalized prices
+        # Z-scores and normalized values
         df_features[f'zscore_{window}'] = (
             (df[price_col] - df[price_col].rolling(window=window).mean()) /
             df[price_col].rolling(window=window).std())
 
-    # Price dynamics
+        # Volume statistics
+        df_features[f'volume_mean_{window}'] = df[volume_col].rolling(window=window).mean()
+        df_features[f'volume_std_{window}'] = df[volume_col].rolling(window=window).std()
+        df_features[f'volume_zscore_{window}'] = (
+            (df[volume_col] - df[volume_col].rolling(window=window).mean()) /
+            df[volume_col].rolling(window=window).std())
+        df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew()
+        df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt()
+
+        # Price-volume correlations
+        df_features[f'price_volume_corr_{window}'] = (
+            df[price_col].rolling(window=window)
+            .corr(df[volume_col]))
+
+        # Higher-order moments of returns
+        returns = df[price_col].pct_change()
+        df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew()
+        df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt()
+
+    # Cross-sectional statistics
     df_features['price_acceleration'] = df[price_col].diff().diff()
-    df_features['momentum_change'] = df[price_col].pct_change().diff()
+    df_features['returns_acceleration'] = df[price_col].pct_change().diff()
 
-    # Advanced volatility
+    # Advanced volatility estimators
     df_features['parkinson_vol'] = np.sqrt(
         1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))
 
-    # Efficiency ratio
-    df_features['price_efficiency'] = (
-        abs(df[price_col] - df[price_col].shift(20)) /
-        (df[high_col].rolling(20).max() - df[low_col].rolling(20).min())
+    df_features['garman_klass_vol'] = np.sqrt(
+        0.5 * np.log(df[high_col]/df[low_col])**2 -
+        (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2
     )
 
-    # Deviation metrics
-    df_features['deviation_from_vwap'] = (
-        (df[price_col] - df[price_col].rolling(window=20).mean()) /
-        df[price_col].rolling(window=20).mean()
-    )
-
-    df_features['stock_return'] = df['close'].pct_change()
+    # Dispersion measures
+    df_features['price_range'] = df[high_col] - df[low_col]
+    df_features['price_range_pct'] = df_features['price_range'] / df[price_col]
 
+    # Clean up any NaN values
     df_features = df_features.dropna()
+
     return df_features
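
Note: the new garman_klass_vol column is the per-bar Garman-Klass range-based variance estimate, 0.5*ln(H/L)^2 - (2*ln 2 - 1)*ln(C/O)^2, with a square root taken for a volatility figure. A standalone sketch on a toy OHLC frame (the toy numbers are assumptions; column names follow the defaults above):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'open':  [100.0, 101.5, 102.0],
        'high':  [102.0, 103.0, 103.5],
        'low':   [ 99.5, 100.8, 101.2],
        'close': [101.5, 102.0, 103.1],
    })

    # Per-bar Garman-Klass variance; sqrt gives the volatility column used above.
    gk_var = (0.5 * np.log(df['high'] / df['low'])**2
              - (2 * np.log(2) - 1) * np.log(df['close'] / df['open'])**2)
    df['garman_klass_vol'] = np.sqrt(gk_var)
    print(df[['garman_klass_vol']])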