bugfixing options cron job
parent f7f26b74c2
commit 99e00425ac
@@ -77,14 +77,14 @@ def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75)
             selected_features.append(feature)
     return selected_features

-async def download_data(ticker, con, start_date, end_date):
+async def download_data(ticker, con, start_date, end_date, skip_downloading):

     file_path = f"ml_models/training_data/ai-score/{ticker}.json"

     if os.path.exists(file_path):
         with open(file_path, 'rb') as file:
             return pd.DataFrame(orjson.loads(file.read()))
-    else:
+    elif skip_downloading == False:

         try:
             # Define paths to the statement files
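Note (outside the diff): with this change, a ticker's cached JSON under ml_models/training_data/ai-score/ is always preferred, and a fresh download only happens when skip_downloading is False; when the flag is set and no cache exists the function simply returns nothing, which is why callers check for an empty DataFrame. A rough sketch of that control flow, with the download body elided (writing "not skip_downloading" is the more idiomatic form of the same check):

    async def download_data(ticker, con, start_date, end_date, skip_downloading):
        file_path = f"ml_models/training_data/ai-score/{ticker}.json"
        if os.path.exists(file_path):
            with open(file_path, 'rb') as file:
                return pd.DataFrame(orjson.loads(file.read()))
        elif not skip_downloading:
            ...  # fetch statements, assemble the DataFrame, cache it to file_path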
@@ -213,7 +213,7 @@ async def download_data(ticker, con, start_date, end_date):
         combined_data = sorted(combined_data, key=lambda x: x['date'])
         # Convert combined data into a DataFrame
         df_combined = pd.DataFrame(combined_data).dropna()
-        '''
+
         fundamental_columns = [
             'revenue',
             'costOfRevenue',
@@ -262,7 +262,7 @@ async def download_data(ticker, con, start_date, end_date):
         # Compute ratios for all combinations of key elements
         new_columns = {}
         # Loop over combinations of column pairs
-        for columns in [fundamental_columns]:
+        for columns in [fundamental_columns, stats_columns, ta_columns]:
             for num, denom in combinations(columns, 2):
                 # Compute ratio and reverse ratio
                 ratio = df_combined[num] / df_combined[denom]
@@ -278,7 +278,7 @@ async def download_data(ticker, con, start_date, end_date):

         # Add all new columns to the original DataFrame at once
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
-        '''
+
         # To defragment the DataFrame, make a copy
         df_combined = df_combined.copy()
         df_combined = df_combined.dropna()
@@ -301,7 +301,7 @@ async def download_data(ticker, con, start_date, end_date):
         pass


-async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
+async def chunked_gather(tickers, con, start_date, end_date, skip_downloading, chunk_size=10):
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
         for i in range(0, len(lst), size):
@@ -309,9 +309,9 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):

     results = []

-    for chunk in chunks(tickers, chunk_size):
+    for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
-        tasks = [download_data(ticker, con, start_date, end_date) for ticker in chunk]
+        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
         # Accumulate the results
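Note (outside the diff): chunked_gather awaits one chunk of download_data coroutines at a time rather than launching every ticker at once, and tqdm now wraps the chunk iterator for progress output. A minimal, generic sketch of the same chunked-gather idea, with made-up names:

    import asyncio

    async def gather_in_chunks(items, worker, chunk_size=10):
        # Run worker(item) for every item, but only chunk_size coroutines at a time.
        results = []
        for i in range(0, len(items), chunk_size):
            chunk = items[i:i + chunk_size]
            results.extend(await asyncio.gather(*(worker(item) for item in chunk)))
        return results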
@@ -319,14 +319,14 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):

     return results

-async def warm_start_training(tickers, con):
+async def warm_start_training(tickers, con, skip_downloading):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
     df_train = pd.DataFrame()
     df_test = pd.DataFrame()
     test_size = 0.2

-    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, skip_downloading, chunk_size=10)

     train_list = []
     test_list = []
@@ -359,9 +359,9 @@ async def warm_start_training(tickers, con):

     return predictor

-async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading):
     try:
-        df = await download_data(ticker,con, start_date, end_date)
+        df = await download_data(ticker,con, start_date, end_date, skip_downloading)
         if df is None or len(df) == 0:
             print(f"No data available for {ticker}")
             return
@@ -371,7 +371,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
         train_data = df.iloc[:split_size]
         test_data = df.iloc[split_size:]

-        selected_features = top_uncorrelated_features(train_data,top_n=50) #[col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
+        selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
         # Fine-tune the model
         predictor = ScorePredictor()
         predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
@@ -380,9 +380,8 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
         data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])

         if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100:
-                res = {'score': data['score']}
-                await save_json(ticker, res)
+            if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100 and data['f1_score'] > 50 and data['recall_score'] > 50 and data['roc_auc_score'] > 50:
+                await save_json(ticker, data)
                 print(f"Saved results for {ticker}")
         gc.collect()
     except Exception as e:
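Note (outside the diff): the gate now also requires f1_score, recall_score and roc_auc_score to clear 50, and the full metrics dict returned by evaluate_model is saved rather than only the score. Expressed as a standalone helper (hypothetical, not in the codebase), the same condition would read roughly:

    def passes_quality_gate(data):
        # precision and accuracy must land in [50, 100); the remaining metrics must exceed 50
        core_ok = all(50 <= data[k] < 100 for k in ('precision', 'accuracy'))
        extra_ok = all(data[k] > 50 for k in ('f1_score', 'recall_score', 'roc_auc_score'))
        return core_ok and extra_ok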
@@ -394,16 +393,17 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):

 async def run():
     train_mode = True # Set this to False for fine-tuning and evaluation
+    skip_downloading = False
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")

     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
         warm_start_symbols = [row[0] for row in cursor.fetchall()]
-        print('Warm Start Training for:', warm_start_symbols)
-        predictor = await warm_start_training(warm_start_symbols, con)
+        print('Warm Start Training')
+        predictor = await warm_start_training(warm_start_symbols, con, skip_downloading)
     else:
         # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
@@ -416,7 +416,7 @@ async def run():

         tasks = []
         for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date)
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading)

     con.close()

@@ -84,6 +84,7 @@ async def get_historical_data(ticker, query_con, session):

 async def run():
     total_symbols = []
+    chunk_size = 400
     try:
         cursor = con.cursor()
         cursor.execute("PRAGMA journal_mode = wal")
@@ -130,8 +131,6 @@ try:
     start_date_max = datetime(1970, 1, 1).strftime("%Y-%m-%d")
     end_date = end_date.strftime("%Y-%m-%d")

-
-    chunk_size = 400
     asyncio.run(run())
     con.close()
     etf_con.close()
@@ -42,7 +42,7 @@ def options_bubble_data(chunk):
     start_date_str = start_date.strftime('%Y-%m-%d')

     res_list = []
-    for page in range(0, 500):
+    for page in range(0, 5000):
         try:
             data = fin.options_activity(company_tickers=company_tickers, page=page, pagesize=1000, date_from=start_date_str, date_to=end_date_str)
             data = ujson.loads(fin.output(data))['option_activity']
@@ -129,11 +129,11 @@ async def main():

    print(len(total_symbols))

-    chunk_size = len(total_symbols) // 1000 # Divide the list into N chunks
+    chunk_size = len(total_symbols) // 2000 # Divide the list into N chunks
    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]

    print(chunks)
    loop = asyncio.get_running_loop()
-    with ThreadPoolExecutor(max_workers=2) as executor:
+    with ThreadPoolExecutor(max_workers=4) as executor:
        tasks = [loop.run_in_executor(executor, options_bubble_data, chunk) for chunk in chunks]
        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            await f
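Note (outside the diff): options_bubble_data is a blocking function, so main() hands each chunk to a thread pool and awaits the wrapped futures; this commit shrinks the chunks (// 2000) and doubles the pool to 4 workers. One thing to watch with the integer division: if total_symbols holds fewer than 2000 entries, chunk_size becomes 0 and range(0, len(total_symbols), 0) raises ValueError. A generic sketch of the pattern with a guard against that (the max(1, ...) guard is an assumption, not something the diff adds):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    async def fan_out(items, blocking_fn, n_chunks=2000, workers=4):
        chunk_size = max(1, len(items) // n_chunks)  # avoid a zero step for short lists
        chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=workers) as executor:
            tasks = [loop.run_in_executor(executor, blocking_fn, c) for c in chunks]
            for f in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
                await f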
@@ -335,16 +335,14 @@ def get_options_chain(option_data_list):

 def get_data(ticker):
     res_list = []
-    page = 0
-    while True:
+
+    for page in range(0,5000):
         try:
             data = fin.options_activity(date_from=start_date_str, date_to=end_date_str, company_tickers=ticker, page=page, pagesize=1000)
             data = ujson.loads(fin.output(data))['option_activity']
             filtered_data = [{key: value for key, value in item.items() if key not in ['description_extended', 'updated']} for item in data]
             res_list += filtered_data
-            page += 1
-        except Exception as e:
-            print(f"Error retrieving data for {ticker}: {e}")
+        except:
             break
     return res_list

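Note (outside the diff): the bounded for-loop caps pagination at 5000 pages and the bare except doubles as the "no more pages" exit. A variant that keeps the old error message while still terminating (fetch_page is a hypothetical wrapper, not in the codebase):

    res_list = []
    for page in range(0, 5000):
        try:
            data = fetch_page(page)  # hypothetical wrapper around fin.options_activity
            if not data:
                break                # stop cleanly once a page comes back empty
            res_list += data
        except Exception as e:
            print(f"Error retrieving page {page}: {e}")
            break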
@@ -369,7 +367,7 @@ etf_cursor.execute("PRAGMA journal_mode = wal")
 etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
 etf_symbols = [row[0] for row in etf_cursor.fetchall()]

-total_symbols = stock_symbols + etf_symbols
+total_symbols = ['SPY'] #stock_symbols + etf_symbols

 query_template = """
     SELECT date, close,change_percent
@@ -385,7 +383,8 @@ for ticker in total_symbols:
     df_price = df_price.rename(columns={"change_percent": "changesPercentage"})

     volatility = calculate_volatility(df_price)
-
+    print(df_price)
+    print(volatility)
     ticker_data = get_data(ticker)
     # Group ticker_data by 'date' and collect all items for each date
     grouped_history = defaultdict(list)
Binary file not shown.
@@ -5,6 +5,8 @@ import numpy as np
 from xgboost import XGBClassifier
+from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.decomposition import PCA # Import PCA
+import lightgbm as lgb

 from tqdm import tqdm
 from collections import defaultdict
@@ -17,30 +19,47 @@ import time
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
         self.pca = PCA(n_components=5)
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
-        self.model = XGBClassifier(
+        self.model = lgb.LGBMClassifier(
+            n_estimators=20_000, # If you want to use a larger model we've found 20_000 trees to be better
+            learning_rate=0.01, # and a learning rate of 0.001
+            max_depth=20, # and max_depth=6
+            num_leaves=2**6-1, # and num_leaves of 2**6-1
+            colsample_bytree=0.1
+        )
+        '''
+        XGBClassifier(
             n_estimators=200,
-            max_depth=10,
+            max_depth=5,
             learning_rate=0.1,
             random_state=42,
             n_jobs=10
         )
+        '''

-    def preprocess_data(self, X):
+    def preprocess_train_data(self, X):
+        """Preprocess training data by scaling and applying PCA."""
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
-        X = self.scaler.fit_transform(X)
-        return X
+        X = self.scaler.fit_transform(X) # Transform using the fitted scaler
+        return X#self.pca.fit_transform(X) # Fit PCA and transform
+
+    def preprocess_test_data(self, X):
+        """Preprocess test data by scaling and applying PCA."""
+        X = np.where(np.isinf(X), np.nan, X)
+        X = np.nan_to_num(X)
+        X = self.scaler.transform(X) # Transform using the fitted scaler
+        return X#self.pca.transform(X) # Transform using the fitted PCA

     def warm_start_training(self, X_train, y_train):
-        X_train = self.preprocess_data(X_train)
+        X_train = self.preprocess_train_data(X_train)
         self.model.fit(X_train, y_train)
         pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
         print("Warm start model saved.")

     def fine_tune_model(self, X_train, y_train):
-        X_train = self.preprocess_data(X_train)
+        X_train = self.preprocess_train_data(X_train)
         with open(f'{self.warm_start_model_path}', 'rb') as f:
             self.model = pickle.load(f)
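Note (outside the diff): splitting preprocess_data into preprocess_train_data and preprocess_test_data is what lets the MinMaxScaler be fitted on training data only and merely applied to test data, avoiding leakage of test-set statistics. A tiny self-contained illustration of the distinction (made-up numbers):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()
    X_train = np.array([[1.0], [2.0], [3.0]])
    X_test = np.array([[2.5]])

    X_train_scaled = scaler.fit_transform(X_train)  # learn min/max from the training data
    X_test_scaled = scaler.transform(X_test)        # reuse those statistics, no refitting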
@@ -49,7 +68,7 @@ class ScorePredictor:


     def evaluate_model(self, X_test, y_test):
-        X_test = self.preprocess_data(X_test)
+        X_test = self.preprocess_test_data(X_test)

         test_predictions = self.model.predict_proba(X_test)
         class_1_probabilities = test_predictions[:, 1]
@@ -57,10 +76,17 @@ class ScorePredictor:
         #print(test_predictions)
         test_precision = precision_score(y_test, binary_predictions)
         test_accuracy = accuracy_score(y_test, binary_predictions)
+        test_f1_score = f1_score(y_test, binary_predictions)
+        test_recall_score = recall_score(y_test, binary_predictions)
+        test_roc_auc_score = roc_auc_score(y_test, binary_predictions)

         print("Test Set Metrics:")
         print(f"Precision: {round(test_precision * 100)}%")
         print(f"Accuracy: {round(test_accuracy * 100)}%")
+        print(f"F1 Score: {round(test_f1_score * 100)}%")
+        print(f"Recall Score: {round(test_recall_score * 100)}%")
+        print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%")

+        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
         thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
         scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
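Note (outside the diff): roc_auc_score is computed here from the thresholded labels (binary_predictions); the more common usage feeds it the class-1 probabilities that evaluate_model already has on hand, e.g. (a variant, not what the commit does):

    test_roc_auc_score = roc_auc_score(y_test, class_1_probabilities)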
@@ -75,5 +101,8 @@ class ScorePredictor:
                 break

         return {'accuracy': round(test_accuracy * 100),
-                'precision': round(test_precision * 100),
+                'precision': round(test_precision * 100),
+                'f1_score': round(test_f1_score * 100),
+                'recall_score': round(test_recall_score * 100),
+                'roc_auc_score': round(test_roc_auc_score * 100),
                 'score': score}