bugfixing

MuslemRahimi 2024-10-08 10:59:25 +02:00
parent 80d95da43b
commit 57a18fbf9e
6 changed files with 24 additions and 86 deletions

View File

@@ -80,7 +80,7 @@ def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75)
             selected_features.append(feature)
     return selected_features

-async def download_data(ticker, con, start_date, end_date, skip_downloading):
+async def download_data(ticker, con, start_date, end_date, skip_downloading, save_data):
     file_path = f"ml_models/training_data/ai-score/{ticker}.json"
@@ -200,6 +200,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
             'operatingCashFlow','cashAndCashEquivalents', 'totalEquity','otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
             'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets',
         ]

         # Function to compute combinations within a group
         def compute_column_ratios(columns, df, new_columns):
@@ -240,7 +241,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         df_copy = df_combined.copy().map(lambda x: round(x, 2) if isinstance(x, float) else x)

         # Save to a file if there are rows in the DataFrame
-        if not df_copy.empty:
+        if not df_copy.empty and save_data == True:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
@@ -251,7 +252,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         pass

-async def chunked_gather(tickers, con, skip_downloading, chunk_size):
+async def chunked_gather(tickers, con, skip_downloading, save_data, chunk_size):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
@@ -267,7 +268,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
         print(f"chunk size: {len(chunk)}")
-        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
+        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading, save_data) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
@@ -309,18 +310,18 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     print(f'Overall Evaluation Metrics: {data}')

-async def warm_start_training(tickers, con, skip_downloading):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
+async def warm_start_training(tickers, con, skip_downloading, save_data):
+    dfs = await chunked_gather(tickers, con, skip_downloading, save_data, chunk_size=100)

-async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data):
     try:
         df_train = pd.DataFrame()
         df_test_dict = {}  # Store test data for each ticker
         all_test_data = []  # Store all test data for overall evaluation

-        df = await download_data(ticker, con, start_date, end_date, skip_downloading)
+        df = await download_data(ticker, con, start_date, end_date, skip_downloading, save_data)
         split_size = int(len(df) * (1 - test_size))
         df_train = df.iloc[:split_size]
         df_test = df.iloc[split_size:]
@@ -345,22 +346,24 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
         # Save the evaluation data to a JSON file
         await save_json(ticker, data)
         print(f"Saved results for {ticker}")
-    except:
+    except Exception as e:
+        print(e)
         pass

 async def run():
     train_mode = False  # Set this to False for fine-tuning and evaluation
     skip_downloading = False
+    save_data = train_mode
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")

     if train_mode:
         # Warm start training
-        warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
+        warm_start_symbols = list(set(['CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
         print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
-        await warm_start_training(warm_start_symbols, con, skip_downloading)
+        await warm_start_training(warm_start_symbols, con, skip_downloading, save_data)
     else:
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
@@ -374,7 +377,7 @@ async def run():
         """)
         stock_symbols = [row[0] for row in cursor.fetchall()]
         for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading)
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data)

     con.close()
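Reading the new save_data flag together with train_mode: raw per-ticker JSON is only written to ml_models/training_data/ai-score/ during warm-start training runs and skipped during per-ticker fine-tuning. A minimal usage sketch of the updated signature (the 'AAPL' ticker and keyword-style call are illustrative placeholders, not part of this commit):

    # Hypothetical call showing how save_data now flows into download_data
    save_data = train_mode  # cache the training JSON only when training from scratch
    df = await download_data('AAPL', con, start_date, end_date,
                             skip_downloading=False, save_data=save_data)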

View File

@@ -367,7 +367,7 @@ etf_cursor.execute("PRAGMA journal_mode = wal")
 etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
 etf_symbols = [row[0] for row in etf_cursor.fetchall()]

-total_symbols = ['SPY'] #stock_symbols + etf_symbols
+total_symbols = stock_symbols + etf_symbols

 query_template = """
     SELECT date, close,change_percent

View File

@@ -1,14 +1,8 @@
 import pandas as pd
 from datetime import datetime, timedelta
 import numpy as np
-from xgboost import XGBClassifier
-from sklearn.ensemble import StackingClassifier, RandomForestClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.svm import SVC
-from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.preprocessing import MinMaxScaler
 from sklearn.decomposition import PCA
 import lightgbm as lgb
@@ -24,52 +18,19 @@ import os

 class ScorePredictor:
     def __init__(self):
-        self.scaler = StandardScaler()
+        self.scaler = MinMaxScaler()
         self.pca = PCA(n_components=0.95)

         # Define base models
-        self.xgb_model = XGBClassifier(
-            n_estimators=100,
-            max_depth=10,
-            learning_rate=0.001,
-            random_state=42,
-            n_jobs=10,
-            tree_method='gpu_hist',
-        )
-        '''
         self.lgb_model = lgb.LGBMClassifier(
-            n_estimators=100,
+            n_estimators=2000,
             learning_rate=0.001,
-            max_depth=10,
+            max_depth=5,
+            num_leaves=2**5-1,
             n_jobs=10
         )
-        '''
-        self.rf_model = RandomForestClassifier(
-            n_estimators=100,
-            max_depth=10,
-            random_state=42,
-            n_jobs=10
-        )
-        self.svc_model = SVC(probability=True, kernel='rbf')
-        self.knn_model = KNeighborsClassifier(n_neighbors=5)
-        self.nb_model = GaussianNB()
-
-        # Stacking ensemble (XGBoost + LightGBM) with Logistic Regression as meta-learner
-        self.model = StackingClassifier(
-            estimators=[
-                ('xgb', self.xgb_model),
-                #('lgb', self.lgb_model),
-                ('rf', self.rf_model),
-                ('svc', self.svc_model),
-                ('knn', self.knn_model),
-                ('nb', self.nb_model)
-            ],
-            final_estimator=LogisticRegression(),
-            n_jobs=10
-        )

         self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'

     def preprocess_train_data(self, X):
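With the stacking ensemble removed, this hunk no longer shows what self.model points to; presumably the LightGBM classifier is now used directly. A minimal sketch of the slimmed-down constructor under that assumption (the self.model assignment is inferred, not visible in this commit):

    class ScorePredictor:
        def __init__(self):
            self.scaler = MinMaxScaler()
            self.pca = PCA(n_components=0.95)
            # Single LightGBM model replaces the former XGB/RF/SVC/KNN/NB stack
            self.lgb_model = lgb.LGBMClassifier(
                n_estimators=2000,
                learning_rate=0.001,
                max_depth=5,
                num_leaves=2**5-1,
                n_jobs=10
            )
            self.model = self.lgb_model  # assumption: assignment not shown in the diff
            self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'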
@@ -87,8 +48,8 @@ class ScorePredictor:
     def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_train_data(X_train)

         if os.path.exists(self.warm_start_model_path):
-            with open(self.warm_start_model_path, 'rb') as f:
-                self.model = pickle.load(f)
+            os.remove(self.warm_start_model_path)

         self.model.fit(X_train, y_train)
         pickle.dump(self.model, open(self.warm_start_model_path, 'wb'))
         print("Warm start model saved.")
@@ -121,6 +82,7 @@ class ScorePredictor:
         print(f"ROC AUC: {round(test_roc_auc_score * 100)}%")

         last_prediction_prob = class_1_probabilities[-1]
+        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
         print(f"Last prediction probability: {last_prediction_prob}")

         thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]

View File

@@ -177,33 +177,6 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close',
         df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew()
         df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt()

-        # Price-volume correlations
-        df_features[f'price_volume_corr_{window}'] = (
-            df[price_col].rolling(window=window)
-            .corr(df[volume_col]))
-
-        # Higher-order moments of returns
-        returns = df[price_col].pct_change()
-        df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew()
-        df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt()
-
-    # Cross-sectional statistics
-    df_features['price_acceleration'] = df[price_col].diff().diff()
-    df_features['returns_acceleration'] = df[price_col].pct_change().diff()
-
-    # Advanced volatility estimators
-    df_features['parkinson_vol'] = np.sqrt(
-        1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))
-    df_features['garman_klass_vol'] = np.sqrt(
-        0.5 * np.log(df[high_col]/df[low_col])**2 -
-        (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2
-    )
-
-    # Dispersion measures
-    df_features['price_range'] = df[high_col] - df[low_col]
-    df_features['price_range_pct'] = df_features['price_range'] / df[price_col]
-
     # Clean up any NaN values
     df_features = df_features.dropna()