commit 57a18fbf9e
parent 80d95da43b

bugfixing
@@ -80,7 +80,7 @@ def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75):
             selected_features.append(feature)
     return selected_features

-async def download_data(ticker, con, start_date, end_date, skip_downloading):
+async def download_data(ticker, con, start_date, end_date, skip_downloading, save_data):

     file_path = f"ml_models/training_data/ai-score/{ticker}.json"

@@ -201,6 +201,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets',
     ]

+
     # Function to compute combinations within a group
     def compute_column_ratios(columns, df, new_columns):
         column_combinations = list(combinations(columns, 2))
@@ -240,7 +241,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     df_copy = df_combined.copy().map(lambda x: round(x, 2) if isinstance(x, float) else x)

     # Save to a file if there are rows in the DataFrame
-    if not df_copy.empty:
+    if not df_copy.empty and save_data == True:
         with open(file_path, 'wb') as file:
             file.write(orjson.dumps(df_copy.to_dict(orient='records')))

@@ -251,7 +252,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         pass


-async def chunked_gather(tickers, con, skip_downloading, chunk_size):
+async def chunked_gather(tickers, con, skip_downloading, save_data, chunk_size):
    test_size = 0.2
    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
    end_date = datetime.today().strftime("%Y-%m-%d")
@@ -267,7 +268,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
    for chunk in tqdm(chunks(tickers, chunk_size)):
        # Create tasks for each chunk
        print(f"chunk size: {len(chunk)}")
-        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
+        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading, save_data) for ticker in chunk]
        # Await the results for the current chunk
        chunk_results = await asyncio.gather(*tasks)

@@ -309,18 +310,18 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
    print(f'Overall Evaluation Metrics: {data}')


-async def warm_start_training(tickers, con, skip_downloading):
+async def warm_start_training(tickers, con, skip_downloading, save_data):

-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
+    dfs = await chunked_gather(tickers, con, skip_downloading, save_data, chunk_size=100)


-async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data):
    try:
        df_train = pd.DataFrame()
        df_test_dict = {} # Store test data for each ticker
        all_test_data = [] # Store all test data for overall evaluation

-        df = await download_data(ticker, con, start_date, end_date, skip_downloading)
+        df = await download_data(ticker, con, start_date, end_date, skip_downloading, save_data)
        split_size = int(len(df) * (1 - test_size))
        df_train = df.iloc[:split_size]
        df_test = df.iloc[split_size:]
@@ -345,22 +346,24 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading):
        # Save the evaluation data to a JSON file
        await save_json(ticker, data)
        print(f"Saved results for {ticker}")
-    except:
+    except Exception as e:
+        print(e)
        pass

async def run():
    train_mode = False # Set this to False for fine-tuning and evaluation
    skip_downloading = False
+    save_data = train_mode
    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")

    if train_mode:
        # Warm start training
-        warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))
+        warm_start_symbols = list(set(['CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO']))

        print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
-        await warm_start_training(warm_start_symbols, con, skip_downloading)
+        await warm_start_training(warm_start_symbols, con, skip_downloading, save_data)
    else:
        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
        end_date = datetime.today().strftime("%Y-%m-%d")
@@ -374,7 +377,7 @@ async def run():
        """)
        stock_symbols = [row[0] for row in cursor.fetchall()]
        for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading)
+            await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data)


    con.close()
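The common thread in the hunks above is a new save_data flag: run() derives it from train_mode and passes it down through warm_start_training, chunked_gather, and fine_tune_and_evaluate into download_data, so the per-ticker training JSON is only written during warm-start training runs. Below is a minimal sketch of that gating logic; the helper name maybe_save_training_data is illustrative and not part of the repository.

import orjson
import pandas as pd

def maybe_save_training_data(df: pd.DataFrame, file_path: str, save_data: bool) -> bool:
    # Round floats the same way the script does before persisting.
    df_copy = df.copy().map(lambda x: round(x, 2) if isinstance(x, float) else x)
    # Only write the per-ticker JSON when the caller runs in training mode.
    if not df_copy.empty and save_data:
        with open(file_path, "wb") as file:
            file.write(orjson.dumps(df_copy.to_dict(orient="records")))
        return True
    return False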
@@ -367,7 +367,7 @@ etf_cursor.execute("PRAGMA journal_mode = wal")
etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
etf_symbols = [row[0] for row in etf_cursor.fetchall()]

-total_symbols = ['SPY'] #stock_symbols + etf_symbols
+total_symbols = stock_symbols + etf_symbols

query_template = """
    SELECT date, close,change_percent
Binary file not shown.
@@ -1,14 +1,8 @@
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
-from xgboost import XGBClassifier
-from sklearn.ensemble import StackingClassifier, RandomForestClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.svm import SVC
-from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import lightgbm as lgb

@@ -24,51 +18,18 @@ import os

class ScorePredictor:
    def __init__(self):
-        self.scaler = StandardScaler()
+        self.scaler = MinMaxScaler()
        self.pca = PCA(n_components=0.95)

        # Define base models
-        self.xgb_model = XGBClassifier(
-            n_estimators=100,
-            max_depth=10,
-            learning_rate=0.001,
-            random_state=42,
-            n_jobs=10,
-            tree_method='gpu_hist',
-        )
-        '''
        self.lgb_model = lgb.LGBMClassifier(
-            n_estimators=100,
+            n_estimators=2000,
            learning_rate=0.001,
-            max_depth=10,
-            n_jobs=10
-        )
-        '''
-        self.rf_model = RandomForestClassifier(
-            n_estimators=100,
-            max_depth=10,
-            random_state=42,
+            max_depth=5,
+            num_leaves=2**5-1,
            n_jobs=10
        )

-        self.svc_model = SVC(probability=True, kernel='rbf')
-        self.knn_model = KNeighborsClassifier(n_neighbors=5)
-        self.nb_model = GaussianNB()
-
-
-        # Stacking ensemble (XGBoost + LightGBM) with Logistic Regression as meta-learner
-        self.model = StackingClassifier(
-            estimators=[
-                ('xgb', self.xgb_model),
-                #('lgb', self.lgb_model),
-                ('rf', self.rf_model),
-                ('svc', self.svc_model),
-                ('knn', self.knn_model),
-                ('nb', self.nb_model)
-            ],
-            final_estimator=LogisticRegression(),
-            n_jobs=10
-        )

        self.warm_start_model_path = 'ml_models/weights/ai-score/stacking_weights.pkl'

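After this hunk the class keeps a single LightGBM model behind MinMaxScaler/PCA preprocessing; the XGBoost, RandomForest, SVC, KNN, and Naive Bayes base learners and the stacking meta-learner are gone. The following standalone sketch shows the resulting pipeline on synthetic data; the fit order (scale, then PCA, then fit) is an assumption based on the attributes visible here, not the class's exact preprocess_train_data code, and the random data is purely illustrative.

import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Same hyperparameters as the new LGBMClassifier in this commit.
model = lgb.LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.001,
    max_depth=5,
    num_leaves=2**5 - 1,
    n_jobs=10,
)
scaler = MinMaxScaler()
pca = PCA(n_components=0.95)

# Synthetic stand-in features and binary targets.
X = np.random.rand(500, 40)
y = np.random.randint(0, 2, size=500)

X_reduced = pca.fit_transform(scaler.fit_transform(X))
model.fit(X_reduced, y)
probabilities = model.predict_proba(X_reduced)[:, 1]
print(probabilities[-1])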
@@ -87,8 +48,8 @@ class ScorePredictor:
    def warm_start_training(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
        if os.path.exists(self.warm_start_model_path):
-            with open(self.warm_start_model_path, 'rb') as f:
-                self.model = pickle.load(f)
+            os.remove(self.warm_start_model_path)
        self.model.fit(X_train, y_train)
        pickle.dump(self.model, open(self.warm_start_model_path, 'wb'))
        print("Warm start model saved.")
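With this change warm_start_training no longer reloads the pickled weights before fitting; any stale file is deleted, the model is refit from scratch, and the fresh model is dumped back to the same path. A small self-contained sketch of that save cycle follows; the path and the tiny model are illustrative only, not the repository's ml_models/weights tree.

import os
import pickle
import numpy as np
import lightgbm as lgb

weights_path = "stacking_weights.pkl"  # illustrative path

model = lgb.LGBMClassifier(n_estimators=50)
X = np.random.rand(100, 10)
y = np.random.randint(0, 2, size=100)

# Behaviour sketched from the hunk: drop stale weights, fit fresh, then persist.
if os.path.exists(weights_path):
    os.remove(weights_path)
model.fit(X, y)
pickle.dump(model, open(weights_path, "wb"))

# Later runs (e.g. fine-tuning/evaluation) can still reload the pickled model.
reloaded = pickle.load(open(weights_path, "rb"))
print(reloaded.predict(X[:5]))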
@@ -121,6 +82,7 @@ class ScorePredictor:
        print(f"ROC AUC: {round(test_roc_auc_score * 100)}%")

        last_prediction_prob = class_1_probabilities[-1]
+        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
        print(f"Last prediction probability: {last_prediction_prob}")

        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
Binary file not shown.
@@ -177,33 +177,6 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close',
        df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew()
        df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt()

-        # Price-volume correlations
-        df_features[f'price_volume_corr_{window}'] = (
-            df[price_col].rolling(window=window)
-            .corr(df[volume_col]))
-
-        # Higher-order moments of returns
-        returns = df[price_col].pct_change()
-        df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew()
-        df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt()
-
-    # Cross-sectional statistics
-    df_features['price_acceleration'] = df[price_col].diff().diff()
-    df_features['returns_acceleration'] = df[price_col].pct_change().diff()
-
-    # Advanced volatility estimators
-    df_features['parkinson_vol'] = np.sqrt(
-        1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))
-
-    df_features['garman_klass_vol'] = np.sqrt(
-        0.5 * np.log(df[high_col]/df[low_col])**2 -
-        (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2
-    )
-
-    # Dispersion measures
-    df_features['price_range'] = df[high_col] - df[low_col]
-    df_features['price_range_pct'] = df_features['price_range'] / df[price_col]
-
    # Clean up any NaN values
    df_features = df_features.dropna()
