update model
This commit is contained in:
parent
3e6ef8b540
commit
8521a4a404
@ -11,6 +11,7 @@ import pandas as pd
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import re
|
import re
|
||||||
|
import random
|
||||||
from itertools import combinations
|
from itertools import combinations
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
@ -41,6 +42,8 @@ async def fetch_historical_price(ticker):
|
|||||||
historical_data = data.get('historical', [])
|
historical_data = data.get('historical', [])
|
||||||
# Convert to DataFrame
|
# Convert to DataFrame
|
||||||
df = pd.DataFrame(historical_data).reset_index(drop=True)
|
df = pd.DataFrame(historical_data).reset_index(drop=True)
|
||||||
|
# Reverse the DataFrame so that the past dates are first
|
||||||
|
df = df.sort_values(by='date', ascending=True).reset_index(drop=True)
|
||||||
return df
|
return df
|
||||||
else:
|
else:
|
||||||
raise Exception(f"Error fetching data: {response.status} {response.reason}")
|
raise Exception(f"Error fetching data: {response.status} {response.reason}")
|
||||||
@ -82,8 +85,11 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
|
|||||||
file_path = f"ml_models/training_data/ai-score/{ticker}.json"
|
file_path = f"ml_models/training_data/ai-score/{ticker}.json"
|
||||||
|
|
||||||
if os.path.exists(file_path):
|
if os.path.exists(file_path):
|
||||||
with open(file_path, 'rb') as file:
|
try:
|
||||||
return pd.DataFrame(orjson.loads(file.read()))
|
with open(file_path, 'rb') as file:
|
||||||
|
return pd.DataFrame(orjson.loads(file.read()))
|
||||||
|
except:
|
||||||
|
return pd.DataFrame()
|
||||||
elif skip_downloading == False:
|
elif skip_downloading == False:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -176,15 +182,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
|
|||||||
item['price'] = round(data['close'], 2)
|
item['price'] = round(data['close'], 2)
|
||||||
|
|
||||||
# Dynamically add indicator values from ta_columns and stats_columns
|
# Dynamically add indicator values from ta_columns and stats_columns
|
||||||
for column in ta_columns + stats_columns:
|
for column in ta_columns+stats_columns:
|
||||||
item[column] = data.get(column, None)
|
item[column] = data.get(column, None)
|
||||||
|
|
||||||
# Sort the combined data by date
|
# Sort the combined data by date
|
||||||
combined_data = sorted(combined_data, key=lambda x: x['date'])
|
combined_data = sorted(combined_data, key=lambda x: x['date'])
|
||||||
|
|
||||||
# Convert combined data to a DataFrame and drop rows with NaN values
|
# Convert combined data to a DataFrame and drop rows with NaN values
|
||||||
df_combined = pd.DataFrame(combined_data).dropna()
|
df_combined = pd.DataFrame(combined_data).dropna()
|
||||||
|
|
||||||
|
|
||||||
fundamental_columns = [
|
fundamental_columns = [
|
||||||
'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
|
'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
|
||||||
@ -248,94 +252,96 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def chunked_gather(tickers, con, skip_downloading, chunk_size=10):
|
async def chunked_gather(tickers, con, skip_downloading, chunk_size):
|
||||||
test_size = 0.2
|
test_size = 0.2
|
||||||
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
|
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
|
||||||
end_date = datetime.today().strftime("%Y-%m-%d")
|
end_date = datetime.today().strftime("%Y-%m-%d")
|
||||||
df_train = pd.DataFrame()
|
df_train = pd.DataFrame()
|
||||||
df_test = pd.DataFrame()
|
df_test_dict = {} # Store test data for each ticker
|
||||||
|
all_test_data = [] # Store all test data for overall evaluation
|
||||||
|
|
||||||
# Helper function to divide the tickers into chunks
|
# Helper function to divide the tickers into chunks
|
||||||
def chunks(lst, size):
|
def chunks(lst, size):
|
||||||
for i in range(0, len(lst), size):
|
for i in range(0, len(lst), size):
|
||||||
yield lst[i:i+size]
|
yield lst[i:i + size]
|
||||||
|
|
||||||
dfs = []
|
for chunk in tqdm(chunks(tickers, chunk_size)):
|
||||||
|
|
||||||
for num, chunk in enumerate(tqdm(chunks(tickers, chunk_size))):
|
|
||||||
# Create tasks for each chunk
|
# Create tasks for each chunk
|
||||||
tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
|
tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
|
||||||
# Await the results for the current chunk
|
# Await the results for the current chunk
|
||||||
chunk_results = await asyncio.gather(*tasks)
|
chunk_results = await asyncio.gather(*tasks)
|
||||||
# Accumulate the results
|
|
||||||
dfs.extend(chunk_results)
|
|
||||||
|
|
||||||
train_list = []
|
train_list = []
|
||||||
test_list = []
|
|
||||||
|
|
||||||
for df in dfs:
|
for ticker, df in zip(chunk, chunk_results):
|
||||||
try:
|
try:
|
||||||
|
# Split the data into training and testing sets
|
||||||
split_size = int(len(df) * (1 - test_size))
|
split_size = int(len(df) * (1 - test_size))
|
||||||
train_data = df.iloc[:split_size]
|
train_data = df.iloc[:split_size]
|
||||||
test_data = df.iloc[split_size:]
|
test_data = df.iloc[split_size:]
|
||||||
|
|
||||||
|
# Store test data for this ticker in a dictionary
|
||||||
|
df_test_dict[ticker] = test_data
|
||||||
|
|
||||||
# Append to the lists
|
# Append train data for combined training
|
||||||
train_list.append(train_data)
|
train_list.append(train_data)
|
||||||
test_list.append(test_data)
|
|
||||||
|
# Collect all test data for overall evaluation
|
||||||
|
all_test_data.append(test_data)
|
||||||
|
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Concatenate all at once outside the loop
|
# Concatenate all train data together
|
||||||
df_train = pd.concat(train_list, ignore_index=True)
|
if train_list:
|
||||||
df_test = pd.concat(test_list, ignore_index=True)
|
df_train = pd.concat(train_list, ignore_index=True)
|
||||||
df_train = df_train.sample(frac=1).reset_index(drop=True)
|
|
||||||
df_test = df_test.sample(frac=1).reset_index(drop=True)
|
|
||||||
|
|
||||||
print('======Warm Start Train Set Datapoints======')
|
# Shuffle the combined training data
|
||||||
print(f'Batch Training: {num}')
|
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
|
||||||
print(len(df_train))
|
|
||||||
|
|
||||||
|
print('====== Start Training Model on Combined Data ======')
|
||||||
predictor = ScorePredictor()
|
predictor = ScorePredictor()
|
||||||
selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200)
|
selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
|
||||||
|
|
||||||
|
# Train the model on the combined training data
|
||||||
predictor.warm_start_training(df_train[selected_features], df_train['Target'])
|
predictor.warm_start_training(df_train[selected_features], df_train['Target'])
|
||||||
predictor.evaluate_model(df_test[selected_features], df_test['Target'])
|
print(f'Training complete on {len(df_train)} samples.')
|
||||||
|
|
||||||
|
# Evaluate the model on the overall test dataset
|
||||||
|
if all_test_data:
|
||||||
|
overall_test_data = pd.concat(all_test_data, ignore_index=True)
|
||||||
|
print('====== Evaluating on Overall Test Dataset ======')
|
||||||
|
overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target'])
|
||||||
|
print(f'Overall Evaluation Metrics: {overall_evaluation_data}')
|
||||||
|
|
||||||
|
# Evaluate the model for each ticker separately
|
||||||
|
for ticker, test_data in df_test_dict.items():
|
||||||
|
try:
|
||||||
|
print(f"Fine-tuning the model for {ticker}")
|
||||||
|
predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
|
||||||
|
|
||||||
|
print(f"Evaluating model for {ticker}")
|
||||||
|
data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
|
||||||
|
|
||||||
|
# Check if the evaluation data meets the criteria
|
||||||
|
'''
|
||||||
|
if (data['precision'] >= 50 and data['accuracy'] >= 50 and
|
||||||
|
data['accuracy'] < 100 and data['precision'] < 100 and
|
||||||
|
data['f1_score'] >= 50 and data['recall_score'] >= 50 and
|
||||||
|
data['roc_auc_score'] >= 50):
|
||||||
|
'''
|
||||||
|
# Save the evaluation data to a JSON file
|
||||||
|
await save_json(ticker, data)
|
||||||
|
print(f"Saved results for {ticker}")
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def warm_start_training(tickers, con, skip_downloading):
|
async def warm_start_training(tickers, con, skip_downloading):
|
||||||
|
|
||||||
dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=50)
|
dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
|
||||||
|
|
||||||
async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading):
|
|
||||||
try:
|
|
||||||
df = await download_data(ticker,con, start_date, end_date, skip_downloading)
|
|
||||||
if df is None or len(df) == 0:
|
|
||||||
print(f"No data available for {ticker}")
|
|
||||||
return
|
|
||||||
|
|
||||||
test_size = 0.2
|
|
||||||
split_size = int(len(df) * (1-test_size))
|
|
||||||
train_data = df.iloc[:split_size]
|
|
||||||
test_data = df.iloc[split_size:]
|
|
||||||
|
|
||||||
selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
|
|
||||||
# Fine-tune the model
|
|
||||||
predictor = ScorePredictor()
|
|
||||||
predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
|
|
||||||
|
|
||||||
print(f"Evaluating fine-tuned model for {ticker}")
|
|
||||||
data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
|
|
||||||
|
|
||||||
if len(data) != 0:
|
|
||||||
if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100 and data['f1_score'] >= 50 and data['recall_score'] >= 50 and data['roc_auc_score'] >= 50:
|
|
||||||
await save_json(ticker, data)
|
|
||||||
print(f"Saved results for {ticker}")
|
|
||||||
gc.collect()
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error processing {ticker}: {e}")
|
|
||||||
finally:
|
|
||||||
# Ensure any remaining cleanup if necessary
|
|
||||||
if 'predictor' in locals():
|
|
||||||
del predictor # Explicitly delete the predictor to aid garbage collection
|
|
||||||
|
|
||||||
async def run():
|
async def run():
|
||||||
train_mode = True # Set this to False for fine-tuning and evaluation
|
train_mode = True # Set this to False for fine-tuning and evaluation
|
||||||
@ -346,22 +352,18 @@ async def run():
|
|||||||
|
|
||||||
if train_mode:
|
if train_mode:
|
||||||
# Warm start training
|
# Warm start training
|
||||||
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
|
cursor.execute("""
|
||||||
warm_start_symbols = [row[0] for row in cursor.fetchall()]
|
SELECT DISTINCT symbol
|
||||||
|
FROM stocks
|
||||||
|
WHERE marketCap >= 500E6
|
||||||
|
AND symbol NOT LIKE '%.%'
|
||||||
|
AND symbol NOT LIKE '%-%'
|
||||||
|
ORDER BY marketCap DESC;
|
||||||
|
""")
|
||||||
|
warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
|
print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
|
||||||
await warm_start_training(warm_start_symbols, con, skip_downloading)
|
await warm_start_training(warm_start_symbols, con, skip_downloading)
|
||||||
else:
|
|
||||||
# Fine-tuning and evaluation for all stocks
|
|
||||||
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'")
|
|
||||||
stock_symbols = [row[0] for row in cursor.fetchall()]
|
|
||||||
|
|
||||||
print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
|
|
||||||
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
|
|
||||||
end_date = datetime.today().strftime("%Y-%m-%d")
|
|
||||||
|
|
||||||
tasks = []
|
|
||||||
for ticker in tqdm(stock_symbols):
|
|
||||||
await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading)
|
|
||||||
|
|
||||||
con.close()
|
con.close()
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@ -13,9 +13,8 @@ import asyncio
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import pickle
|
import pickle
|
||||||
import os
|
|
||||||
import time
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
class ScorePredictor:
|
class ScorePredictor:
|
||||||
@ -24,10 +23,10 @@ class ScorePredictor:
|
|||||||
self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
|
self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
|
||||||
self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
|
self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
|
||||||
self.model = lgb.LGBMClassifier(
|
self.model = lgb.LGBMClassifier(
|
||||||
n_estimators=200, # Number of boosting iterations - good balance between performance and training time
|
n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
|
||||||
learning_rate=0.005, # Smaller learning rate for better generalization
|
learning_rate=0.005, # Smaller learning rate for better generalization
|
||||||
max_depth=8, # Controlled depth to prevent overfitting
|
max_depth=12, # Controlled depth to prevent overfitting
|
||||||
num_leaves=32, # 2^max_depth, prevents overfitting while maintaining model complexity
|
num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
|
||||||
colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
|
colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
|
||||||
subsample=0.8, # Use 80% of data per tree to reduce overfitting
|
subsample=0.8, # Use 80% of data per tree to reduce overfitting
|
||||||
min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
|
min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user