update model

MuslemRahimi 2024-10-05 22:08:56 +02:00
parent 3e6ef8b540
commit 8521a4a404
4 changed files with 80 additions and 79 deletions

View File

@@ -11,6 +11,7 @@ import pandas as pd
 from tqdm import tqdm
 import concurrent.futures
 import re
+import random
 from itertools import combinations
 from dotenv import load_dotenv
@@ -41,6 +42,8 @@ async def fetch_historical_price(ticker):
                 historical_data = data.get('historical', [])
                 # Convert to DataFrame
                 df = pd.DataFrame(historical_data).reset_index(drop=True)
+                # Reverse the DataFrame so that the past dates are first
+                df = df.sort_values(by='date', ascending=True).reset_index(drop=True)
                 return df
             else:
                 raise Exception(f"Error fetching data: {response.status} {response.reason}")
@@ -82,8 +85,11 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     file_path = f"ml_models/training_data/ai-score/{ticker}.json"
     if os.path.exists(file_path):
-        with open(file_path, 'rb') as file:
-            return pd.DataFrame(orjson.loads(file.read()))
+        try:
+            with open(file_path, 'rb') as file:
+                return pd.DataFrame(orjson.loads(file.read()))
+        except:
+            return pd.DataFrame()
     elif skip_downloading == False:
         try:
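A minimal sketch of the guarded cache read introduced above, assuming orjson and pandas as imported in the script. The ticker in the path is illustrative, and the except clause is narrowed here to parse/IO errors rather than the bare except used in the diff:

```python
import os
import orjson
import pandas as pd

def load_cached_training_data(file_path: str) -> pd.DataFrame:
    # Return the cached frame if the file exists and parses; otherwise an empty frame
    if os.path.exists(file_path):
        try:
            with open(file_path, 'rb') as file:
                return pd.DataFrame(orjson.loads(file.read()))
        except (orjson.JSONDecodeError, OSError):
            return pd.DataFrame()
    return pd.DataFrame()

# Illustrative path following the pattern used in the script
print(load_cached_training_data("ml_models/training_data/ai-score/AAPL.json").shape)
```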
@@ -176,15 +182,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
             item['price'] = round(data['close'], 2)
             # Dynamically add indicator values from ta_columns and stats_columns
-            for column in ta_columns + stats_columns:
+            for column in ta_columns+stats_columns:
                 item[column] = data.get(column, None)
     # Sort the combined data by date
     combined_data = sorted(combined_data, key=lambda x: x['date'])
     # Convert combined data to a DataFrame and drop rows with NaN values
     df_combined = pd.DataFrame(combined_data).dropna()
     fundamental_columns = [
         'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
@@ -248,94 +252,96 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         pass
-async def chunked_gather(tickers, con, skip_downloading, chunk_size=10):
+async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
     df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
+    df_test_dict = {}  # Store test data for each ticker
+    all_test_data = []  # Store all test data for overall evaluation
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
         for i in range(0, len(lst), size):
-            yield lst[i:i+size]
+            yield lst[i:i + size]
-    dfs = []
-    for num, chunk in enumerate(tqdm(chunks(tickers, chunk_size))):
+    for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
         tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
-        # Accumulate the results
-        dfs.extend(chunk_results)
         train_list = []
-        test_list = []
-        for df in dfs:
+        for ticker, df in zip(chunk, chunk_results):
             try:
-                # Split the data into training and testing sets
                 split_size = int(len(df) * (1 - test_size))
                 train_data = df.iloc[:split_size]
                 test_data = df.iloc[split_size:]
-                # Append to the lists
+                # Store test data for this ticker in a dictionary
+                df_test_dict[ticker] = test_data
+                # Append train data for combined training
                 train_list.append(train_data)
-                test_list.append(test_data)
+                # Collect all test data for overall evaluation
+                all_test_data.append(test_data)
             except:
                 pass
-        # Concatenate all at once outside the loop
-        df_train = pd.concat(train_list, ignore_index=True)
-        df_test = pd.concat(test_list, ignore_index=True)
-        df_train = df_train.sample(frac=1).reset_index(drop=True)
-        df_test = df_test.sample(frac=1).reset_index(drop=True)
-        print('======Warm Start Train Set Datapoints======')
-        print(f'Batch Training: {num}')
-        print(len(df_train))
+        # Concatenate all train data together
+        if train_list:
+            df_train = pd.concat(train_list, ignore_index=True)
+        # Shuffle the combined training data
+        df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+        print('====== Start Training Model on Combined Data ======')
         predictor = ScorePredictor()
-        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200)
+        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+        # Train the model on the combined training data
         predictor.warm_start_training(df_train[selected_features], df_train['Target'])
-        predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+        print(f'Training complete on {len(df_train)} samples.')
+        # Evaluate the model on the overall test dataset
+        if all_test_data:
+            overall_test_data = pd.concat(all_test_data, ignore_index=True)
+            print('====== Evaluating on Overall Test Dataset ======')
+            overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target'])
+            print(f'Overall Evaluation Metrics: {overall_evaluation_data}')
+        # Evaluate the model for each ticker separately
+        for ticker, test_data in df_test_dict.items():
+            try:
+                print(f"Fine-tuning the model for {ticker}")
+                predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
+                print(f"Evaluating model for {ticker}")
+                data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
+                # Check if the evaluation data meets the criteria
+                '''
+                if (data['precision'] >= 50 and data['accuracy'] >= 50 and
+                    data['accuracy'] < 100 and data['precision'] < 100 and
+                    data['f1_score'] >= 50 and data['recall_score'] >= 50 and
+                    data['roc_auc_score'] >= 50):
+                '''
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
+            except Exception as e:
+                print(e)
+                pass
 async def warm_start_training(tickers, con, skip_downloading):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=50)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
-async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading):
-    try:
-        df = await download_data(ticker,con, start_date, end_date, skip_downloading)
-        if df is None or len(df) == 0:
-            print(f"No data available for {ticker}")
-            return
-        test_size = 0.2
-        split_size = int(len(df) * (1-test_size))
-        train_data = df.iloc[:split_size]
-        test_data = df.iloc[split_size:]
-        selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
-        # Fine-tune the model
-        predictor = ScorePredictor()
-        predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
-        print(f"Evaluating fine-tuned model for {ticker}")
-        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-        if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100 and data['f1_score'] >= 50 and data['recall_score'] >= 50 and data['roc_auc_score'] >= 50:
-                await save_json(ticker, data)
-                print(f"Saved results for {ticker}")
-        gc.collect()
-    except Exception as e:
-        print(f"Error processing {ticker}: {e}")
-    finally:
-        # Ensure any remaining cleanup if necessary
-        if 'predictor' in locals():
-            del predictor # Explicitly delete the predictor to aid garbage collection
 async def run():
     train_mode = True # Set this to False for fine-tuning and evaluation
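The reworked chunked_gather above drives everything from one loop: download a chunk of tickers concurrently, split each frame chronologically, pool the training portions, and keep a per-ticker test set. A self-contained sketch of that pattern with a dummy downloader follows; the names (fake_download, gather_in_chunks) are illustrative, not the repo's API:

```python
import asyncio
import pandas as pd

async def fake_download(ticker: str) -> pd.DataFrame:
    # Stand-in for the real downloader; returns a tiny date-ordered frame
    return pd.DataFrame({"date": pd.date_range("2024-01-01", periods=10),
                         "Target": range(10)})

def chunks(lst, size):
    # Yield successive slices of the ticker list
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

async def gather_in_chunks(tickers, chunk_size=2, test_size=0.2):
    df_test_dict, train_list = {}, []
    for chunk in chunks(tickers, chunk_size):
        results = await asyncio.gather(*(fake_download(t) for t in chunk))
        for ticker, df in zip(chunk, results):
            split = int(len(df) * (1 - test_size))
            train_list.append(df.iloc[:split])      # pooled training data
            df_test_dict[ticker] = df.iloc[split:]  # per-ticker test data
    df_train = pd.concat(train_list, ignore_index=True)
    return df_train, df_test_dict

df_train, df_test_dict = asyncio.run(gather_in_chunks(["A", "B", "C"]))
print(len(df_train), list(df_test_dict))
```

Splitting each ticker chronologically (earliest rows for training, latest for testing) avoids leaking future prices into the training set, which is why the change also sorts the downloaded history ascending by date.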
@@ -346,22 +352,18 @@ async def run():
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
-        warm_start_symbols = [row[0] for row in cursor.fetchall()]
+        cursor.execute("""
+            SELECT DISTINCT symbol
+            FROM stocks
+            WHERE marketCap >= 500E6
+              AND symbol NOT LIKE '%.%'
+              AND symbol NOT LIKE '%-%'
+            ORDER BY marketCap DESC;
+        """)
+        warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
         print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
         await warm_start_training(warm_start_symbols, con, skip_downloading)
-    else:
-        # Fine-tuning and evaluation for all stocks
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
-        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
-        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-        end_date = datetime.today().strftime("%Y-%m-%d")
-        tasks = []
-        for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading)
     con.close()
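The query above selects the warm-start universe ordered by market cap, although the fetched result is currently stubbed to ['A'] in the commit. A small sketch of running the same query on its own, assuming a local SQLite database file (the 'stocks.db' filename and the stocks table schema are assumptions):

```python
import sqlite3

con = sqlite3.connect("stocks.db")  # assumed local database with a 'stocks' table
cursor = con.cursor()
cursor.execute("""
    SELECT DISTINCT symbol
    FROM stocks
    WHERE marketCap >= 500E6
      AND symbol NOT LIKE '%.%'
      AND symbol NOT LIKE '%-%'
    ORDER BY marketCap DESC;
""")
warm_start_symbols = [row[0] for row in cursor.fetchall()]
print(f"Warm Start Training: Total Tickers {len(warm_start_symbols)}")
con.close()
```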

View File

@@ -13,9 +13,8 @@ import asyncio
 import aiohttp
 import aiofiles
 import pickle
-import os
 import time
+import os
 class ScorePredictor:
@@ -24,10 +23,10 @@ class ScorePredictor:
         self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
         self.model = lgb.LGBMClassifier(
-            n_estimators=200, # Number of boosting iterations - good balance between performance and training time
+            n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
             learning_rate=0.005, # Smaller learning rate for better generalization
-            max_depth=8, # Controlled depth to prevent overfitting
-            num_leaves=32, # 2^max_depth, prevents overfitting while maintaining model complexity
+            max_depth=12, # Controlled depth to prevent overfitting
+            num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
             colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
             subsample=0.8, # Use 80% of data per tree to reduce overfitting
             min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
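For context, a standalone sketch of the updated classifier configuration as shown above. With max_depth=12, num_leaves=2**12 (4096) is the maximum leaf count that depth allows, so the depth cap no longer constrains tree size much; the much larger n_estimators=20_000 leans on the small learning rate to keep training controlled. This mirrors the parameters in the diff but is not a drop-in replacement for the class:

```python
import lightgbm as lgb

model = lgb.LGBMClassifier(
    n_estimators=20_000,   # many boosting rounds at a small learning rate
    learning_rate=0.005,
    max_depth=12,
    num_leaves=2**12,      # 4096 leaves: the maximum possible at depth 12
    colsample_bytree=0.8,  # feature subsampling per tree
    subsample=0.8,         # row subsampling per tree
    min_child_samples=20,  # minimum samples per leaf
)
```

One caveat: LightGBM only applies subsample (bagging) when subsample_freq is set above zero, which this configuration does not do, so the 0.8 row fraction has no effect as written.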