From 8521a4a4046699e83e6a4eb5013e0602454f8acd Mon Sep 17 00:00:00 2001 From: MuslemRahimi Date: Sat, 5 Oct 2024 22:08:56 +0200 Subject: [PATCH] update model --- app/cron_ai_score.py | 150 +++++++++--------- .../__pycache__/score_model.cpython-310.pyc | Bin 4004 -> 4004 bytes app/ml_models/score_model.py | 9 +- .../feature_engineering.cpython-310.pyc | Bin 6816 -> 6816 bytes 4 files changed, 80 insertions(+), 79 deletions(-) diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py index 43f8b43..bbeaeae 100644 --- a/app/cron_ai_score.py +++ b/app/cron_ai_score.py @@ -11,6 +11,7 @@ import pandas as pd from tqdm import tqdm import concurrent.futures import re +import random from itertools import combinations from dotenv import load_dotenv @@ -41,6 +42,8 @@ async def fetch_historical_price(ticker): historical_data = data.get('historical', []) # Convert to DataFrame df = pd.DataFrame(historical_data).reset_index(drop=True) + # Reverse the DataFrame so that the past dates are first + df = df.sort_values(by='date', ascending=True).reset_index(drop=True) return df else: raise Exception(f"Error fetching data: {response.status} {response.reason}") @@ -82,8 +85,11 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): file_path = f"ml_models/training_data/ai-score/{ticker}.json" if os.path.exists(file_path): - with open(file_path, 'rb') as file: - return pd.DataFrame(orjson.loads(file.read())) + try: + with open(file_path, 'rb') as file: + return pd.DataFrame(orjson.loads(file.read())) + except: + return pd.DataFrame() elif skip_downloading == False: try: @@ -176,15 +182,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): item['price'] = round(data['close'], 2) # Dynamically add indicator values from ta_columns and stats_columns - for column in ta_columns + stats_columns: + for column in ta_columns+stats_columns: item[column] = data.get(column, None) # Sort the combined data by date combined_data = sorted(combined_data, key=lambda x: x['date']) - # Convert combined data to a DataFrame and drop rows with NaN values df_combined = pd.DataFrame(combined_data).dropna() - fundamental_columns = [ 'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses', @@ -248,94 +252,96 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): pass -async def chunked_gather(tickers, con, skip_downloading, chunk_size=10): +async def chunked_gather(tickers, con, skip_downloading, chunk_size): test_size = 0.2 start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") df_train = pd.DataFrame() - df_test = pd.DataFrame() + df_test_dict = {} # Store test data for each ticker + all_test_data = [] # Store all test data for overall evaluation # Helper function to divide the tickers into chunks def chunks(lst, size): for i in range(0, len(lst), size): - yield lst[i:i+size] - - dfs = [] - - for num, chunk in enumerate(tqdm(chunks(tickers, chunk_size))): + yield lst[i:i + size] + + for chunk in tqdm(chunks(tickers, chunk_size)): # Create tasks for each chunk tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk] # Await the results for the current chunk chunk_results = await asyncio.gather(*tasks) - # Accumulate the results - dfs.extend(chunk_results) train_list = [] - test_list = [] - for df in dfs: + for ticker, df in zip(chunk, chunk_results): try: + # Split the data into training and testing sets split_size = int(len(df) * (1 - test_size)) train_data = df.iloc[:split_size] test_data = df.iloc[split_size:] + + # Store test data for this ticker in a dictionary + df_test_dict[ticker] = test_data - # Append to the lists + # Append train data for combined training train_list.append(train_data) - test_list.append(test_data) + + # Collect all test data for overall evaluation + all_test_data.append(test_data) + except: pass - # Concatenate all at once outside the loop - df_train = pd.concat(train_list, ignore_index=True) - df_test = pd.concat(test_list, ignore_index=True) - df_train = df_train.sample(frac=1).reset_index(drop=True) - df_test = df_test.sample(frac=1).reset_index(drop=True) + # Concatenate all train data together + if train_list: + df_train = pd.concat(train_list, ignore_index=True) - print('======Warm Start Train Set Datapoints======') - print(f'Batch Training: {num}') - print(len(df_train)) + # Shuffle the combined training data + df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True) + print('====== Start Training Model on Combined Data ======') predictor = ScorePredictor() - selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200) + selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] + + # Train the model on the combined training data predictor.warm_start_training(df_train[selected_features], df_train['Target']) - predictor.evaluate_model(df_test[selected_features], df_test['Target']) + print(f'Training complete on {len(df_train)} samples.') + + # Evaluate the model on the overall test dataset + if all_test_data: + overall_test_data = pd.concat(all_test_data, ignore_index=True) + print('====== Evaluating on Overall Test Dataset ======') + overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target']) + print(f'Overall Evaluation Metrics: {overall_evaluation_data}') + + # Evaluate the model for each ticker separately + for ticker, test_data in df_test_dict.items(): + try: + print(f"Fine-tuning the model for {ticker}") + predictor.fine_tune_model(df_train[selected_features], df_train['Target']) + + print(f"Evaluating model for {ticker}") + data = predictor.evaluate_model(test_data[selected_features], test_data['Target']) + + # Check if the evaluation data meets the criteria + ''' + if (data['precision'] >= 50 and data['accuracy'] >= 50 and + data['accuracy'] < 100 and data['precision'] < 100 and + data['f1_score'] >= 50 and data['recall_score'] >= 50 and + data['roc_auc_score'] >= 50): + ''' + # Save the evaluation data to a JSON file + await save_json(ticker, data) + print(f"Saved results for {ticker}") + except Exception as e: + print(e) + pass async def warm_start_training(tickers, con, skip_downloading): - dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=50) + dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220) -async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading): - try: - df = await download_data(ticker,con, start_date, end_date, skip_downloading) - if df is None or len(df) == 0: - print(f"No data available for {ticker}") - return - - test_size = 0.2 - split_size = int(len(df) * (1-test_size)) - train_data = df.iloc[:split_size] - test_data = df.iloc[split_size:] - - selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20) - # Fine-tune the model - predictor = ScorePredictor() - predictor.fine_tune_model(train_data[selected_features], train_data['Target']) - - print(f"Evaluating fine-tuned model for {ticker}") - data = predictor.evaluate_model(test_data[selected_features], test_data['Target']) - - if len(data) != 0: - if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100 and data['f1_score'] >= 50 and data['recall_score'] >= 50 and data['roc_auc_score'] >= 50: - await save_json(ticker, data) - print(f"Saved results for {ticker}") - gc.collect() - except Exception as e: - print(f"Error processing {ticker}: {e}") - finally: - # Ensure any remaining cleanup if necessary - if 'predictor' in locals(): - del predictor # Explicitly delete the predictor to aid garbage collection async def run(): train_mode = True # Set this to False for fine-tuning and evaluation @@ -346,22 +352,18 @@ async def run(): if train_mode: # Warm start training - cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'") - warm_start_symbols = [row[0] for row in cursor.fetchall()] + cursor.execute(""" + SELECT DISTINCT symbol + FROM stocks + WHERE marketCap >= 500E6 + AND symbol NOT LIKE '%.%' + AND symbol NOT LIKE '%-%' + ORDER BY marketCap DESC; + """) + warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()] + print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}') await warm_start_training(warm_start_symbols, con, skip_downloading) - else: - # Fine-tuning and evaluation for all stocks - cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'") - stock_symbols = [row[0] for row in cursor.fetchall()] - - print(f"Total tickers for fine-tuning: {len(stock_symbols)}") - start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") - end_date = datetime.today().strftime("%Y-%m-%d") - - tasks = [] - for ticker in tqdm(stock_symbols): - await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading) con.close() diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc index 129f98c6f0e93d83d6b7dd0092a13f5d18278f53..bfb68adbd9cd285da2a09a5da4f9d6dcdb86c545 100644 GIT binary patch delta 735 zcmZXS&ui2`6vub6yGgg3WFu^A{ekUPySCc3R;%j|x{>uzSE-6ESOf`SGf9KlBur+7 zu3-NGFJ-`k=OTDekY0P}MZudFFLP5-?9r!!22`RKh#>IOF6QB&l1ji(5<0OGQerP#1wWk@)-$kd{yZqJ0BwVNg z&H$zWX8~!k!u~?!9Ax&?U=8_@dxZ~TyCHTa`p66vo&`Wf z(g2u%D&QKR3Ahf})3w<05;$4lWOcx~CBy_K-;WHWqocp|tcrTa7TVEd?3#70$nCJS&jmS=O5ca7ysL}jzPBRZqxVJGi|MY3hdKaq=n{Sdjk*y z{(g75g5QGZ0%f+SR}WgSl>k`*o)5d7c!f-|HT@|%!HiOU45rF+)bC(&1 Hs-x^5_Y|`- delta 735 zcmZXSOKa3n6vulrnWWQ6G9%itKClm+GObQ)tISZ9W-OIbTd7dNg2)&qH)$}F#G6c^ zBiJwCrX27S6v2gpblZh41l_rDlbwikRemeh^L(G7eu;OQZl(;;+TCT!>Z6iRuwE zywLJllbz$S7XlkYiL9}4{_f&DM*0Ir1Likq7-Ej&)lDNZrXfn-CS`b+zB)UB36qG6 zh$+N0B59U6+=yL9P@VClLteEqmSf?Mo$0MQVMCGS#9{c`OKU!e3VNz zf4Ke7Z7Z>Fx_v3JFFIR>FguwoaJiG=U`OC(c7!a!$L!tif=WJ8newJ`k0-^MXr0k;-FMA6y~ z#NT@-C;Tm##?cRVw9(@a&=n9VjyUbLn!zTUfNkvw8HIeVTCAaX4Mj0>nl#D%uPC^= HDiQwxz(};4 diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py index 5b5b729..e0bc768 100644 --- a/app/ml_models/score_model.py +++ b/app/ml_models/score_model.py @@ -13,9 +13,8 @@ import asyncio import aiohttp import aiofiles import pickle -import os import time - +import os class ScorePredictor: @@ -24,10 +23,10 @@ class ScorePredictor: self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl' self.model = lgb.LGBMClassifier( - n_estimators=200, # Number of boosting iterations - good balance between performance and training time + n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time learning_rate=0.005, # Smaller learning rate for better generalization - max_depth=8, # Controlled depth to prevent overfitting - num_leaves=32, # 2^max_depth, prevents overfitting while maintaining model complexity + max_depth=12, # Controlled depth to prevent overfitting + num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting subsample=0.8, # Use 80% of data per tree to reduce overfitting min_child_samples=20, # Minimum samples per leaf to ensure reliable splits diff --git a/app/utils/__pycache__/feature_engineering.cpython-310.pyc b/app/utils/__pycache__/feature_engineering.cpython-310.pyc index 6ef30aa95316658e09d1ef1730e01c390a04c78d..842219bcf283b2cf07a29bb239b0f832477a6b62 100644 GIT binary patch delta 21 bcmZ2ry1