From 8521a4a4046699e83e6a4eb5013e0602454f8acd Mon Sep 17 00:00:00 2001
From: MuslemRahimi <moslem_rahimi@hotmail.de>
Date: Sat, 5 Oct 2024 22:08:56 +0200
Subject: [PATCH] update model

---
 app/cron_ai_score.py                          | 150 +++++++++---------
 .../__pycache__/score_model.cpython-310.pyc   | Bin 4004 -> 4004 bytes
 app/ml_models/score_model.py                  |   9 +-
 .../feature_engineering.cpython-310.pyc       | Bin 6816 -> 6816 bytes
 4 files changed, 80 insertions(+), 79 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 43f8b43..bbeaeae 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -11,6 +11,7 @@ import pandas as pd
 from tqdm import tqdm
 import concurrent.futures
 import re
+import random
 from itertools import combinations
 
 from dotenv import load_dotenv
@@ -41,6 +42,8 @@ async def fetch_historical_price(ticker):
                 historical_data = data.get('historical', [])
                 # Convert to DataFrame
                 df = pd.DataFrame(historical_data).reset_index(drop=True)
+                # Reverse the DataFrame so that the past dates are first
+                df = df.sort_values(by='date', ascending=True).reset_index(drop=True)
                 return df
             else:
                 raise Exception(f"Error fetching data: {response.status} {response.reason}")
@@ -82,8 +85,11 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     file_path = f"ml_models/training_data/ai-score/{ticker}.json"
 
     if os.path.exists(file_path):
-        with open(file_path, 'rb') as file:
-            return pd.DataFrame(orjson.loads(file.read()))
+        try:
+            with open(file_path, 'rb') as file:
+                return pd.DataFrame(orjson.loads(file.read()))
+        except:
+            return pd.DataFrame()
     elif skip_downloading == False:
 
         try:
@@ -176,15 +182,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
                 item['price'] = round(data['close'], 2)
 
                 # Dynamically add indicator values from ta_columns and stats_columns
-                for column in ta_columns + stats_columns:
+                for column in ta_columns+stats_columns:
                     item[column] = data.get(column, None)
 
             # Sort the combined data by date
             combined_data = sorted(combined_data, key=lambda x: x['date'])
-
             # Convert combined data to a DataFrame and drop rows with NaN values
             df_combined = pd.DataFrame(combined_data).dropna()
-
             
             fundamental_columns = [
                 'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
@@ -248,94 +252,96 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
             pass
 
 
-async def chunked_gather(tickers, con, skip_downloading, chunk_size=10):
+async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
     df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
+    df_test_dict = {}  # Store test data for each ticker
+    all_test_data = []  # Store all test data for overall evaluation
 
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
         for i in range(0, len(lst), size):
-            yield lst[i:i+size]
-    
-    dfs = []
-    
-    for num, chunk in enumerate(tqdm(chunks(tickers, chunk_size))):
+            yield lst[i:i + size]
+
+    for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
         tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
-        # Accumulate the results
-        dfs.extend(chunk_results)
 
         train_list = []
-        test_list = []
 
-        for df in dfs:
+        for ticker, df in zip(chunk, chunk_results):
             try:
+                # Split the data into training and testing sets
                 split_size = int(len(df) * (1 - test_size))
                 train_data = df.iloc[:split_size]
                 test_data = df.iloc[split_size:]
+
+                # Store test data for this ticker in a dictionary
+                df_test_dict[ticker] = test_data
                 
-                # Append to the lists
+                # Append train data for combined training
                 train_list.append(train_data)
-                test_list.append(test_data)
+
+                # Collect all test data for overall evaluation
+                all_test_data.append(test_data)
+
             except:
                 pass
 
-        # Concatenate all at once outside the loop
-        df_train = pd.concat(train_list, ignore_index=True)
-        df_test = pd.concat(test_list, ignore_index=True)
-        df_train = df_train.sample(frac=1).reset_index(drop=True)
-        df_test = df_test.sample(frac=1).reset_index(drop=True)
+        # Concatenate all train data together
+        if train_list:
+            df_train = pd.concat(train_list, ignore_index=True)
 
-        print('======Warm Start Train Set Datapoints======')
-        print(f'Batch Training: {num}')
-        print(len(df_train))
+        # Shuffle the combined training data
+        df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
 
+        print('====== Start Training Model on Combined Data ======')
         predictor = ScorePredictor()
-        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200)
+        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+        
+        # Train the model on the combined training data
         predictor.warm_start_training(df_train[selected_features], df_train['Target'])
-        predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+        print(f'Training complete on {len(df_train)} samples.')
+
+        # Evaluate the model on the overall test dataset
+        if all_test_data:
+            overall_test_data = pd.concat(all_test_data, ignore_index=True)
+            print('====== Evaluating on Overall Test Dataset ======')
+            overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target'])
+            print(f'Overall Evaluation Metrics: {overall_evaluation_data}')
+
+        # Evaluate the model for each ticker separately
+        for ticker, test_data in df_test_dict.items():
+            try:
+                print(f"Fine-tuning the model for {ticker}")
+                predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
+
+                print(f"Evaluating model for {ticker}")
+                data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
+
+                # Check if the evaluation data meets the criteria
+                '''
+                if (data['precision'] >= 50 and data['accuracy'] >= 50 and
+                        data['accuracy'] < 100 and data['precision'] < 100 and
+                        data['f1_score'] >= 50 and data['recall_score'] >= 50 and
+                        data['roc_auc_score'] >= 50):
+                '''
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
+            except Exception as e:
+                print(e)
+                pass
 
     
 async def warm_start_training(tickers, con, skip_downloading):
     
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=50)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
 
-async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading):
-    try:
-        df = await download_data(ticker,con, start_date, end_date, skip_downloading)
-        if df is None or len(df) == 0:
-            print(f"No data available for {ticker}")
-            return
-        
-        test_size = 0.2
-        split_size = int(len(df) * (1-test_size))
-        train_data = df.iloc[:split_size]
-        test_data = df.iloc[split_size:]
-        
-        selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
-        # Fine-tune the model
-        predictor = ScorePredictor()
-        predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
-        
-        print(f"Evaluating fine-tuned model for {ticker}")
-        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-        
-        if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100 and data['f1_score'] >= 50 and data['recall_score'] >= 50 and data['roc_auc_score'] >= 50:
-                await save_json(ticker, data)
-                print(f"Saved results for {ticker}")
-        gc.collect()
-    except Exception as e:
-        print(f"Error processing {ticker}: {e}")
-    finally:
-        # Ensure any remaining cleanup if necessary
-        if 'predictor' in locals():
-            del predictor  # Explicitly delete the predictor to aid garbage collection
 
 async def run():
     train_mode = True  # Set this to False for fine-tuning and evaluation
@@ -346,22 +352,18 @@ async def run():
     
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
-        warm_start_symbols = [row[0] for row in cursor.fetchall()]
+        cursor.execute("""
+            SELECT DISTINCT symbol 
+            FROM stocks 
+            WHERE marketCap >= 500E6 
+              AND symbol NOT LIKE '%.%' 
+              AND symbol NOT LIKE '%-%' 
+            ORDER BY marketCap DESC;
+        """)
+        warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
+
         print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
         await warm_start_training(warm_start_symbols, con, skip_downloading)
-    else:
-        # Fine-tuning and evaluation for all stocks
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
-        
-        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
-        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-        end_date = datetime.today().strftime("%Y-%m-%d")
-
-        tasks = []
-        for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading)
             
     con.close()
 
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 129f98c6f0e93d83d6b7dd0092a13f5d18278f53..bfb68adbd9cd285da2a09a5da4f9d6dcdb86c545 100644
GIT binary patch
delta 735
zcmZXS&ui2`6vub6yGgg3WFu^A{ekUPySCc3R;%j|x{>uzSE-6ESOf`SGf9KlBur+7
zu3-NGFJ-`k=OTDekY0P}MZudFFLP5-?9r<iotGAD^+7(F`M#O&dwFDiczsxlG_5})
z;>!!<y??8{lm`8g+nJ2_r2J;?bB~(asz~}7W8;H#1fg;EEqe<Mu<6{dRz)PZ=K9p4
z#10cTA{{a+v{w>22`RKh#>IOF6QB&l1ji(5<0OGQerP#1wWk@)-$kd{yZqJ0BwVNg
z&H$zWX8~!k!u~?!9Ax&?U=8_@d<n!0?4kS<&9FPl;Njy!;JGNqA(vyn{V2dZ7euy0
zs%%vm8cTasmUbHA%F6Bnx%-?ql_6xZPs-*h`1g$Y$icyg>xZ~TyCHTa`p66vo&`Wf
z(g2u%D&QKR3Ahf})3w<05;$4lWOcx~CBy_K-;WHWqocp|tcrTa7TVE<QD{n)10v;#
zN5l`)bJO^4dgs&lYLc5QDNYs3BE4Mz^s(2)QPg4|i}&Ta5X86F)XS*JJhh^y6OhFO
za6-vEd#-L>d?3#70$nCJS&jmS=O5ca7ysL}jzPBRZqxVJGi|MY3hdKaq=n{Sdjk*y
z{(g75g5QGZ0%f+SR}WgSl>k`*o)5d7c!f-|HT@|%!HiOU45rF+)bC(&1<t|U|4dm>
Hs-x^5_Y|`-

delta 735
zcmZXSOKa3n6vulrnWWQ6G9%itKClm+GObQ)tISZ9W-OIbTd7dNg2)&qH)$}F#G6c^
zBiJwCrX27S6v2gpblZh41l_rDlbwi<S-N$j_hbsTdLX~tbN-yiJ;~1C&Y<e6YF`hx
z?;FH^`&xZ26guy@Yd=2rYT<J`(mTD}>kRemeh^L(G7eu;OQZl(;;+TCT!>Z6iRuwE
zywLJllbz$S7XlkYiL9}4{_f&DM*0Ir1Likq7-Ej&)lDNZrXfn-CS`b+zB)UB36qG6
zh$+N0B59U6+=yL9<v<UZl}<=A7_7p5=_jecid;BZ<bhc91M0<6;4~h(G!|XoFj)z<
z<bh%`tGw17D3&+(55ziDc~2f7HTWd&ZC&Bz+WA|)N!=mK@n|_}dDI;1Wfg3!Bd{Z@
zA+94zh&jYO;s)Zt)ZC48SgDSc>P@VClLteEqmSf?Mo$0MQVMCGS#9{c`OKU!e3VNz
zf4Ke7Z7Z>Fx_v3JFFIR>FguwoaJiG=U`OC(c7!a!$L!tif=W<UX2?9)%CMGfK;j#q
zVZ>J8newJ`k0-^MXr0k;-FMA6y<xa5`tQ^l#jp|W54{Rc)$Qs9?3e5!`4Wrh;&s>~
z#NT@-C;Tm##?cRVw9(@a&=n9VjyUbLn!zTUfNkvw8HIeVTCAaX4Mj0>nl#D%uPC^=
HDiQwxz(};4

diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 5b5b729..e0bc768 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -13,9 +13,8 @@ import asyncio
 import aiohttp
 import aiofiles
 import pickle
-import os
 import time
-
+import os
 
 
 class ScorePredictor:
@@ -24,10 +23,10 @@ class ScorePredictor:
         self.pca = PCA(n_components=0.95)  # Retain components explaining 95% variance
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
         self.model = lgb.LGBMClassifier(
-            n_estimators=200,           # Number of boosting iterations - good balance between performance and training time
+            n_estimators=20_000,           # Number of boosting iterations - good balance between performance and training time
             learning_rate=0.005,         # Smaller learning rate for better generalization
-            max_depth=8,                 # Controlled depth to prevent overfitting
-            num_leaves=32,            # 2^max_depth, prevents overfitting while maintaining model complexity
+            max_depth=12,                 # Controlled depth to prevent overfitting
+            num_leaves=2**12,            # 2^max_depth, prevents overfitting while maintaining model complexity
             colsample_bytree=0.8,       # Use 80% of features per tree to reduce overfitting
             subsample=0.8,              # Use 80% of data per tree to reduce overfitting
             min_child_samples=20,       # Minimum samples per leaf to ensure reliable splits
diff --git a/app/utils/__pycache__/feature_engineering.cpython-310.pyc b/app/utils/__pycache__/feature_engineering.cpython-310.pyc
index 6ef30aa95316658e09d1ef1730e01c390a04c78d..842219bcf283b2cf07a29bb239b0f832477a6b62 100644
GIT binary patch
delta 21
bcmZ2ry1<ktpO=@50SL-F7}GcMOpyWrI86l{

delta 21
bcmZ2ry1<ktpO=@50SNAj|4-Y<GersjK2-(|