update model

MuslemRahimi 2024-10-05 22:08:56 +02:00
parent 3e6ef8b540
commit 8521a4a404
4 changed files with 80 additions and 79 deletions

View File

@@ -11,6 +11,7 @@ import pandas as pd
 from tqdm import tqdm
 import concurrent.futures
 import re
+import random
 from itertools import combinations
 from dotenv import load_dotenv
@@ -41,6 +42,8 @@ async def fetch_historical_price(ticker):
                 historical_data = data.get('historical', [])
                 # Convert to DataFrame
                 df = pd.DataFrame(historical_data).reset_index(drop=True)
+                # Reverse the DataFrame so that the past dates are first
+                df = df.sort_values(by='date', ascending=True).reset_index(drop=True)
                 return df
             else:
                 raise Exception(f"Error fetching data: {response.status} {response.reason}")
@@ -82,8 +85,11 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     file_path = f"ml_models/training_data/ai-score/{ticker}.json"
     if os.path.exists(file_path):
-        with open(file_path, 'rb') as file:
-            return pd.DataFrame(orjson.loads(file.read()))
+        try:
+            with open(file_path, 'rb') as file:
+                return pd.DataFrame(orjson.loads(file.read()))
+        except:
+            return pd.DataFrame()
     elif skip_downloading == False:
         try:
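A minimal sketch of the guarded cache read introduced above, assuming orjson and pandas as imported in the script. The ticker in the path is illustrative, and the except clause is narrowed here to parse/IO errors rather than the bare except used in the diff:

```python
import os
import orjson
import pandas as pd

def load_cached_training_data(file_path: str) -> pd.DataFrame:
    # Return the cached frame if the file exists and parses; otherwise an empty frame
    if os.path.exists(file_path):
        try:
            with open(file_path, 'rb') as file:
                return pd.DataFrame(orjson.loads(file.read()))
        except (orjson.JSONDecodeError, OSError):
            return pd.DataFrame()
    return pd.DataFrame()

# Illustrative path following the pattern used in the script
print(load_cached_training_data("ml_models/training_data/ai-score/AAPL.json").shape)
```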
@@ -176,15 +182,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
             item['price'] = round(data['close'], 2)
             # Dynamically add indicator values from ta_columns and stats_columns
-            for column in ta_columns + stats_columns:
+            for column in ta_columns+stats_columns:
                 item[column] = data.get(column, None)
     # Sort the combined data by date
     combined_data = sorted(combined_data, key=lambda x: x['date'])
     # Convert combined data to a DataFrame and drop rows with NaN values
     df_combined = pd.DataFrame(combined_data).dropna()
     fundamental_columns = [
         'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
@@ -248,94 +252,96 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         pass
-async def chunked_gather(tickers, con, skip_downloading, chunk_size=10):
+async def chunked_gather(tickers, con, skip_downloading, chunk_size):
     test_size = 0.2
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
     df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
+    df_test_dict = {}  # Store test data for each ticker
+    all_test_data = []  # Store all test data for overall evaluation
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
         for i in range(0, len(lst), size):
-            yield lst[i:i+size]
+            yield lst[i:i + size]
-    dfs = []
-    for num, chunk in enumerate(tqdm(chunks(tickers, chunk_size))):
+    for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
         tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
-        # Accumulate the results
-        dfs.extend(chunk_results)
         train_list = []
-        test_list = []
-        for df in dfs:
+        for ticker, df in zip(chunk, chunk_results):
             try:
-                # Split the data into training and testing sets
                 split_size = int(len(df) * (1 - test_size))
                 train_data = df.iloc[:split_size]
                 test_data = df.iloc[split_size:]
-                # Append to the lists
+                # Store test data for this ticker in a dictionary
+                df_test_dict[ticker] = test_data
+                # Append train data for combined training
                 train_list.append(train_data)
-                test_list.append(test_data)
+                # Collect all test data for overall evaluation
+                all_test_data.append(test_data)
             except:
                 pass
-        # Concatenate all at once outside the loop
-        df_train = pd.concat(train_list, ignore_index=True)
-        df_test = pd.concat(test_list, ignore_index=True)
-        df_train = df_train.sample(frac=1).reset_index(drop=True)
-        df_test = df_test.sample(frac=1).reset_index(drop=True)
-        print('======Warm Start Train Set Datapoints======')
-        print(f'Batch Training: {num}')
-        print(len(df_train))
+        # Concatenate all train data together
+        if train_list:
+            df_train = pd.concat(train_list, ignore_index=True)
+        # Shuffle the combined training data
+        df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+        print('====== Start Training Model on Combined Data ======')
         predictor = ScorePredictor()
-        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200)
+        selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+        # Train the model on the combined training data
         predictor.warm_start_training(df_train[selected_features], df_train['Target'])
-        predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+        print(f'Training complete on {len(df_train)} samples.')
+        # Evaluate the model on the overall test dataset
+        if all_test_data:
+            overall_test_data = pd.concat(all_test_data, ignore_index=True)
+            print('====== Evaluating on Overall Test Dataset ======')
+            overall_evaluation_data = predictor.evaluate_model(overall_test_data[selected_features], overall_test_data['Target'])
+            print(f'Overall Evaluation Metrics: {overall_evaluation_data}')
+        # Evaluate the model for each ticker separately
+        for ticker, test_data in df_test_dict.items():
+            try:
+                print(f"Fine-tuning the model for {ticker}")
+                predictor.fine_tune_model(df_train[selected_features], df_train['Target'])
+                print(f"Evaluating model for {ticker}")
+                data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
+                # Check if the evaluation data meets the criteria
+                '''
+                if (data['precision'] >= 50 and data['accuracy'] >= 50 and
+                    data['accuracy'] < 100 and data['precision'] < 100 and
+                    data['f1_score'] >= 50 and data['recall_score'] >= 50 and
+                    data['roc_auc_score'] >= 50):
+                '''
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
+            except Exception as e:
+                print(e)
+                pass
 async def warm_start_training(tickers, con, skip_downloading):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=50)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
-async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading):
-    try:
-        df = await download_data(ticker,con, start_date, end_date, skip_downloading)
-        if df is None or len(df) == 0:
-            print(f"No data available for {ticker}")
-            return
-        test_size = 0.2
-        split_size = int(len(df) * (1-test_size))
-        train_data = df.iloc[:split_size]
-        test_data = df.iloc[split_size:]
-        selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
-        # Fine-tune the model
-        predictor = ScorePredictor()
-        predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
-        print(f"Evaluating fine-tuned model for {ticker}")
-        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-        if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100 and data['f1_score'] >= 50 and data['recall_score'] >= 50 and data['roc_auc_score'] >= 50:
-                await save_json(ticker, data)
-                print(f"Saved results for {ticker}")
-        gc.collect()
-    except Exception as e:
-        print(f"Error processing {ticker}: {e}")
-    finally:
-        # Ensure any remaining cleanup if necessary
-        if 'predictor' in locals():
-            del predictor # Explicitly delete the predictor to aid garbage collection
 async def run():
     train_mode = True # Set this to False for fine-tuning and evaluation
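The reworked chunked_gather above drives everything from one loop: download a chunk of tickers concurrently, split each frame chronologically, pool the training portions, and keep a per-ticker test set. A self-contained sketch of that pattern with a dummy downloader follows; the names (fake_download, gather_in_chunks) are illustrative, not the repo's API:

```python
import asyncio
import pandas as pd

async def fake_download(ticker: str) -> pd.DataFrame:
    # Stand-in for the real downloader; returns a tiny date-ordered frame
    return pd.DataFrame({"date": pd.date_range("2024-01-01", periods=10),
                         "Target": range(10)})

def chunks(lst, size):
    # Yield successive slices of the ticker list
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

async def gather_in_chunks(tickers, chunk_size=2, test_size=0.2):
    df_test_dict, train_list = {}, []
    for chunk in chunks(tickers, chunk_size):
        results = await asyncio.gather(*(fake_download(t) for t in chunk))
        for ticker, df in zip(chunk, results):
            split = int(len(df) * (1 - test_size))
            train_list.append(df.iloc[:split])      # pooled training data
            df_test_dict[ticker] = df.iloc[split:]  # per-ticker test data
    df_train = pd.concat(train_list, ignore_index=True)
    return df_train, df_test_dict

df_train, df_test_dict = asyncio.run(gather_in_chunks(["A", "B", "C"]))
print(len(df_train), list(df_test_dict))
```

Splitting each ticker chronologically (earliest rows for training, latest for testing) avoids leaking future prices into the training set, which is why the change also sorts the downloaded history ascending by date.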
@@ -346,22 +352,18 @@ async def run():
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
-        warm_start_symbols = [row[0] for row in cursor.fetchall()]
+        cursor.execute("""
+            SELECT DISTINCT symbol
+            FROM stocks
+            WHERE marketCap >= 500E6
+              AND symbol NOT LIKE '%.%'
+              AND symbol NOT LIKE '%-%'
+            ORDER BY marketCap DESC;
+        """)
+        warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
         print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
         await warm_start_training(warm_start_symbols, con, skip_downloading)
-    else:
-        # Fine-tuning and evaluation for all stocks
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
-        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
-        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-        end_date = datetime.today().strftime("%Y-%m-%d")
-        tasks = []
-        for ticker in tqdm(stock_symbols):
-            await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading)
     con.close()
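The query above selects the warm-start universe ordered by market cap, although the fetched result is currently stubbed to ['A'] in the commit. A small sketch of running the same query on its own, assuming a local SQLite database file (the 'stocks.db' filename and the stocks table schema are assumptions):

```python
import sqlite3

con = sqlite3.connect("stocks.db")  # assumed local database with a 'stocks' table
cursor = con.cursor()
cursor.execute("""
    SELECT DISTINCT symbol
    FROM stocks
    WHERE marketCap >= 500E6
      AND symbol NOT LIKE '%.%'
      AND symbol NOT LIKE '%-%'
    ORDER BY marketCap DESC;
""")
warm_start_symbols = [row[0] for row in cursor.fetchall()]
print(f"Warm Start Training: Total Tickers {len(warm_start_symbols)}")
con.close()
```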

View File

@@ -13,9 +13,8 @@ import asyncio
 import aiohttp
 import aiofiles
 import pickle
-import os
 import time
+import os
 class ScorePredictor:
@@ -24,10 +23,10 @@ class ScorePredictor:
         self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
         self.model = lgb.LGBMClassifier(
-            n_estimators=200, # Number of boosting iterations - good balance between performance and training time
+            n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
             learning_rate=0.005, # Smaller learning rate for better generalization
-            max_depth=8, # Controlled depth to prevent overfitting
-            num_leaves=32, # 2^max_depth, prevents overfitting while maintaining model complexity
+            max_depth=12, # Controlled depth to prevent overfitting
+            num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
             colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
             subsample=0.8, # Use 80% of data per tree to reduce overfitting
             min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
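For context, a standalone sketch of the updated classifier configuration as shown above. With max_depth=12, num_leaves=2**12 (4096) is the maximum leaf count that depth allows, so the depth cap no longer constrains tree size much; the much larger n_estimators=20_000 leans on the small learning rate to keep training controlled. This mirrors the parameters in the diff but is not a drop-in replacement for the class:

```python
import lightgbm as lgb

model = lgb.LGBMClassifier(
    n_estimators=20_000,   # many boosting rounds at a small learning rate
    learning_rate=0.005,
    max_depth=12,
    num_leaves=2**12,      # 4096 leaves: the maximum possible at depth 12
    colsample_bytree=0.8,  # feature subsampling per tree
    subsample=0.8,         # row subsampling per tree
    min_child_samples=20,  # minimum samples per leaf
)
```

One caveat: LightGBM only applies subsample (bagging) when subsample_freq is set above zero, which this configuration does not do, so the 0.8 row fraction has no effect as written.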