backend/app/ml_models/score_model.py

import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import lightgbm as lgb

from tqdm import tqdm
from collections import defaultdict
import asyncio
import aiohttp
import aiofiles
import pickle
import time


class ScorePredictor:
    def __init__(self):
        self.scaler = MinMaxScaler()
        self.pca = PCA(n_components=0.95)  # Retain components explaining 95% variance
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = lgb.LGBMClassifier(
            n_estimators=1000,           # Number of boosting iterations - good balance between performance and training time
            learning_rate=0.005,         # Smaller learning rate for better generalization
            max_depth=8,                 # Controlled depth to prevent overfitting
            num_leaves=31,              # 2^5-1, prevents overfitting while maintaining model complexity
            colsample_bytree=0.8,       # Use 80% of features per tree to reduce overfitting
            subsample=0.8,              # Use 80% of data per tree to reduce overfitting
            min_child_samples=20,       # Minimum samples per leaf to ensure reliable splits
            random_state=42,            # For reproducibility
            class_weight='balanced',    # Important for potentially imbalanced stock data
            reg_alpha=0.1,             # L1 regularization
            reg_lambda=0.1,            # L2 regularization
            n_jobs=-1,                 # Use all CPU cores
            verbose=-1,                # Reduce output noise
            warm_start= True,
        )
        '''
        XGBClassifier(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            n_jobs=10
        )
        '''

    def preprocess_train_data(self, X):
        """Preprocess training data by scaling and applying PCA."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
        X = self.scaler.fit_transform(X)  # Transform using the fitted scaler
        return self.pca.fit_transform(X)  # Fit PCA and transform

    def preprocess_test_data(self, X):
        """Preprocess test data by scaling and applying PCA."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
        X = self.scaler.transform(X)  # Transform using the fitted scaler
        return self.pca.transform(X)  # Transform using the fitted PCA

    def warm_start_training(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
        self.model.fit(X_train, y_train)
        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
        print("Warm start model saved.")

    def batch_train_model(self, X_train, y_train, batch_size=1000):
        """Train the model in batches to handle large datasets."""
        num_samples = len(X_train)
        for start_idx in range(0, num_samples, batch_size):
            end_idx = min(start_idx + batch_size, num_samples)
            X_batch = X_train[start_idx:end_idx]
            y_batch = y_train[start_idx:end_idx]

            # Preprocess each batch
            X_batch = self.preprocess_train_data(X_batch)

            # Fit model on each batch (incremental training with warm_start=True)
            self.model.fit(X_batch, y_batch, eval_set=[(X_batch, y_batch)])

            print(f"Trained on batch {start_idx} to {end_idx}")

        # After batch training, save the model
        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
        print("Batch learning completed and model saved.")

    def fine_tune_model(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
        with open(f'{self.warm_start_model_path}', 'rb') as f:
            self.model = pickle.load(f)

        self.model.fit(X_train, y_train)
        print("Model fine-tuned")


    def evaluate_model(self, X_test, y_test):
        X_test = self.preprocess_test_data(X_test)

        test_predictions = self.model.predict_proba(X_test)
        class_1_probabilities = test_predictions[:, 1]
        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
        #print(test_predictions)
        test_precision = precision_score(y_test, binary_predictions)
        test_accuracy = accuracy_score(y_test, binary_predictions)
        test_f1_score = f1_score(y_test, binary_predictions)
        test_recall_score = recall_score(y_test, binary_predictions)
        test_roc_auc_score = roc_auc_score(y_test, binary_predictions)

        print("Test Set Metrics:")
        print(f"Precision: {round(test_precision * 100)}%")
        print(f"Accuracy: {round(test_accuracy * 100)}%")
        print(f"F1 Score: {round(test_f1_score * 100)}%")
        print(f"Recall Score: {round(test_recall_score * 100)}%")
        print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%")

        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

        last_prediction_prob = class_1_probabilities[-1]
        score = None
        print(f"Last prediction probability: {last_prediction_prob}")

        for threshold, value in zip(thresholds, scores):
            if last_prediction_prob >= threshold:
                score = value
                break

        return {'accuracy': round(test_accuracy * 100),
                'precision': round(test_precision * 100),
                'f1_score': round(test_f1_score * 100),
                'recall_score': round(test_recall_score * 100),
                'roc_auc_score': round(test_roc_auc_score * 100),
                'score': score}