import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import lightgbm as lgb
from tqdm import tqdm
from collections import defaultdict
import asyncio
import aiohttp
import aiofiles
import pickle
import time


class ScorePredictor:
    def __init__(self):
        self.scaler = MinMaxScaler()
        self.pca = PCA(n_components=5)
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = lgb.LGBMClassifier(
            # For a larger model we've found 20_000 trees, a learning rate of
            # 0.001, max_depth=6 and num_leaves=2**6-1 to work better.
            n_estimators=20_000,
            learning_rate=0.01,
            max_depth=20,
            num_leaves=2**6 - 1,
            colsample_bytree=0.1
        )
        # Alternative model kept for reference:
        '''
        XGBClassifier(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            n_jobs=10
        )
        '''

    def preprocess_train_data(self, X):
        """Clean and scale training data (PCA step currently disabled)."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
        X = self.scaler.fit_transform(X)  # Fit the scaler on training data
        return X  # self.pca.fit_transform(X)  # Fit PCA and transform

    def preprocess_test_data(self, X):
        """Clean and scale test data with the already-fitted scaler (PCA step currently disabled)."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
        X = self.scaler.transform(X)  # Transform using the fitted scaler
        return X  # self.pca.transform(X)  # Transform using the fitted PCA

    def warm_start_training(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
        self.model.fit(X_train, y_train)
        with open(self.warm_start_model_path, 'wb') as f:
            pickle.dump(self.model, f)
        print("Warm start model saved.")

    def fine_tune_model(self, X_train, y_train):
        X_train = self.preprocess_train_data(X_train)
        with open(self.warm_start_model_path, 'rb') as f:
            self.model = pickle.load(f)
        # Note: calling fit() again retrains the loaded model from scratch on
        # X_train; continuing boosting from the warm-start weights would require
        # passing them via LightGBM's init_model argument.
        self.model.fit(X_train, y_train)
        print("Model fine-tuned")

    def evaluate_model(self, X_test, y_test):
        X_test = self.preprocess_test_data(X_test)
        test_predictions = self.model.predict_proba(X_test)
        class_1_probabilities = test_predictions[:, 1]
        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
        # print(test_predictions)

        test_precision = precision_score(y_test, binary_predictions)
        test_accuracy = accuracy_score(y_test, binary_predictions)
        test_f1_score = f1_score(y_test, binary_predictions)
        test_recall_score = recall_score(y_test, binary_predictions)
        # ROC AUC is computed on the predicted probabilities rather than the
        # thresholded labels so it actually measures ranking quality.
        test_roc_auc_score = roc_auc_score(y_test, class_1_probabilities)

        print("Test Set Metrics:")
        print(f"Precision: {round(test_precision * 100)}%")
        print(f"Accuracy: {round(test_accuracy * 100)}%")
        print(f"F1 Score: {round(test_f1_score * 100)}%")
        print(f"Recall Score: {round(test_recall_score * 100)}%")
        print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%")
        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))

        # Map the most recent prediction's probability to a 1-10 score.
        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
        last_prediction_prob = class_1_probabilities[-1]
        score = None
        print(f"Last prediction probability: {last_prediction_prob}")
        for threshold, value in zip(thresholds, scores):
            if last_prediction_prob >= threshold:
                score = value
                break

        return {
            'accuracy': round(test_accuracy * 100),
            'precision': round(test_precision * 100),
            'f1_score': round(test_f1_score * 100),
            'recall_score': round(test_recall_score * 100),
            'roc_auc_score': round(test_roc_auc_score * 100),
            'score': score,
        }
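

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes numeric feature matrices and binary labels; the synthetic data below
# is only a stand-in for the real feature pipeline, and the weights directory
# is created here solely so warm_start_training() can pickle the model. With
# n_estimators=20_000 even this toy run will take a while.
if __name__ == '__main__':
    import os

    os.makedirs('ml_models/weights/ai-score', exist_ok=True)

    rng = np.random.default_rng(42)
    X_train = rng.normal(size=(500, 20))
    y_train = (X_train[:, 0] + rng.normal(scale=0.5, size=500) > 0).astype(int)
    X_test = rng.normal(size=(100, 20))
    y_test = (X_test[:, 0] + rng.normal(scale=0.5, size=100) > 0).astype(int)

    predictor = ScorePredictor()
    predictor.warm_start_training(X_train, y_train)  # fit and pickle the warm-start model
    predictor.fine_tune_model(X_train, y_train)      # reload the pickle and refit
    results = predictor.evaluate_model(X_test, y_test)
    print(results)  # e.g. {'accuracy': ..., 'precision': ..., ..., 'score': ...}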