backend/app/ml_models/score_model.py

import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

class ScorePredictor:
    """Binary classifier that maps prediction probabilities to a 1-10 AI score."""

    def __init__(self):
        self.scaler = MinMaxScaler()
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = XGBClassifier(
            n_estimators=200,      # increased from 100 due to problem complexity
            max_depth=6,           # kept shallow to prevent overfitting with many features
            learning_rate=0.1,     # controls the step size of each boosting round
            colsample_bytree=0.8,  # randomly sample columns for each tree
            subsample=0.8,         # randomly sample training rows for each tree
            reg_alpha=1,           # L1 regularization to handle many features
            reg_lambda=1,          # L2 regularization to handle many features
            random_state=42,
            n_jobs=10
        )

    def preprocess_data(self, X, fit=True):
        """Replace infs/NaNs and scale features. The scaler is fitted only on
        training data (fit=True); test data is transformed with the training
        statistics (fit=False) to avoid leaking test information."""
        X = np.where(np.isinf(X), np.nan, X)
        X = np.nan_to_num(X)
        return self.scaler.fit_transform(X) if fit else self.scaler.transform(X)

    def warm_start_training(self, X_train, y_train):
        """Train from scratch and persist the fitted model for later fine-tuning."""
        X_train = self.preprocess_data(X_train, fit=True)
        self.model.fit(X_train, y_train)
        os.makedirs(os.path.dirname(self.warm_start_model_path), exist_ok=True)
        with open(self.warm_start_model_path, 'wb') as f:
            pickle.dump(self.model, f)
        print("Warm start model saved.")

    def fine_tune_model(self, X_train, y_train):
        """Continue training from the saved warm-start model. Passing the existing
        booster via xgb_model makes XGBoost add new trees on top of it; a plain
        fit() call would silently retrain from scratch."""
        X_train = self.preprocess_data(X_train, fit=True)
        with open(self.warm_start_model_path, 'rb') as f:
            self.model = pickle.load(f)
        self.model.fit(X_train, y_train, xgb_model=self.model.get_booster())
        print("Model fine-tuned.")

    def evaluate_model(self, X_test, y_test):
        """Report test metrics and map the latest prediction probability to a 1-10 score."""
        X_test = self.preprocess_data(X_test, fit=False)
        class_1_probabilities = self.model.predict_proba(X_test)[:, 1]
        binary_predictions = (class_1_probabilities >= 0.5).astype(int)

        test_precision = precision_score(y_test, binary_predictions)
        test_accuracy = accuracy_score(y_test, binary_predictions)
        print("Test Set Metrics:")
        print(f"Precision: {round(test_precision * 100)}%")
        print(f"Accuracy: {round(test_accuracy * 100)}%")
        print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))

        # Map the most recent probability onto a 1-10 score: the first threshold
        # the probability clears determines the score (e.g. >= 0.8 -> 10).
        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
        last_prediction_prob = class_1_probabilities[-1]
        print(f"Last prediction probability: {last_prediction_prob}")
        score = None
        for threshold, value in zip(thresholds, scores):
            if last_prediction_prob >= threshold:
                score = value
                break

        return {'accuracy': round(test_accuracy * 100),
                'precision': round(test_precision * 100),
                'score': score}
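
# A minimal usage sketch of the workflow above, assuming synthetic data. The
# feature matrix and labels here are hypothetical placeholders: in the real
# pipeline they would come from the project's data-loading code, which is not
# part of this file.
if __name__ == '__main__':
    rng = np.random.default_rng(42)
    X = rng.normal(size=(500, 20))                        # 500 samples, 20 features
    y = (X[:, 0] + rng.normal(size=500) > 0).astype(int)  # noisy binary target
    X_train, X_test = X[:400], X[400:]
    y_train, y_test = y[:400], y[400:]

    predictor = ScorePredictor()
    predictor.warm_start_training(X_train, y_train)  # initial fit, weights saved to disk
    predictor.fine_tune_model(X_train, y_train)      # continue boosting from saved weights
    result = predictor.evaluate_model(X_test, y_test)
    print(result)  # {'accuracy': ..., 'precision': ..., 'score': ...}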