137 lines
6.0 KiB
Python
137 lines
6.0 KiB
Python
import pandas as pd
|
|
from datetime import datetime, timedelta
|
|
import numpy as np
|
|
from xgboost import XGBClassifier
|
|
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.decomposition import PCA
|
|
import lightgbm as lgb
|
|
|
|
from tqdm import tqdm
|
|
from collections import defaultdict
|
|
import asyncio
|
|
import aiohttp
|
|
import aiofiles
|
|
import pickle
|
|
import time
|
|
|
|
|
|
class ScorePredictor:
|
|
def __init__(self):
|
|
self.scaler = MinMaxScaler()
|
|
self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
|
|
self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
|
|
self.model = lgb.LGBMClassifier(
|
|
n_estimators=1000, # Number of boosting iterations - good balance between performance and training time
|
|
learning_rate=0.005, # Smaller learning rate for better generalization
|
|
max_depth=8, # Controlled depth to prevent overfitting
|
|
num_leaves=31, # 2^5-1, prevents overfitting while maintaining model complexity
|
|
colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
|
|
subsample=0.8, # Use 80% of data per tree to reduce overfitting
|
|
min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
|
|
random_state=42, # For reproducibility
|
|
class_weight='balanced', # Important for potentially imbalanced stock data
|
|
reg_alpha=0.1, # L1 regularization
|
|
reg_lambda=0.1, # L2 regularization
|
|
n_jobs=-1, # Use all CPU cores
|
|
verbose=-1, # Reduce output noise
|
|
warm_start= True,
|
|
)
|
|
'''
|
|
XGBClassifier(
|
|
n_estimators=200,
|
|
max_depth=5,
|
|
learning_rate=0.1,
|
|
random_state=42,
|
|
n_jobs=10
|
|
)
|
|
'''
|
|
|
|
def preprocess_train_data(self, X):
|
|
"""Preprocess training data by scaling and applying PCA."""
|
|
X = np.where(np.isinf(X), np.nan, X)
|
|
X = np.nan_to_num(X)
|
|
X = self.scaler.fit_transform(X) # Transform using the fitted scaler
|
|
return self.pca.fit_transform(X) # Fit PCA and transform
|
|
|
|
def preprocess_test_data(self, X):
|
|
"""Preprocess test data by scaling and applying PCA."""
|
|
X = np.where(np.isinf(X), np.nan, X)
|
|
X = np.nan_to_num(X)
|
|
X = self.scaler.transform(X) # Transform using the fitted scaler
|
|
return self.pca.transform(X) # Transform using the fitted PCA
|
|
|
|
def warm_start_training(self, X_train, y_train):
|
|
X_train = self.preprocess_train_data(X_train)
|
|
self.model.fit(X_train, y_train)
|
|
pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
|
|
print("Warm start model saved.")
|
|
|
|
def batch_train_model(self, X_train, y_train, batch_size=1000):
|
|
"""Train the model in batches to handle large datasets."""
|
|
num_samples = len(X_train)
|
|
for start_idx in range(0, num_samples, batch_size):
|
|
end_idx = min(start_idx + batch_size, num_samples)
|
|
X_batch = X_train[start_idx:end_idx]
|
|
y_batch = y_train[start_idx:end_idx]
|
|
|
|
# Preprocess each batch
|
|
X_batch = self.preprocess_train_data(X_batch)
|
|
|
|
# Fit model on each batch (incremental training with warm_start=True)
|
|
self.model.fit(X_batch, y_batch, eval_set=[(X_batch, y_batch)])
|
|
|
|
print(f"Trained on batch {start_idx} to {end_idx}")
|
|
|
|
# After batch training, save the model
|
|
pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
|
|
print("Batch learning completed and model saved.")
|
|
|
|
def fine_tune_model(self, X_train, y_train):
|
|
X_train = self.preprocess_train_data(X_train)
|
|
with open(f'{self.warm_start_model_path}', 'rb') as f:
|
|
self.model = pickle.load(f)
|
|
|
|
self.model.fit(X_train, y_train)
|
|
print("Model fine-tuned")
|
|
|
|
|
|
def evaluate_model(self, X_test, y_test):
|
|
X_test = self.preprocess_test_data(X_test)
|
|
|
|
test_predictions = self.model.predict_proba(X_test)
|
|
class_1_probabilities = test_predictions[:, 1]
|
|
binary_predictions = (class_1_probabilities >= 0.5).astype(int)
|
|
#print(test_predictions)
|
|
test_precision = precision_score(y_test, binary_predictions)
|
|
test_accuracy = accuracy_score(y_test, binary_predictions)
|
|
test_f1_score = f1_score(y_test, binary_predictions)
|
|
test_recall_score = recall_score(y_test, binary_predictions)
|
|
test_roc_auc_score = roc_auc_score(y_test, binary_predictions)
|
|
|
|
print("Test Set Metrics:")
|
|
print(f"Precision: {round(test_precision * 100)}%")
|
|
print(f"Accuracy: {round(test_accuracy * 100)}%")
|
|
print(f"F1 Score: {round(test_f1_score * 100)}%")
|
|
print(f"Recall Score: {round(test_recall_score * 100)}%")
|
|
print(f"ROC AUC Score: {round(test_roc_auc_score * 100)}%")
|
|
|
|
print(pd.DataFrame({'y_test': y_test, 'y_pred': binary_predictions}))
|
|
thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0]
|
|
scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
|
|
|
last_prediction_prob = class_1_probabilities[-1]
|
|
score = None
|
|
print(f"Last prediction probability: {last_prediction_prob}")
|
|
|
|
for threshold, value in zip(thresholds, scores):
|
|
if last_prediction_prob >= threshold:
|
|
score = value
|
|
break
|
|
|
|
return {'accuracy': round(test_accuracy * 100),
|
|
'precision': round(test_precision * 100),
|
|
'f1_score': round(test_f1_score * 100),
|
|
'recall_score': round(test_recall_score * 100),
|
|
'roc_auc_score': round(test_roc_auc_score * 100),
|
|
'score': score} |