switch from nn to xgb
This commit is contained in:
parent
699594cb64
commit
75b9cdc2b1
@@ -332,19 +332,39 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
async def warm_start_training(tickers, con):
    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
    end_date = datetime.today().strftime("%Y-%m-%d")

    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    test_size = 0.2

    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)

    df_train = pd.concat(dfs, ignore_index=True)
    df_train = df_train.sample(frac=1).reset_index(drop=True)

    train_list = []
    test_list = []

    for df in dfs:
        try:
            split_size = int(len(df) * (1 - test_size))
            train_data = df.iloc[:split_size]
            test_data = df.iloc[split_size:]

            # Append to the lists
            train_list.append(train_data)
            test_list.append(test_data)
        except:
            pass

    # Concatenate all at once outside the loop
    df_train = pd.concat(train_list, ignore_index=True)
    df_test = pd.concat(test_list, ignore_index=True)

    print('======Warm Start Train Set Datapoints======')
    df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
    print(len(df_train))

    predictor = ScorePredictor()
    selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
    predictor.warm_start_training(df_train[selected_features], df_train['Target'])
    predictor.evaluate_model(df_train[selected_features], df_train['Target'])
    predictor.evaluate_model(df_test[selected_features], df_test['Target'])

    return predictor
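
The per-ticker loop above keeps the first 80% of each frame's rows for training and holds out the final 20%, so the test block is the most recent slice of each ticker's history rather than a random sample (assuming each frame is date-ordered, as the 'date' column in the feature selection suggests). A minimal standalone sketch of that split, with hypothetical helper names:

def chronological_split(df, test_size=0.2):
    # keep the earliest (1 - test_size) fraction for training,
    # hold out the most recent rows; no shuffling before the cut
    split_size = int(len(df) * (1 - test_size))
    return df.iloc[:split_size], df.iloc[split_size:]

train_parts, test_parts = [], []
for df in dfs:  # dfs: one feature frame per ticker, as returned by chunked_gather
    train_df, test_df = chronological_split(df)
    train_parts.append(train_df)
    test_parts.append(test_df)
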
@@ -369,7 +389,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
    data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])

    if len(data) != 0:
        if data['precision'] >= 50 and data['accuracy'] >= 50:
        if data['precision'] >= 60 and data['accuracy'] >= 60 and data['accuracy'] < 100 and data['precision'] < 100:
            res = {'score': data['score']}
            await save_json(ticker, res)
            print(f"Saved results for {ticker}")
@@ -389,23 +409,23 @@ async def run():

    if train_mode:
        # Warm start training
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
        warm_start_symbols = [row[0] for row in cursor.fetchall()]
        print('Warm Start Training for:', warm_start_symbols)
        predictor = await warm_start_training(warm_start_symbols, con)
    else:
        # Fine-tuning and evaluation for all stocks
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
        stock_symbols = [row[0] for row in cursor.fetchall()]
        stock_symbols = ['GME'] #[row[0] for row in cursor.fetchall()]

        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
        end_date = datetime.today().strftime("%Y-%m-%d")
        tasks = []
        for ticker in tqdm(stock_symbols):
            tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
            await fine_tune_and_evaluate(ticker, con, start_date, end_date)

        await asyncio.gather(*tasks)
        #await asyncio.gather(*tasks)

    con.close()
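
With this change the loop awaits each fine_tune_and_evaluate call inline and the asyncio.gather call is commented out, so tickers are processed strictly one at a time. If some parallelism were wanted without launching every task at once, a bounded-concurrency variant could look like the sketch below; run_bounded and limit are placeholder names, and this is purely illustrative, not part of the commit:

import asyncio

async def run_bounded(stock_symbols, con, start_date, end_date, limit=5):
    # allow at most `limit` fine-tune jobs in flight at any time
    sem = asyncio.Semaphore(limit)

    async def guarded(ticker):
        async with sem:
            await fine_tune_and_evaluate(ticker, con, start_date, end_date)

    await asyncio.gather(*(guarded(t) for t in stock_symbols))
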
Binary file not shown.
@@ -5,18 +5,7 @@ from sklearn.ensemble import RandomForestClassifier
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential, Model
from keras.layers import Input, Multiply, Reshape, LSTM, Dense, Dropout, BatchNormalization, GlobalAveragePooling1D, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model
from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow.keras.backend import clear_session
from keras import regularizers
from keras.layers import Layer
from tensorflow.keras import backend as K
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
from collections import defaultdict
@@ -26,62 +15,11 @@ import aiofiles
import pickle
import time

class SelfAttention(Layer):
    def __init__(self, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        super(SelfAttention, self).build(input_shape)

    def call(self, x):
        # Alignment scores. Pass them through tanh function
        e = K.tanh(K.dot(x, self.W))
        # Remove dimension of size 1
        e = K.squeeze(e, axis=-1)
        # Compute the weights
        alpha = K.softmax(e)
        # Reshape to tensor of same shape as x for multiplication
        alpha = K.expand_dims(alpha, axis=-1)
        # Compute the context vector
        context = x * alpha
        context = K.sum(context, axis=1)
        return context, alpha

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])


class ScorePredictor:
    def __init__(self):
        self.scaler = MinMaxScaler()
        self.model = None
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'

    def build_model(self):
        clear_session()

        inputs = Input(shape=(231,))

        x = Dense(128, activation='leaky_relu')(inputs)
        x = BatchNormalization()(x)
        x = Dropout(0.2)(x)

        for units in [64,32,16]:
            x = Dense(units, activation='leaky_relu')(x)
            x = BatchNormalization()(x)
            x = Dropout(0.2)(x)

        x = Reshape((16, 1))(x)
        x, _ = SelfAttention()(x)
        outputs = Dense(2, activation='softmax')(x)

        model = Model(inputs=inputs, outputs=outputs)
        optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = XGBClassifier(n_estimators=100, max_depth = 10, min_samples_split=5, random_state=42, n_jobs=10)

    def preprocess_data(self, X):
        X = np.where(np.isinf(X), np.nan, X)
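
One detail in the new constructor: min_samples_split is a scikit-learn tree-ensemble argument, not an XGBoost one; XGBClassifier forwards unrecognised keyword arguments to the booster, which typically just warns that the parameter is unused. A hedged sketch of an equivalent setup using XGBoost's own complexity controls (the values are illustrative, not taken from the commit):

from xgboost import XGBClassifier

# min_child_weight and subsample are XGBoost-native regularisers;
# the numbers below are placeholders, not part of this commit
model = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    min_child_weight=5,
    subsample=0.8,
    random_state=42,
    n_jobs=10,
)
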
@@ -91,37 +29,24 @@ class ScorePredictor:

    def warm_start_training(self, X_train, y_train):
        X_train = self.preprocess_data(X_train)
        self.model = self.build_model()

        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=30, min_lr=0.001)

        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
        self.model.save(self.warm_start_model_path)
        self.model.fit(X_train, y_train)
        pickle.dump(self.model, open(f'{self.warm_start_model_path}', 'wb'))
        print("Warm start model saved.")

    def fine_tune_model(self, X_train, y_train):
        X_train = self.preprocess_data(X_train)
        #batch_size = min(64, max(16, len(X_train) // 10))

        if self.model is None:
            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})

            #early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
            #reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.01)
            with open(f'{self.warm_start_model_path}', 'rb') as f:
                self.model = pickle.load(f)

        self.model.fit(X_train, y_train, epochs=150, batch_size=16, validation_split=0.1)
        self.model.fit(X_train, y_train)
        print("Model fine-tuned")


    def evaluate_model(self, X_test, y_test):
        X_test = self.preprocess_data(X_test)

        if self.model is None:
            raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")

        test_predictions = self.model.predict(X_test)
        test_predictions = self.model.predict_proba(X_test)
        class_1_probabilities = test_predictions[:, 1]
        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
        #print(test_predictions)
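
Worth noting for the fine-tune path: XGBClassifier.fit starts a fresh boosting run each time, so the plain self.model.fit(X_train, y_train) above retrains on the per-ticker data rather than continuing from the warm-start trees. If continuing from the saved model is the intent, xgboost's scikit-learn wrapper accepts a previous model through the xgb_model argument; a sketch under that assumption, where X_train and y_train stand in for the per-ticker features and Target column:

import pickle
from xgboost import XGBClassifier

with open('ml_models/weights/ai-score/warm_start_weights.pkl', 'rb') as f:
    warm_model = pickle.load(f)

# X_train, y_train: per-ticker feature frame and Target column (placeholders here)
fine_tuned = XGBClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=10)
# xgb_model continues boosting from the warm-start trees instead of discarding them;
# this is an option, not what the commit's fine_tune_model does
fine_tuned.fit(X_train, y_train, xgb_model=warm_model.get_booster())
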
@@ -146,15 +71,4 @@ class ScorePredictor:

        return {'accuracy': round(test_accuracy * 100),
                'precision': round(test_precision * 100),
                'score': score}

    def feature_selection(self, X_train, y_train, k=100):
        print('Feature selection:')
        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X_train, y_train)

        selector.transform(X_train)
        selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]

        return selected_features
                'score': score}
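
Taken together, the class after this commit reduces to a plain fit, pickle, refit, evaluate cycle. A hypothetical end-to-end use, where X_warm, y_warm, X_new, y_new, X_test and y_test stand in for the feature frames and Target columns built in the training script:

predictor = ScorePredictor()
predictor.warm_start_training(X_warm, y_warm)       # fit the XGBClassifier and pickle it to warm_start_model_path
predictor.fine_tune_model(X_new, y_new)             # reload the pickle if no model is in memory, then refit
metrics = predictor.evaluate_model(X_test, y_test)  # {'accuracy': ..., 'precision': ..., 'score': ...}
if metrics and metrics['precision'] >= 60 and metrics['accuracy'] >= 60:
    print(metrics['score'])                         # same gate fine_tune_and_evaluate applies before saving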