add warm start to model
commit d0b5cd5aaa
parent 8872b5d1c7
@@ -38,23 +38,6 @@ def calculate_fdi(high, low, close, window=30):
     return (2 - n1) * 100
 
 
-def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
-    # Ensure the target column is not in the exclude list
-    exclude_columns = [col for col in exclude_columns if col != target_column]
-
-    # Select columns to consider for correlation
-    columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
-
-    # Calculate the correlation matrix
-    correlation_matrix = df[columns_to_consider + [target_column]].corr()
-
-    # Get correlations with the target column, excluding the target column itself
-    target_correlations = correlation_matrix[target_column].drop(target_column)
-
-    # Sort by absolute correlation value and select top N
-    top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
-
-    return top_correlated
-
-
 async def download_data(ticker, con, start_date, end_date):
@@ -63,14 +46,13 @@ async def download_data(ticker, con, start_date, end_date):
     statements = [
         f"json/financial-statements/ratios/quarter/{ticker}.json",
         f"json/financial-statements/key-metrics/quarter/{ticker}.json",
-        #f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
-        #f"json/financial-statements/income-statement/quarter/{ticker}.json",
-        #f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
+        f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
+        f"json/financial-statements/income-statement/quarter/{ticker}.json",
+        f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
         f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
         f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
         f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
-        #f"json/financial-statements/key-metrics/quarter/{ticker}.json",
-        #f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
+        f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
     ]
 
     # Helper function to load JSON data asynchronously
@@ -91,35 +73,42 @@ async def download_data(ticker, con, start_date, end_date):
     ratios = await load_json_from_file(statements[0])
     ratios = await filter_data(ratios, ignore_keys)
 
+    #Threshold of enough datapoints needed!
+    if len(ratios) < 50:
+        return
+
     key_metrics = await load_json_from_file(statements[1])
     key_metrics = await filter_data(key_metrics, ignore_keys)
 
-    '''
-    cashflow = await load_json_from_file(statements[1])
+    cashflow = await load_json_from_file(statements[2])
     cashflow = await filter_data(cashflow, ignore_keys)
 
-    income = await load_json_from_file(statements[2])
+    income = await load_json_from_file(statements[3])
     income = await filter_data(income, ignore_keys)
 
-    balance = await load_json_from_file(statements[3])
+    balance = await load_json_from_file(statements[4])
     balance = await filter_data(balance, ignore_keys)
-    '''
 
-    income_growth = await load_json_from_file(statements[2])
+    income_growth = await load_json_from_file(statements[5])
     income_growth = await filter_data(income_growth, ignore_keys)
 
-    balance_growth = await load_json_from_file(statements[3])
+    balance_growth = await load_json_from_file(statements[6])
     balance_growth = await filter_data(balance_growth, ignore_keys)
 
-    cashflow_growth = await load_json_from_file(statements[4])
+    cashflow_growth = await load_json_from_file(statements[7])
     cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
 
+    owner_earnings = await load_json_from_file(statements[8])
+    owner_earnings = await filter_data(owner_earnings, ignore_keys)
+
     # Combine all the data
     combined_data = defaultdict(dict)
 
     # Merge the data based on 'date'
-    for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
+    for entries in zip(ratios, key_metrics, cashflow, income, balance, income_growth, balance_growth, cashflow_growth, owner_earnings):
         for entry in entries:
             date = entry['date']
             for key, value in entry.items():
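The merge loop at the end of this hunk is cut off by the hunk boundary, but the pattern is a plain date-keyed merge into a defaultdict. Below is a minimal standalone sketch of that pattern, with made-up field names and an assumed loop body that copies every non-date field (the real body is not shown in the diff):

from collections import defaultdict

# Toy stand-ins for the per-statement record lists loaded above.
ratios = [{'date': '2024-03-31', 'currentRatio': 1.2}]
key_metrics = [{'date': '2024-03-31', 'peRatio': 25.0}]

combined_data = defaultdict(dict)
# zip stops at the shortest list, so only dates present in every source survive this pairing.
for entries in zip(ratios, key_metrics):
    for entry in entries:
        date = entry['date']
        for key, value in entry.items():
            if key != 'date':
                # Assumed merge rule: last writer wins on duplicate keys.
                combined_data[date][key] = value

print(dict(combined_data))  # {'2024-03-31': {'currentRatio': 1.2, 'peRatio': 25.0}}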
@@ -321,29 +310,6 @@ async def download_data(ticker, con, start_date, end_date):
         pass
 
 
-async def process_symbol(ticker, con, start_date, end_date):
-    try:
-        test_size = 0.2
-        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-        end_date = datetime.today().strftime("%Y-%m-%d")
-        predictor = ScorePredictor()
-        df = await download_data(ticker, con, start_date, end_date)
-        split_size = int(len(df) * (1-test_size))
-        test_data = df.iloc[split_size:]
-        selected_features = [col for col in df.columns if col not in ['date','price','Target']]
-
-        print(f"For the Ticker: {ticker}")
-        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-
-        if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50:
-                res = {'score': data['score']}
-                await save_json(ticker, res)
-
-    except Exception as e:
-        print(e)
-
-
 async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
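chunked_gather itself is unchanged here; only the per-ticker driver above it was removed, and the warm-start path further down still calls it. For reference, a self-contained sketch of the chunk-and-gather idea, with a dummy worker standing in for download_data and the database/date arguments dropped:

import asyncio

def chunks(lst, size):
    # Yield successive fixed-size slices of lst.
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

async def fake_download(ticker):
    await asyncio.sleep(0)  # placeholder for the real async I/O
    return ticker

async def chunked_gather_demo(tickers, chunk_size=10):
    results = []
    for chunk in chunks(tickers, chunk_size):
        # At most chunk_size downloads are awaited concurrently.
        results.extend(await asyncio.gather(*(fake_download(t) for t in chunk)))
    return results

print(asyncio.run(chunked_gather_demo([f"T{i}" for i in range(25)], chunk_size=10)))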
@@ -362,97 +328,84 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
 
     return results
 
-#Train mode
-async def train_process(tickers, con):
-    tickers = list(set(tickers))
-    df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
-    test_size = 0.2
+async def warm_start_training(tickers, con):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
-    df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
 
     dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
 
-    train_list = []
-    test_list = []
-
-    for df in dfs:
-        try:
-            split_size = int(len(df) * (1 - test_size))
-            train_data = df.iloc[:split_size]
-            test_data = df.iloc[split_size:]
-
-            # Append to the lists
-            train_list.append(train_data)
-            test_list.append(test_data)
-        except:
-            pass
-
-    # Concatenate all at once outside the loop
-    df_train = pd.concat(train_list, ignore_index=True)
-    df_test = pd.concat(test_list, ignore_index=True)
-
-    best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
-
-    df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
-    top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
-    print(top_correlated)
-    #print(df_train)
-    print('======Train Set Datapoints======')
+    df_train = pd.concat(dfs, ignore_index=True)
+    df_train = df_train.sample(frac=1).reset_index(drop=True)
+
+    print('======Warm Start Train Set Datapoints======')
     print(len(df_train))
 
     predictor = ScorePredictor()
-    #print(selected_features)
-    selected_features = [col for col in df_train if col not in ['price','date','Target']]
-    #best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=15)
-    #print(best_features)
-    predictor.train_model(df_train[selected_features], df_train['Target'])
-    predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+    selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+    predictor.warm_start_training(df_train[selected_features], df_train['Target'])
+
+    return predictor
+
+
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
+    try:
+        df = await download_data(ticker, con, start_date, end_date)
+        if df is None or len(df) == 0:
+            print(f"No data available for {ticker}")
+            return
+
+        test_size = 0.2
+        split_size = int(len(df) * (1-test_size))
+        train_data = df.iloc[:split_size]
+        test_data = df.iloc[split_size:]
+
+        selected_features = [col for col in df.columns if col not in ['date','price','Target']]
+        # Fine-tune the model
+        predictor = ScorePredictor()
+        predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
+
+        print(f"Evaluating fine-tuned model for {ticker}")
+        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
+
+        if len(data) != 0:
+            if data['precision'] >= 50 and data['accuracy'] >= 50:
+                res = {'score': data['score']}
+                await save_json(ticker, res)
+                print(f"Saved results for {ticker}")
+
+    except Exception as e:
+        print(f"Error processing {ticker}: {e}")
+
 
 async def run():
-    train_mode = True
+    train_mode = True # Set this to False for fine-tuning and evaluation
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
 
     if train_mode:
-        #Train first model
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
-        print('Number of Stocks')
-        print(len(stock_symbols))
-        await train_process(stock_symbols, con)
-
-
-    #Prediction Steps for all stock symbols
-    if not train_mode:
+        # Warm start training
+        warm_start_symbols = ['META', 'NFLX','GOOG','TSLA','AWR','AMD','NVDA']
+        print('Warm Start Training for:', warm_start_symbols)
+        predictor = await warm_start_training(warm_start_symbols, con)
+    else:
+        # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
-        total_symbols = stock_symbols
-        print(f"Total tickers: {len(total_symbols)}")
+        stock_symbols = ['NVDA'] #[row[0] for row in cursor.fetchall()]
+        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
-        chunk_size = len(total_symbols) // 100 # Divide the list into N chunks
-        chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
-        for chunk in chunks:
-            tasks = []
-            for ticker in tqdm(chunk):
-                tasks.append(process_symbol(ticker, con, start_date, end_date))
-
-            await asyncio.gather(*tasks)
-
+
+        tasks = []
+        for ticker in tqdm(stock_symbols):
+            tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
+
+        await asyncio.gather(*tasks)
 
     con.close()
 
-try:
-    asyncio.run(run())
-except Exception as e:
-    print(e)
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(run())
+    except Exception as e:
+        print(f"Main execution error: {e}")
Binary file not shown.
@@ -56,96 +56,72 @@ class SelfAttention(Layer):
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
-        self.model = self.build_model()
+        self.model = None
+        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
 
     def build_model(self):
        clear_session()
 
-        # Input layer
-        inputs = Input(shape=(139,))
-
-        # First dense layer
-        x = Dense(128, activation='elu')(inputs)
+        inputs = Input(shape=(335,))
+        x = Dense(512, activation='elu')(inputs)
         x = Dropout(0.2)(x)
         x = BatchNormalization()(x)
 
-        # Additional dense layers
-        for units in [64,32]:
+        for units in [64, 32]:
             x = Dense(units, activation='elu')(x)
             x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
 
-        # Reshape for attention mechanism
         x = Reshape((32, 1))(x)
 
-        # Attention mechanism
-        #attention = Dense(32, activation='elu')(x)
-        #attention = Dense(1, activation='softmax')(attention)
-
-        # Apply attention
-        #x = Multiply()([x, attention])
-
         x, _ = SelfAttention()(x)
+        outputs = Dense(2, activation='softmax')(x)
 
-        # Global average pooling
-        #x = GlobalAveragePooling1D()(x)
-
-        # Output layer (for class probabilities)
-        outputs = Dense(2, activation='softmax')(x) # Two neurons for class probabilities with softmax
-
-        # Create the model
         model = Model(inputs=inputs, outputs=outputs)
 
-        # Optimizer with a lower learning rate
         optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
 
-        # Compile the model
         model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
 
         return model
 
     def preprocess_data(self, X):
-        # X = X.applymap(lambda x: 9999 if x == 0 else x) # Replace 0 with 9999 as suggested in the paper
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
         X = self.scaler.fit_transform(X)
         return X
 
-    def reshape_for_lstm(self, X):
-        return X.reshape((X.shape[0], X.shape[1], 1))
-
-    def train_model(self, X_train, y_train):
+    def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        #X_train = self.reshape_for_lstm(X_train)
+        self.model = self.build_model()
 
-        checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras',
-                                     save_best_only=True, save_freq = 1,
-                                     monitor='val_loss', mode='min')
+        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
         early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
         reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.001)
 
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
-                       validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save('ml_models/weights/ai-score/weights.keras')
+        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
+        self.model.save(self.warm_start_model_path)
+        print("Warm start model saved.")
+
+    def fine_tune_model(self, X_train, y_train):
+        X_train = self.preprocess_data(X_train)
+
+        if self.model is None:
+            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
+
+        early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.0001)
+
+        self.model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping, reduce_lr])
+        print("Model fine-tuned (not saved).")
 
     def evaluate_model(self, X_test, y_test):
-        # Preprocess the test data
         X_test = self.preprocess_data(X_test)
-        #X_test = self.reshape_for_lstm(X_test)
 
-        # Load the trained model
-        self.model = load_model('ml_models/weights/ai-score/weights.keras')
+        if self.model is None:
+            raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")
 
-        # Get the model's predictions
         test_predictions = self.model.predict(X_test)
-        print(test_predictions)
 
-        # Extract the probabilities for class 1 (index 1 in the softmax output)
         class_1_probabilities = test_predictions[:, 1]
-        # Convert probabilities to binary predictions using a threshold of 0.5
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
+        print(test_predictions)
 
-        # Calculate precision and accuracy using binary predictions
         test_precision = precision_score(y_test, binary_predictions)
         test_accuracy = accuracy_score(y_test, binary_predictions)
 
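The warm-start mechanic added in this hunk boils down to: train once, save the whole model, then reload it (registering any custom layers) and keep fitting so the weights continue from the saved state instead of being reinitialized. A minimal, self-contained Keras sketch of that idea, with a toy model, a throwaway file path, and random data standing in for ScorePredictor and the financial features:

import numpy as np
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model, load_model

path = "warm_start_demo.keras"  # stand-in for warm_start_model_path

# Phase 1: "warm start" training on a broad sample, then persist the full model.
X_warm, y_warm = np.random.rand(256, 8), np.random.randint(0, 2, 256)
inputs = Input(shape=(8,))
outputs = Dense(2, activation="softmax")(Dense(16, activation="relu")(inputs))
model = Model(inputs, outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X_warm, y_warm, epochs=2, batch_size=32, verbose=0)
model.save(path)

# Phase 2: fine-tune on new data, starting from the saved weights.
# (The class above additionally passes custom_objects={'SelfAttention': SelfAttention} when reloading.)
X_fine, y_fine = np.random.rand(64, 8), np.random.randint(0, 2, 64)
fine_tuned = load_model(path)
fine_tuned.fit(X_fine, y_fine, epochs=2, batch_size=16, verbose=0)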
@@ -153,36 +129,29 @@ class ScorePredictor:
         print(f"Precision: {round(test_precision * 100)}%")
         print(f"Accuracy: {round(test_accuracy * 100)}%")
 
-        # Define thresholds and corresponding scores
         thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
         scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
 
-        # Get the last prediction value (class 1 probability) for scoring
         last_prediction_prob = class_1_probabilities[-1]
 
-        # Initialize score to 0 (or any default value)
         score = 0
-        print(last_prediction_prob)
-        # Determine the score based on the last prediction probability
+        print(f"Last prediction probability: {last_prediction_prob}")
         for threshold, value in zip(thresholds, scores):
             if last_prediction_prob >= threshold:
                 score = value
-                break # Exit the loop once the score is determined
+                break
 
-        # Return the evaluation results
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
                 'score': score}
 
 
     def feature_selection(self, X_train, y_train, k=100):
-        print('feature selection:')
-        print(X_train.shape, y_train.shape)
+        print('Feature selection:')
+        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
         selector = SelectKBest(score_func=f_classif, k=k)
         selector.fit(X_train, y_train)
 
         selector.transform(X_train)
         selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
 
         return selected_features
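feature_selection wraps scikit-learn's SelectKBest; a tiny standalone example of the same call pattern on a made-up frame (column names here are illustrative only):

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

X_train = pd.DataFrame({
    'featA': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    'featB': [0, 1, 0, 1, 0, 1],
    'featC': [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
})
y_train = [0, 0, 0, 1, 1, 1]

selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X_train, y_train)

# Keep the column names whose ANOVA F-score ranks in the top k.
selected_features = [col for col, keep in zip(X_train.columns, selector.get_support()) if keep]
print(selected_features)  # expected: ['featA', 'featC']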