diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index f039203..55b1986 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -22,7 +22,7 @@ import gc
 gc.enable()
 
 async def save_json(symbol, data):
-    with open(f"json/ai-score/{symbol}.json", 'wb') as file:
+    with open(f"json/ai-score/companies/{symbol}.json", 'wb') as file:
         file.write(orjson.dumps(data))
 
@@ -317,23 +317,35 @@ async def process_symbol(ticker, con, start_date, end_date):
         split_size = int(len(df) * (1-test_size))
         test_data = df.iloc[split_size:]
         best_features = [col for col in df.columns if col not in ['date','price','Target']]
-        data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target'])
-
-        print(data)
-        '''
-        output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target}
-               for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])]
-        '''
-        #print(output_list)
+        data = predictor.evaluate_model(test_data[best_features], test_data['Target'])
 
         if len(data) != 0:
             if data['precision'] >= 50 and data['accuracy'] >= 50:
-                await save_json(ticker, data)
+                res = {'score': data['score']}
+                await save_json(ticker, res)
 
     except Exception as e:
         print(e)
 
+async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
+    # Helper function to divide the tickers into chunks
+    def chunks(lst, size):
+        for i in range(0, len(lst), size):
+            yield lst[i:i+size]
+
+    results = []
+
+    for chunk in chunks(tickers, chunk_size):
+        # Create tasks for each chunk
+        tasks = [download_data(ticker, con, start_date, end_date) for ticker in chunk]
+        # Await the results for the current chunk
+        chunk_results = await asyncio.gather(*tasks)
+        # Accumulate the results
+        results.extend(chunk_results)
+
+    return results
+
 #Train mode
 async def train_process(tickers, con):
     tickers = list(set(tickers))
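Note on chunked_gather above: replacing one big asyncio.gather over every ticker with fixed-size chunks caps how many download_data coroutines are in flight at once, which bounds memory use and database/API pressure during training. A minimal sketch of the same idea using asyncio.Semaphore, for comparison (download_data, con, and the date arguments are the names from this patch; the semaphore variant itself is an assumption, not part of the change):

    import asyncio

    async def bounded_gather(tickers, con, start_date, end_date, limit=10):
        # At most `limit` downloads run concurrently; unlike fixed chunks,
        # one slow ticker does not hold up the next batch from starting.
        sem = asyncio.Semaphore(limit)

        async def fetch_one(ticker):
            async with sem:
                return await download_data(ticker, con, start_date, end_date)

        return await asyncio.gather(*(fetch_one(t) for t in tickers))

The chunked version trades a little throughput (each chunk waits for its slowest ticker) for simpler, more predictable batching.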
@@ -345,8 +357,8 @@ async def train_process(tickers, con):
     df_train = pd.DataFrame()
     df_test = pd.DataFrame()
 
-    tasks = [download_data(ticker, con, start_date, end_date) for ticker in tickers]
-    dfs = await asyncio.gather(*tasks)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
+
     for df in dfs:
         try:
             split_size = int(len(df) * (1-test_size))
@@ -373,17 +385,6 @@ async def train_process(tickers, con):
     predictor.train_model(df_train[selected_features], df_train['Target'])
     predictor.evaluate_model(df_test[best_features], df_test['Target'])
 
-async def test_process(con):
-    test_size = 0.2
-    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-    end_date = datetime.today().strftime("%Y-%m-%d")
-    predictor = ScorePredictor()
-    df = await download_data('GME', con, start_date, end_date)
-    split_size = int(len(df) * (1-test_size))
-    test_data = df.iloc[split_size:]
-    selected_features = [col for col in test_data if col not in ['price','date','Target']]
-    predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-
 
 async def run():
@@ -393,21 +394,22 @@ async def run():
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
 
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'")
-    stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()]
+    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
+    stock_symbols = [row[0] for row in cursor.fetchall()] #['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR']
     stock_symbols = list(set(stock_symbols))
 
     print('Number of Stocks')
     print(len(stock_symbols))
-    #await train_process(stock_symbols, con)
+    await train_process(stock_symbols, con)
+
+
+    #Prediction Steps for all stock symbols
 
-
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
-    stock_symbols = [row[0] for row in cursor.fetchall()]
+    #cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
+    #stock_symbols = [row[0] for row in cursor.fetchall()]
+    total_symbols = stock_symbols
 
-    total_symbols = ['GME'] #stock_symbols
-
     print(f"Total tickers: {len(total_symbols)}")
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 0525535..bc1b977 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index ced08bb..6161b9c 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -41,12 +41,12 @@ class ScorePredictor:
         inputs = Input(shape=(2139,))
 
         # First dense layer
-        x = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
+        x = Dense(2048, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
         x = Dropout(0.3)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
-        for units in [512, 256, 256]:
+        for units in [1024, 512, 256, 256]:
             x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
             x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
@@ -64,17 +64,17 @@ class ScorePredictor:
         # Global average pooling
         x = GlobalAveragePooling1D()(x)
 
-        # Output layer
-        outputs = Dense(1, activation='sigmoid')(x)
+        # Output layer (for class probabilities)
+        outputs = Dense(2, activation='softmax')(x)  # Two neurons for class probabilities with softmax
 
         # Create the model
         model = Model(inputs=inputs, outputs=outputs)
 
         # Optimizer with a lower learning rate
-        optimizer = Adam(learning_rate=0.1, clipnorm=1.0)
+        optimizer = Adam(learning_rate=0.001, clipnorm=1.0)
 
         # Compile the model
-        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
+        model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
 
         return model
 
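Note on the head change above: for binary targets, Dense(2, activation='softmax') with sparse_categorical_crossentropy is mathematically equivalent to the old Dense(1, activation='sigmoid') with binary_crossentropy; the softmax head simply exposes both class probabilities, which the new evaluate_model reads via test_predictions[:, 1]. The learning-rate drop from 0.1 to 0.001 is arguably the more consequential fix, since 0.1 is far too high for Adam and the code's pre-existing "lower learning rate" comment was previously wrong. A standalone sketch of the new head (the single 64-unit hidden layer is a placeholder, not the model's real dense/conv stack):

    from tensorflow.keras import Input, Model
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.optimizers import Adam

    inputs = Input(shape=(2139,))                     # feature width used by ScorePredictor
    hidden = Dense(64, activation='relu')(inputs)     # placeholder for the real layer stack
    outputs = Dense(2, activation='softmax')(hidden)  # per-class probabilities

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.001, clipnorm=1.0),
                  loss='sparse_categorical_crossentropy',  # expects integer labels {0, 1}
                  metrics=['accuracy'])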
@@ -92,38 +92,63 @@ class ScorePredictor:
         X_train = self.preprocess_data(X_train)
         #X_train = self.reshape_for_lstm(X_train)
 
-        checkpoint = ModelCheckpoint('ml_models/weights/fundamental_weights/weights.keras',
+        checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras',
                                      save_best_only=True, save_freq = 1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=60, min_lr=0.00001)
+        early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=80, min_lr=0.00001)
 
         self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save('ml_models/weights/fundamental_weights/weights.keras')
+        self.model.save('ml_models/weights/ai-score/weights.keras')
 
     def evaluate_model(self, X_test, y_test):
+        # Preprocess the test data
         X_test = self.preprocess_data(X_test)
-        X_test = self.reshape_for_lstm(X_test)
+        #X_test = self.reshape_for_lstm(X_test)
 
-        self.model = load_model('ml_models/weights/fundamental_weights/weights.keras')
+        # Load the trained model
+        self.model = load_model('ml_models/weights/ai-score/weights.keras')
 
-        test_predictions = self.model.predict(X_test).flatten()
+        # Get the model's predictions
+        test_predictions = self.model.predict(X_test)
+        #print(test_predictions)
+
+        # Extract the probabilities for class 1 (index 1 in the softmax output)
+        class_1_probabilities = test_predictions[:, 1]
+
+        # Convert probabilities to binary predictions using a threshold of 0.5
+        binary_predictions = (class_1_probabilities >= 0.5).astype(int)
 
-        test_predictions[test_predictions >= 0.5] = 1
-        test_predictions[test_predictions < 0.5] = 0
-
-        test_precision = precision_score(y_test, test_predictions)
-        test_accuracy = accuracy_score(y_test, test_predictions)
+        # Calculate precision and accuracy using binary predictions
+        test_precision = precision_score(y_test, binary_predictions)
+        test_accuracy = accuracy_score(y_test, binary_predictions)
 
         print("Test Set Metrics:")
         print(f"Precision: {round(test_precision * 100)}%")
         print(f"Accuracy: {round(test_accuracy * 100)}%")
 
-        next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0
+        # Define thresholds and corresponding scores
+        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
+        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+        # Get the last prediction value (class 1 probability) for scoring
+        last_prediction_prob = class_1_probabilities[-1]
+
+        # Initialize score to 0 (or any default value)
+        score = 0
+        #print(last_prediction_prob)
+        # Determine the score based on the last prediction probability
+        for threshold, value in zip(thresholds, scores):
+            if last_prediction_prob >= threshold:
+                score = value
+                break  # Exit the loop once the score is determined
+
+        # Return the evaluation results
         return {'accuracy': round(test_accuracy * 100),
                 'precision': round(test_precision * 100),
-                'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions
+                'score': score}
+
+
 
     def feature_selection(self, X_train, y_train, k=100):
         print('feature selection:')
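For reference, the probability-to-score loop added to evaluate_model reduces to this pure function (a sketch mirroring the patch; probability_to_score is an illustrative name, not an API exported by score_model.py):

    def probability_to_score(prob: float) -> int:
        # The first (highest) threshold that the class-1 probability reaches
        # wins; anything below 0.2 falls through to the default score of 0.
        thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
        scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
        for threshold, score in zip(thresholds, scores):
            if prob >= threshold:
                return score
        return 0

    assert probability_to_score(0.82) == 10  # >= 0.8
    assert probability_to_score(0.55) == 6   # first match is the 0.5 threshold
    assert probability_to_score(0.10) == 0   # below every threshold

Because process_symbol in cron_ai_score.py only persists {'score': ...} when both precision and accuracy reach 50%, tickers with weak test metrics simply get no json/ai-score/companies/<symbol>.json file.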