diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 21b6d1e..41d13f5 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -38,23 +38,6 @@ def calculate_fdi(high, low, close, window=30):
     return (2 - n1) * 100
 
 
-def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
-    # Ensure the target column is not in the exclude list
-    exclude_columns = [col for col in exclude_columns if col != target_column]
-
-    # Select columns to consider for correlation
-    columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
-
-    # Calculate the correlation matrix
-    correlation_matrix = df[columns_to_consider + [target_column]].corr()
-
-    # Get correlations with the target column, excluding the target column itself
-    target_correlations = correlation_matrix[target_column].drop(target_column)
-
-    # Sort by absolute correlation value and select top N
-    top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
-
-    return top_correlated
 
 
 async def download_data(ticker, con, start_date, end_date):
@@ -63,14 +46,13 @@ async def download_data(ticker, con, start_date, end_date):
     statements = [
         f"json/financial-statements/ratios/quarter/{ticker}.json",
         f"json/financial-statements/key-metrics/quarter/{ticker}.json",
-        #f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
-        #f"json/financial-statements/income-statement/quarter/{ticker}.json",
-        #f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
+        f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
+        f"json/financial-statements/income-statement/quarter/{ticker}.json",
+        f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
         f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
         f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
         f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
-        #f"json/financial-statements/key-metrics/quarter/{ticker}.json",
-        #f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
+        f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
    ]
 
     # Helper function to load JSON data asynchronously
@@ -91,35 +73,42 @@ async def download_data(ticker, con, start_date, end_date):
     ratios = await load_json_from_file(statements[0])
     ratios = await filter_data(ratios, ignore_keys)
 
+    # Require a minimum number of datapoints before scoring this ticker
+    if len(ratios) < 50:
+        return
+
     key_metrics = await load_json_from_file(statements[1])
     key_metrics = await filter_data(key_metrics, ignore_keys)
-    '''
-    cashflow = await load_json_from_file(statements[1])
+
+    cashflow = await load_json_from_file(statements[2])
     cashflow = await filter_data(cashflow, ignore_keys)
 
-    income = await load_json_from_file(statements[2])
+    income = await load_json_from_file(statements[3])
     income = await filter_data(income, ignore_keys)
 
-    balance = await load_json_from_file(statements[3])
+    balance = await load_json_from_file(statements[4])
     balance = await filter_data(balance, ignore_keys)
-    '''
-    income_growth = await load_json_from_file(statements[2])
+
+    income_growth = await load_json_from_file(statements[5])
     income_growth = await filter_data(income_growth, ignore_keys)
 
-    balance_growth = await load_json_from_file(statements[3])
+    balance_growth = await load_json_from_file(statements[6])
     balance_growth = await filter_data(balance_growth, ignore_keys)
 
-    cashflow_growth = await load_json_from_file(statements[4])
+    cashflow_growth = await load_json_from_file(statements[7])
     cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
 
+    owner_earnings = await load_json_from_file(statements[8])
+    owner_earnings = await filter_data(owner_earnings, ignore_keys)
+
     # Combine all the data
     combined_data = defaultdict(dict)
 
     # Merge the data based on 'date'
-    for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
+    for entries in zip(ratios, key_metrics, cashflow, income, balance, income_growth, balance_growth, cashflow_growth, owner_earnings):
         for entry in entries:
             date = entry['date']
             for key, value in entry.items():
@@ -321,29 +310,6 @@ async def download_data(ticker, con, start_date, end_date):
         pass
 
 
-async def process_symbol(ticker, con, start_date, end_date):
-    try:
-        test_size = 0.2
-        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-        end_date = datetime.today().strftime("%Y-%m-%d")
-        predictor = ScorePredictor()
-        df = await download_data(ticker, con, start_date, end_date)
-        split_size = int(len(df) * (1-test_size))
-        test_data = df.iloc[split_size:]
-        selected_features = [col for col in df.columns if col not in ['date','price','Target']]
-
-        print(f"For the Ticker: {ticker}")
-        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
-
-        if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50:
-                res = {'score': data['score']}
-                await save_json(ticker, res)
-
-    except Exception as e:
-        print(e)
-
-
 async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
@@ -362,97 +328,84 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
     return results
 
 
-#Train mode
-async def train_process(tickers, con):
-    tickers = list(set(tickers))
-    df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
-    test_size = 0.2
+
+
+async def warm_start_training(tickers, con):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
-    df_train = pd.DataFrame()
-    df_test = pd.DataFrame()
-
+
     dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
-
-    train_list = []
-    test_list = []
-
-    for df in dfs:
-        try:
-            split_size = int(len(df) * (1 - test_size))
-            train_data = df.iloc[:split_size]
-            test_data = df.iloc[split_size:]
-
-            # Append to the lists
-            train_list.append(train_data)
-            test_list.append(test_data)
-        except:
-            pass
-
-    # Concatenate all at once outside the loop
-    df_train = pd.concat(train_list, ignore_index=True)
-    df_test = pd.concat(test_list, ignore_index=True)
-
-
-    best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
-
-    df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
-    top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
-    print(top_correlated)
-    #print(df_train)
-    print('======Train Set Datapoints======')
+
+    df_train = pd.concat(dfs, ignore_index=True)
+    df_train = df_train.sample(frac=1).reset_index(drop=True)
+
+    print('======Warm Start Train Set Datapoints======')
     print(len(df_train))
-
+
     predictor = ScorePredictor()
-    #print(selected_features)
-    selected_features = [col for col in df_train if col not in ['price','date','Target']]
-    #best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=15)
-    #print(best_features)
-    predictor.train_model(df_train[selected_features], df_train['Target'])
-    predictor.evaluate_model(df_test[selected_features], df_test['Target'])
+    selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
+    predictor.warm_start_training(df_train[selected_features], df_train['Target'])
+
+    return predictor
+
+
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
+    try:
+        df = await download_data(ticker, con, start_date, end_date)
+        if df is None or len(df) == 0:
+            print(f"No data available for {ticker}")
+            return
+
+        test_size = 0.2
+        split_size = int(len(df) * (1 - test_size))
+        train_data = df.iloc[:split_size]
+        test_data = df.iloc[split_size:]
+
+        selected_features = [col for col in df.columns if col not in ['date','price','Target']]
+
+        # Fine-tune the model
+        predictor = ScorePredictor()
+        predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
+
+        print(f"Evaluating fine-tuned model for {ticker}")
+        data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
+
+        if len(data) != 0:
+            if data['precision'] >= 50 and data['accuracy'] >= 50:
+                res = {'score': data['score']}
+                await save_json(ticker, res)
+                print(f"Saved results for {ticker}")
+
+    except Exception as e:
+        print(f"Error processing {ticker}: {e}")
 
 
 async def run():
-
-    train_mode = True
+    train_mode = True  # Set this to False for fine-tuning and evaluation
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
 
     if train_mode:
-        #Train first model
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
-        print('Number of Stocks')
-        print(len(stock_symbols))
-        await train_process(stock_symbols, con)
-
-
-    #Prediction Steps for all stock symbols
-    if not train_mode:
+        # Warm start training
+        warm_start_symbols = ['META', 'NFLX', 'GOOG', 'TSLA', 'AWR', 'AMD', 'NVDA']
+        print('Warm Start Training for:', warm_start_symbols)
+        predictor = await warm_start_training(warm_start_symbols, con)
+    else:
+        # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
-        total_symbols = stock_symbols
-
-        print(f"Total tickers: {len(total_symbols)}")
+        stock_symbols = [row[0] for row in cursor.fetchall()]
+
+        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
 
         start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
         end_date = datetime.today().strftime("%Y-%m-%d")
-
-        chunk_size = len(total_symbols) // 100 # Divide the list into N chunks
-        chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
-        for chunk in chunks:
-            tasks = []
-            for ticker in tqdm(chunk):
-                tasks.append(process_symbol(ticker, con, start_date, end_date))
-
-            await asyncio.gather(*tasks)
+        tasks = []
+        for ticker in tqdm(stock_symbols):
+            tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
+
+        await asyncio.gather(*tasks)
 
     con.close()
 
-
-try:
-    asyncio.run(run())
-except Exception as e:
-    print(e)
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(run())
+    except Exception as e:
+        print(f"Main execution error: {e}")
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 4655804..cc99f0d 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index b2f47e7..a68d750 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -56,96 +56,72 @@ class SelfAttention(Layer):
 class ScorePredictor:
     def __init__(self):
         self.scaler = MinMaxScaler()
-        self.model = self.build_model()
+        self.model = None
+        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
 
     def build_model(self):
         clear_session()
-        # Input layer
-        inputs = Input(shape=(139,))
-
-        # First dense layer
-        x = Dense(128, activation='elu')(inputs)
+        inputs = Input(shape=(335,))
+        x = Dense(512, activation='elu')(inputs)
         x = Dropout(0.2)(x)
         x = BatchNormalization()(x)
 
-        # Additional dense layers
-        for units in [64,32]:
+        for units in [64, 32]:
             x = Dense(units, activation='elu')(x)
             x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
 
-        # Reshape for attention mechanism
         x = Reshape((32, 1))(x)
-
-        # Attention mechanism
-        #attention = Dense(32, activation='elu')(x)
-        #attention = Dense(1, activation='softmax')(attention)
-
-        # Apply attention
-        #x = Multiply()([x, attention])
-
         x, _ = SelfAttention()(x)
-
-        # Global average pooling
-        #x = GlobalAveragePooling1D()(x)
+        outputs = Dense(2, activation='softmax')(x)
 
-        # Output layer (for class probabilities)
-        outputs = Dense(2, activation='softmax')(x) # Two neurons for class probabilities with softmax
-
-        # Create the model
         model = Model(inputs=inputs, outputs=outputs)
-
-        # Optimizer with a lower learning rate
         optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
-
-        # Compile the model
         model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
 
         return model
 
     def preprocess_data(self, X):
-        # X = X.applymap(lambda x: 9999 if x == 0 else x) # Replace 0 with 9999 as suggested in the paper
         X = np.where(np.isinf(X), np.nan, X)
         X = np.nan_to_num(X)
         X = self.scaler.fit_transform(X)
         return X
 
-    def reshape_for_lstm(self, X):
-        return X.reshape((X.shape[0], X.shape[1], 1))
-
-    def train_model(self, X_train, y_train):
+    def warm_start_training(self, X_train, y_train):
         X_train = self.preprocess_data(X_train)
-        #X_train = self.reshape_for_lstm(X_train)
+        self.model = self.build_model()
 
-        checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras',
-                                     save_best_only=True, save_freq = 1,
-                                     monitor='val_loss', mode='min')
+        checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq='epoch',
+                                     monitor='val_loss', mode='min')
         early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
         reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.001)
 
-        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
-            validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
-        self.model.save('ml_models/weights/ai-score/weights.keras')
+        self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
+        self.model.save(self.warm_start_model_path)
+        print("Warm start model saved.")
+
+    def fine_tune_model(self, X_train, y_train):
+        X_train = self.preprocess_data(X_train)
+
+        if self.model is None:
+            self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
+
+        early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.0001)
+
+        self.model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping, reduce_lr])
+        print("Model fine-tuned (not saved).")
 
     def evaluate_model(self, X_test, y_test):
-        # Preprocess the test data
         X_test = self.preprocess_data(X_test)
-        #X_test = self.reshape_for_lstm(X_test)
 
-        # Load the trained model
-        self.model = load_model('ml_models/weights/ai-score/weights.keras')
+        if self.model is None:
+            raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")
 
-        # Get the model's predictions
         test_predictions = self.model.predict(X_test)
-        print(test_predictions)
-
-        # Extract the probabilities for class 1 (index 1 in the softmax output)
         class_1_probabilities = test_predictions[:, 1]
 
-        # Convert probabilities to binary predictions using a threshold of 0.5
         binary_predictions = (class_1_probabilities >= 0.5).astype(int)
-
-        # Calculate precision and accuracy using binary predictions
+        print(test_predictions)
         test_precision = precision_score(y_test, binary_predictions)
         test_accuracy = accuracy_score(y_test, binary_predictions)
 
@@ -153,36 +129,29 @@ class ScorePredictor:
         print(f"Precision: {round(test_precision * 100)}%")
         print(f"Accuracy: {round(test_accuracy * 100)}%")
 
-        # Define thresholds and corresponding scores
         thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
         scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
 
-        # Get the last prediction value (class 1 probability) for scoring
         last_prediction_prob = class_1_probabilities[-1]
-
-        # Initialize score to 0 (or any default value)
         score = 0
-        print(last_prediction_prob)
 
-        # Determine the score based on the last prediction probability
+        print(f"Last prediction probability: {last_prediction_prob}")
+
         for threshold, value in zip(thresholds, scores):
             if last_prediction_prob >= threshold:
                 score = value
-                break # Exit the loop once the score is determined
+                break
 
-        # Return the evaluation results
         return {'accuracy': round(test_accuracy * 100), 'precision': round(test_precision * 100), 'score': score}
 
-
     def feature_selection(self, X_train, y_train, k=100):
-        print('feature selection:')
-        print(X_train.shape, y_train.shape)
+        print('Feature selection:')
+        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
         selector = SelectKBest(score_func=f_classif, k=k)
         selector.fit(X_train, y_train)
         selector.transform(X_train)
 
         selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]]
-        return selected_features
+        return selected_features
\ No newline at end of file
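
Note on the workflow this patch introduces: cron_ai_score.py now warm-starts one shared model on a small basket of tickers, saves it as warm_start_weights.keras, and later fine-tunes a fresh copy per ticker before scoring. Below is a minimal driver sketch of that call sequence (illustrative only, not part of the patch). fake_features() is a hypothetical stand-in for the per-ticker frames that download_data() assembles, sized to the new Input(shape=(335,)); it assumes the ml_models/weights/ai-score/ directory from the repo exists.

# Illustrative sketch, assuming the patched ScorePredictor above.
import numpy as np
import pandas as pd
from ml_models.score_model import ScorePredictor

rng = np.random.default_rng(0)

def fake_features(rows, cols=335):
    # Hypothetical stand-in for download_data(): 335 columns match Input(shape=(335,))
    X = pd.DataFrame(rng.normal(size=(rows, cols)), columns=[f"f{i}" for i in range(cols)])
    y = pd.Series(rng.integers(0, 2, size=rows), name="Target")
    return X, y

# Stage 1: warm start on pooled data; the checkpoint writes warm_start_weights.keras.
X_pool, y_pool = fake_features(2000)
ScorePredictor().warm_start_training(X_pool, y_pool)

# Stage 2: per-ticker fine-tune and evaluation, mirroring fine_tune_and_evaluate().
X, y = fake_features(200)
split = int(len(X) * 0.8)  # same ordered 80/20 split as the cron job
predictor = ScorePredictor()  # model is None, so fine_tune_model reloads the warm-start weights
predictor.fine_tune_model(X.iloc[:split], y.iloc[:split])
result = predictor.evaluate_model(X.iloc[split:], y.iloc[split:])
print(result)  # {'accuracy': ..., 'precision': ..., 'score': ...}

Because fine_tune_model never saves, each ticker adapts from the shared warm-start weights rather than compounding earlier tickers' updates.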