diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 55b1986..815e102 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -316,7 +316,9 @@ async def process_symbol(ticker, con, start_date, end_date):
     df = await download_data(ticker, con, start_date, end_date)
     split_size = int(len(df) * (1-test_size))
     test_data = df.iloc[split_size:]
-    best_features = [col for col in df.columns if col not in ['date','price','Target']]
+    #selected_features = [col for col in df.columns if col not in ['date','price','Target']]
+    best_features = ['freeCashFlowYield', 'cci', 'daily_return', 'cashAndCashEquivalents_to_cashAndShortTermInvestments', 'longTermDebt_to_totalLiabilitiesAndStockholdersEquity', 'longTermDebt_to_totalAssets', 'totalStockholdersEquity_to_totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity_to_totalAssets']
+
     print(f"For the Ticker: {ticker}")
     data = predictor.evaluate_model(test_data[best_features], test_data['Target'])
     if len(data) != 0:
@@ -359,16 +361,25 @@ async def train_process(tickers, con):
     dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
 
+    train_list = []
+    test_list = []
+
     for df in dfs:
         try:
-            split_size = int(len(df) * (1-test_size))
+            split_size = int(len(df) * (1 - test_size))
             train_data = df.iloc[:split_size]
             test_data = df.iloc[split_size:]
-            df_train = pd.concat([df_train, train_data], ignore_index=True)
-            df_test = pd.concat([df_test, test_data], ignore_index=True)
+
+            # Append to the lists
+            train_list.append(train_data)
+            test_list.append(test_data)
         except:
             pass
 
+    # Concatenate all at once outside the loop
+    df_train = pd.concat(train_list, ignore_index=True)
+    df_test = pd.concat(test_list, ignore_index=True)
+
     best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
@@ -380,49 +391,48 @@ async def train_process(tickers, con):
     predictor = ScorePredictor()
     #print(selected_features)
     selected_features = [col for col in df_train if col not in ['price','date','Target']]
-    #best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=5)
-    #print(best_features)
-    predictor.train_model(df_train[selected_features], df_train['Target'])
+    best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=8)
+    print(best_features)
+    predictor.train_model(df_train[best_features], df_train['Target'])
     predictor.evaluate_model(df_test[best_features], df_test['Target'])
 
 
 async def run():
-    #Train first model
-
+    train_mode = False
     con = sqlite3.connect('stocks.db')
-
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
-    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
-    stock_symbols = [row[0] for row in cursor.fetchall()] #['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR']
-    stock_symbols = list(set(stock_symbols))
-    print('Number of Stocks')
-    print(len(stock_symbols))
-    await train_process(stock_symbols, con)
-
-
+    if train_mode:
+        #Train first model
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 50E9 AND symbol NOT LIKE '%.%'")
+        stock_symbols = [row[0] for row in cursor.fetchall()]
+        print('Number of Stocks')
+        print(len(stock_symbols))
+        await train_process(stock_symbols, con)
 
     #Prediction Steps for all stock symbols
-    #cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9")
-    #stock_symbols = [row[0] for row in cursor.fetchall()]
-    total_symbols = stock_symbols
+    if not train_mode:
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
+        stock_symbols = [row[0] for row in cursor.fetchall()]
+        total_symbols = stock_symbols
 
-    print(f"Total tickers: {len(total_symbols)}")
-    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
-    end_date = datetime.today().strftime("%Y-%m-%d")
+        print(f"Total tickers: {len(total_symbols)}")
+        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
+        end_date = datetime.today().strftime("%Y-%m-%d")
 
-    chunk_size = len(total_symbols)# // 100 # Divide the list into N chunks
-    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
-    for chunk in chunks:
-        tasks = []
-        for ticker in tqdm(chunk):
-            tasks.append(process_symbol(ticker, con, start_date, end_date))
+        chunk_size = len(total_symbols)// 100 # Divide the list into N chunks
+        chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
+        for chunk in chunks:
+            tasks = []
+            for ticker in tqdm(chunk):
+                tasks.append(process_symbol(ticker, con, start_date, end_date))
 
-        await asyncio.gather(*tasks)
+            await asyncio.gather(*tasks)
+    con.close()
 
 try:
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index bc1b977..0eedab5 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 6161b9c..27d8753 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -38,24 +38,24 @@ class ScorePredictor:
         clear_session()
 
         # Input layer
-        inputs = Input(shape=(2139,))
+        inputs = Input(shape=(8,))
 
         # First dense layer
-        x = Dense(2048, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
-        x = Dropout(0.3)(x)
+        x = Dense(512, activation='relu')(inputs)
+        x = Dropout(0.5)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
-        for units in [1024,512, 256, 256]:
-            x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
-            x = Dropout(0.2)(x)
+        for units in [256,128]:
+            x = Dense(units, activation='relu')(x)
+            x = Dropout(0.5)(x)
             x = BatchNormalization()(x)
 
         # Reshape for attention mechanism
-        x = Reshape((256, 1))(x)
+        x = Reshape((128, 1))(x)
 
         # Attention mechanism
-        attention = Dense(256, activation='relu')(x)
+        attention = Dense(128, activation='relu')(x)
         attention = Dense(1, activation='softmax')(attention)
 
         # Apply attention
@@ -95,8 +95,8 @@ class ScorePredictor:
 
         checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras', save_best_only=True, save_freq = 1, monitor='val_loss', mode='min')
-        early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
-        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=80, min_lr=0.00001)
+        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
+        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=30, min_lr=0.001)
 
         self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
@@ -136,7 +136,7 @@ class ScorePredictor:
 
         # Initialize score to 0 (or any default value)
         score = 0
-        #print(last_prediction_prob)
+        print(last_prediction_prob)
         # Determine the score based on the last prediction probability
         for threshold, value in zip(thresholds, scores):
             if last_prediction_prob >= threshold: