diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 5a3705a..21b6d1e 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -38,11 +38,24 @@ def calculate_fdi(high, low, close, window=30):
     return (2 - n1) * 100
 
-def hurst_exponent(ts, max_lag=100):
-    lags = range(2, max_lag)
-    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
-    poly = np.polyfit(np.log(lags), np.log(tau), 1)
-    return poly[0] * 2.0
+def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
+    # Ensure the target column is not in the exclude list
+    exclude_columns = [col for col in exclude_columns if col != target_column]
+
+    # Select columns to consider for correlation
+    columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
+
+    # Calculate the correlation matrix
+    correlation_matrix = df[columns_to_consider + [target_column]].corr()
+
+    # Get correlations with the target column, excluding the target column itself
+    target_correlations = correlation_matrix[target_column].drop(target_column)
+
+    # Sort by absolute correlation value and select top N
+    top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
+
+    return top_correlated
+
 
 async def download_data(ticker, con, start_date, end_date):
     try:
@@ -53,9 +66,9 @@ async def download_data(ticker, con, start_date, end_date):
             #f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
             #f"json/financial-statements/income-statement/quarter/{ticker}.json",
             #f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
-            #f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
-            #f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
-            #f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
             #f"json/financial-statements/key-metrics/quarter/{ticker}.json",
             #f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
         ]
@@ -90,26 +103,23 @@ async def download_data(ticker, con, start_date, end_date):
         balance = await load_json_from_file(statements[3])
         balance = await filter_data(balance, ignore_keys)
-
-        income_growth = await load_json_from_file(statements[4])
+        '''
+        income_growth = await load_json_from_file(statements[2])
         income_growth = await filter_data(income_growth, ignore_keys)
-        balance_growth = await load_json_from_file(statements[5])
+        balance_growth = await load_json_from_file(statements[3])
         balance_growth = await filter_data(balance_growth, ignore_keys)
-        cashflow_growth = await load_json_from_file(statements[6])
+        cashflow_growth = await load_json_from_file(statements[4])
         cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
-        owner_earnings = await load_json_from_file(statements[7])
-        owner_earnings = await filter_data(owner_earnings, ignore_keys)
-        '''
 
         # Combine all the data
         combined_data = defaultdict(dict)
 
         # Merge the data based on 'date'
-        for entries in zip(ratios, key_metrics):
+        for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
             for entry in entries:
                 date = entry['date']
                 for key, value in entry.items():
@@ -117,8 +127,6 @@ async def download_data(ticker, con, start_date, end_date):
                     combined_data[date][key] = value
 
         combined_data = list(combined_data.values())
-        #Generate more features
-        #combined_data = calculate_combinations(combined_data)
 
         # Download historical stock data using yfinance
         df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
@@ -386,11 +394,13 @@ async def train_process(tickers, con):
     df_train = pd.concat(train_list, ignore_index=True)
     df_test = pd.concat(test_list, ignore_index=True)
-
+    best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
     df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
-    print(df_train)
+    top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
+    print(top_correlated)
+    #print(df_train)
 
     print('======Train Set Datapoints======')
     print(len(df_train))
@@ -405,7 +415,7 @@ async def train_process(tickers, con):
 
 async def run():
-    train_mode = False
+    train_mode = True
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
@@ -413,7 +423,7 @@ async def run():
     if train_mode:
         #Train first model
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
+        stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
         print('Number of Stocks')
         print(len(stock_symbols))
         await train_process(stock_symbols, con)
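
Reviewer note on the helper added above: find_top_correlated_features ranks candidate features by the absolute Pearson correlation that pandas' DataFrame.corr() reports against the target column. Below is a self-contained sketch of the same steps on an invented toy frame; the column names are illustrative only, not from the repo.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 200
df = pd.DataFrame({
    "date": pd.date_range("2020-01-01", periods=n),
    "price": rng.normal(100, 5, n),
    "feat_a": rng.normal(size=n),
    "feat_b": rng.normal(size=n),
})
# Build a label that depends on feat_a, so feat_a should rank first.
df["Target"] = (df["feat_a"] + 0.1 * rng.normal(size=n) > 0).astype(int)

# The same steps the helper performs for target 'Target', excluding ['date', 'price']:
features = [c for c in df.columns if c not in ["date", "price", "Target"]]
corr = df[features + ["Target"]].corr()["Target"].drop("Target")
print(corr.abs().sort_values(ascending=False).head(10))

Since the ranking is computed in-sample on the training frame, a high score here is descriptive only and says nothing about out-of-sample stability.
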
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 5fd2d46..4655804 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 2d35fcc..b2f47e7 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -16,7 +16,7 @@ from sklearn.feature_selection import SelectKBest, f_classif
 from tensorflow.keras.backend import clear_session
 from keras import regularizers
 from keras.layers import Layer
-
+from tensorflow.keras import backend as K
 from tqdm import tqdm
 from collections import defaultdict
@@ -26,7 +26,31 @@ import aiofiles
 import pickle
 import time
 
-# Based on the paper: https://arxiv.org/pdf/1603.00751
+class SelfAttention(Layer):
+    def __init__(self, **kwargs):
+        super(SelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
+                                 initializer='random_normal', trainable=True)
+        super(SelfAttention, self).build(input_shape)
+
+    def call(self, x):
+        # Alignment scores. Pass them through tanh function
+        e = K.tanh(K.dot(x, self.W))
+        # Remove dimension of size 1
+        e = K.squeeze(e, axis=-1)
+        # Compute the weights
+        alpha = K.softmax(e)
+        # Reshape to tensor of same shape as x for multiplication
+        alpha = K.expand_dims(alpha, axis=-1)
+        # Compute the context vector
+        context = x * alpha
+        context = K.sum(context, axis=1)
+        return context, alpha
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
 
 
 class ScorePredictor:
@@ -41,28 +65,30 @@ class ScorePredictor:
         inputs = Input(shape=(139,))
 
         # First dense layer
-        x = Dense(128, activation='leaky_relu')(inputs)
-        x = Dropout(0.5)(x)
+        x = Dense(128, activation='elu')(inputs)
+        x = Dropout(0.2)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
         for units in [64,32]:
-            x = Dense(units, activation='leaky_relu')(x)
-            x = Dropout(0.3)(x)
+            x = Dense(units, activation='elu')(x)
+            x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
 
         # Reshape for attention mechanism
         x = Reshape((32, 1))(x)
 
         # Attention mechanism
-        attention = Dense(32, activation='leaky_relu')(x)
-        attention = Dense(1, activation='softmax')(attention)
+        #attention = Dense(32, activation='elu')(x)
+        #attention = Dense(1, activation='softmax')(attention)
 
         # Apply attention
-        x = Multiply()([x, attention])
+        #x = Multiply()([x, attention])
+        x, _ = SelfAttention()(x)
+
         # Global average pooling
-        x = GlobalAveragePooling1D()(x)
+        #x = GlobalAveragePooling1D()(x)
 
         # Output layer (for class probabilities)
         outputs = Dense(2, activation='softmax')(x)  # Two neurons for class probabilities with softmax
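
For intuition about the new layer, here is a dependency-light NumPy mirror of what SelfAttention.call computes on a (batch, steps, features) input. This is an illustrative sketch, not repo code; the shapes in the comments match the Reshape((32, 1)) output the layer receives inside ScorePredictor.

import numpy as np

def self_attention_numpy(x, W):
    # x: (batch, steps, feat); W: the layer's learned (feat, 1) weight
    e = np.tanh(x @ W)                            # alignment scores, (batch, steps, 1)
    e = e.squeeze(-1)                             # (batch, steps)
    e = e - e.max(axis=-1, keepdims=True)         # numerically stable softmax
    alpha = np.exp(e) / np.exp(e).sum(axis=-1, keepdims=True)
    context = (x * alpha[..., None]).sum(axis=1)  # attention-weighted sum, (batch, feat)
    return context, alpha

x = np.random.randn(4, 32, 1)   # matches Reshape((32, 1)) in the model
context, alpha = self_attention_numpy(x, np.random.randn(1, 1))
print(context.shape, alpha.shape)   # (4, 1) (4, 32)

One consequence worth double-checking in review: with a single feature per timestep, the context vector is (batch, 1), so the final Dense(2, activation='softmax') sees one scalar per sample, much as the old GlobalAveragePooling1D path did.
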
diff --git a/app/restart_json.py b/app/restart_json.py
index 31adb6b..16263c9 100755
--- a/app/restart_json.py
+++ b/app/restart_json.py
@@ -15,6 +15,7 @@ import re
 import hashlib
 import glob
 from tqdm import tqdm
+from utils.country_list import country_list
 from dotenv import load_dotenv
 import os
diff --git a/app/utils/__init__.py b/app/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/utils/__pycache__/__init__.cpython-310.pyc b/app/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..249ad76
Binary files /dev/null and b/app/utils/__pycache__/__init__.cpython-310.pyc differ
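
One gap a reviewer may want to flag: restart_json.py now imports country_list from the utils package, but utils/country_list.py itself is not touched by this diff (only the empty utils/__init__.py and its compiled .pyc are added). For the import to resolve, the module presumably exposes a country_list object along these lines; this is a hypothetical stand-in, not the real file.

# app/utils/country_list.py (hypothetical stand-in; the actual module is
# not included in this diff and may differ)
country_list = [
    "United States",
    "Canada",
    "Germany",
]

As a smaller housekeeping point, the committed __pycache__/*.pyc files are binary build artifacts and are usually left to .gitignore rather than tracked.
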