From 8872b5d1c7f9d3294c1eb8995a575025925e361e Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Tue, 1 Oct 2024 12:31:52 +0200
Subject: [PATCH] bugfixing stocks screener

---
 app/cron_ai_score.py                         | 54 +++++++++++-------
 .../__pycache__/score_model.cpython-310.pyc  | Bin 5131 -> 6224 bytes
 app/ml_models/score_model.py                 | 46 +++++++++++----
 app/restart_json.py                          |  1 +
 app/utils/__init__.py                        |  0
 .../__pycache__/__init__.cpython-310.pyc     | Bin 0 -> 146 bytes
 6 files changed, 69 insertions(+), 32 deletions(-)
 create mode 100644 app/utils/__init__.py
 create mode 100644 app/utils/__pycache__/__init__.cpython-310.pyc

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 5a3705a..21b6d1e 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -38,11 +38,24 @@ def calculate_fdi(high, low, close, window=30):
     return (2 - n1) * 100
 
-def hurst_exponent(ts, max_lag=100):
-    lags = range(2, max_lag)
-    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
-    poly = np.polyfit(np.log(lags), np.log(tau), 1)
-    return poly[0] * 2.0
+def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
+    # Ensure the target column is not in the exclude list
+    exclude_columns = [col for col in exclude_columns if col != target_column]
+
+    # Select columns to consider for correlation
+    columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
+
+    # Calculate the correlation matrix
+    correlation_matrix = df[columns_to_consider + [target_column]].corr()
+
+    # Get correlations with the target column, excluding the target column itself
+    target_correlations = correlation_matrix[target_column].drop(target_column)
+
+    # Sort by absolute correlation value and select top N
+    top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
+
+    return top_correlated
+
 
 async def download_data(ticker, con, start_date, end_date):
     try:
@@ -53,9 +66,9 @@ async def download_data(ticker, con, start_date, end_date):
             #f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
             #f"json/financial-statements/income-statement/quarter/{ticker}.json",
             #f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
-            #f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
-            #f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
-            #f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
             #f"json/financial-statements/key-metrics/quarter/{ticker}.json",
             #f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
         ]
@@ -90,26 +103,23 @@ async def download_data(ticker, con, start_date, end_date):
         balance = await load_json_from_file(statements[3])
         balance = await filter_data(balance, ignore_keys)
-
-        income_growth = await load_json_from_file(statements[4])
+        '''
+        income_growth = await load_json_from_file(statements[2])
         income_growth = await filter_data(income_growth, ignore_keys)
 
-        balance_growth = await load_json_from_file(statements[5])
+        balance_growth = await load_json_from_file(statements[3])
         balance_growth = await filter_data(balance_growth, ignore_keys)
 
-        cashflow_growth = await load_json_from_file(statements[6])
+        cashflow_growth = await load_json_from_file(statements[4])
         cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
 
-        owner_earnings = await load_json_from_file(statements[7])
-        owner_earnings = await filter_data(owner_earnings, ignore_keys)
-        '''
 
         # Combine all the data
         combined_data = defaultdict(dict)
 
         # Merge the data based on 'date'
-        for entries in zip(ratios, key_metrics):
+        for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
             for entry in entries:
                 date = entry['date']
                 for key, value in entry.items():
@@ -117,8 +127,6 @@ async def download_data(ticker, con, start_date, end_date):
                     combined_data[date][key] = value
 
     combined_data = list(combined_data.values())
-    #Generate more features
-    #combined_data = calculate_combinations(combined_data)
 
     # Download historical stock data using yfinance
     df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
@@ -386,11 +394,13 @@ async def train_process(tickers, con):
 
     df_train = pd.concat(train_list, ignore_index=True)
     df_test = pd.concat(test_list, ignore_index=True)
-
+    best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
     df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
-    print(df_train)
+    top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
+    print(top_correlated)
+    #print(df_train)
 
     print('======Train Set Datapoints======')
     print(len(df_train))
@@ -405,7 +415,7 @@ async def train_process(tickers, con):
 
 
 async def run():
-    train_mode = False
+    train_mode = True
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
@@ -413,7 +423,7 @@ async def run():
     if train_mode:
         #Train first model
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
+        stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
        print('Number of Stocks')
        print(len(stock_symbols))
        await train_process(stock_symbols, con)
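Note: a minimal usage sketch for the new find_top_correlated_features() helper, mirroring the call added in train_process(). The DataFrame below is synthetic and its feature columns (grossProfitMargin, debtToEquity) are illustrative only; the import assumes the script runs from the app/ directory with the project's dependencies installed, since importing cron_ai_score pulls in its other modules.

import numpy as np
import pandas as pd

from cron_ai_score import find_top_correlated_features  # helper defined in the hunk above

# Synthetic stand-in for df_train: 'Target' is the binary label, 'date'/'price' are metadata.
rng = np.random.default_rng(0)
df_train = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=200, freq='D'),
    'price': rng.normal(100, 5, 200),
    'grossProfitMargin': rng.normal(0.4, 0.1, 200),
    'debtToEquity': rng.normal(1.2, 0.3, 200),
    'Target': rng.integers(0, 2, 200),
})

# Rank every non-excluded column by absolute correlation with 'Target'.
top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'], top_n=5)
print(top_correlated)  # pandas Series: feature name -> |correlation|, highest first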
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 5fd2d46577b3643814e87d05cdaeea5f128b2944..465580422ecf21e4609a321a035bc3d0daaba02f 100644
GIT binary patch
[binary delta omitted: compiled Python 3.10 bytecode]
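Note: the download_data() hunk in cron_ai_score.py above now zips the ratio, key-metric and growth lists and folds them into one record per fiscal date via defaultdict(dict). A condensed sketch of that merge pattern, with two made-up statement lists whose field names are illustrative only:

from collections import defaultdict

# Toy stand-ins for two of the statement lists (e.g. ratios and key_metrics).
ratios = [{'date': '2024-06-30', 'currentRatio': 1.1}, {'date': '2024-03-31', 'currentRatio': 1.3}]
key_metrics = [{'date': '2024-06-30', 'peRatio': 25.0}, {'date': '2024-03-31', 'peRatio': 22.0}]

combined_data = defaultdict(dict)
for entries in zip(ratios, key_metrics):      # zip pairs the lists positionally and stops at the shortest one
    for entry in entries:
        date = entry['date']
        for key, value in entry.items():
            combined_data[date][key] = value  # grouped by the 'date' field, not by position

combined_data = list(combined_data.values())
print(combined_data)
# [{'date': '2024-06-30', 'currentRatio': 1.1, 'peRatio': 25.0},
#  {'date': '2024-03-31', 'currentRatio': 1.3, 'peRatio': 22.0}]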
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 2d35fcc..b2f47e7 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -16,7 +16,7 @@ from sklearn.feature_selection import SelectKBest, f_classif
 from tensorflow.keras.backend import clear_session
 from keras import regularizers
 from keras.layers import Layer
-
+from tensorflow.keras import backend as K
 from tqdm import tqdm
 from collections import defaultdict
 
@@ -26,7 +26,31 @@ import aiofiles
 import pickle
 import time
 
-# Based on the paper: https://arxiv.org/pdf/1603.00751
+class SelfAttention(Layer):
+    def __init__(self, **kwargs):
+        super(SelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
+                                 initializer='random_normal', trainable=True)
+        super(SelfAttention, self).build(input_shape)
+
+    def call(self, x):
+        # Alignment scores. Pass them through tanh function
+        e = K.tanh(K.dot(x, self.W))
+        # Remove dimension of size 1
+        e = K.squeeze(e, axis=-1)
+        # Compute the weights
+        alpha = K.softmax(e)
+        # Reshape to tensor of same shape as x for multiplication
+        alpha = K.expand_dims(alpha, axis=-1)
+        # Compute the context vector
+        context = x * alpha
+        context = K.sum(context, axis=1)
+        return context, alpha
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
 
 
 class ScorePredictor:
@@ -41,28 +65,30 @@ class ScorePredictor:
         inputs = Input(shape=(139,))
 
         # First dense layer
-        x = Dense(128, activation='leaky_relu')(inputs)
-        x = Dropout(0.5)(x)
+        x = Dense(128, activation='elu')(inputs)
+        x = Dropout(0.2)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
         for units in [64,32]:
-            x = Dense(units, activation='leaky_relu')(x)
-            x = Dropout(0.3)(x)
+            x = Dense(units, activation='elu')(x)
+            x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
 
         # Reshape for attention mechanism
         x = Reshape((32, 1))(x)
 
         # Attention mechanism
-        attention = Dense(32, activation='leaky_relu')(x)
-        attention = Dense(1, activation='softmax')(attention)
+        #attention = Dense(32, activation='elu')(x)
+        #attention = Dense(1, activation='softmax')(attention)
 
         # Apply attention
-        x = Multiply()([x, attention])
+        #x = Multiply()([x, attention])
+        x, _ = SelfAttention()(x)
+
         # Global average pooling
-        x = GlobalAveragePooling1D()(x)
+        #x = GlobalAveragePooling1D()(x)
 
         # Output layer (for class probabilities)
         outputs = Dense(2, activation='softmax')(x) # Two neurons for class probabilities with softmax
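Note: a standalone shape check for the SelfAttention layer introduced above, assuming TensorFlow 2.x. It wires the layer the same way build_model() now does: a (batch, 32, 1) tensor goes in and only the context output is kept. Because the reshape leaves a single channel per step, the pooled context comes out as one value per sample before the softmax head.

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input, Layer, Reshape
from tensorflow.keras.models import Model

class SelfAttention(Layer):
    """Condensed version of the layer above: tanh scores -> softmax weights -> weighted sum."""
    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        super().build(input_shape)

    def call(self, x):
        e = K.squeeze(K.tanh(K.dot(x, self.W)), axis=-1)  # (batch, 32) alignment scores
        alpha = K.expand_dims(K.softmax(e), axis=-1)      # (batch, 32, 1) attention weights
        return K.sum(x * alpha, axis=1), alpha            # context: (batch, 1) here, one channel per step

inputs = Input(shape=(139,))
x = Dense(32, activation='elu')(inputs)
x = Reshape((32, 1))(x)
context, _ = SelfAttention()(x)                 # attention weights discarded, as in build_model()
outputs = Dense(2, activation='softmax')(context)
Model(inputs, outputs).summary()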
diff --git a/app/restart_json.py b/app/restart_json.py
index 31adb6b..16263c9 100755
--- a/app/restart_json.py
+++ b/app/restart_json.py
@@ -15,6 +15,7 @@ import re
 import hashlib
 import glob
 from tqdm import tqdm
+from utils.country_list import country_list
 from dotenv import load_dotenv
 import os
 
diff --git a/app/utils/__init__.py b/app/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/utils/__pycache__/__init__.cpython-310.pyc b/app/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..249ad7678866d0949adbf41615379ad7b52a2f01
GIT binary patch
literal 146
[binary literal omitted: compiled Python 3.10 bytecode]
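Note: the import added to restart_json.py resolves only if app/utils/country_list.py exists next to the new __init__.py; that module is not included in this patch. Purely as a hypothetical placeholder (the real contents are unknown), it would just need to expose a country_list iterable, for example:

# app/utils/country_list.py -- hypothetical stub, not part of this patch; the real module may differ.
country_list = [
    'United States',
    'Canada',
    'Germany',
    'Japan',
]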