From 8872b5d1c7f9d3294c1eb8995a575025925e361e Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Tue, 1 Oct 2024 12:31:52 +0200
Subject: [PATCH] bugfixing stocks screener

---
 app/cron_ai_score.py                         | 54 +++++++++++-------
 .../__pycache__/score_model.cpython-310.pyc  | Bin 5131 -> 6224 bytes
 app/ml_models/score_model.py                 | 46 +++++++++++----
 app/restart_json.py                          |  1 +
 app/utils/__init__.py                        |  0
 .../__pycache__/__init__.cpython-310.pyc     | Bin 0 -> 146 bytes
 6 files changed, 69 insertions(+), 32 deletions(-)
 create mode 100644 app/utils/__init__.py
 create mode 100644 app/utils/__pycache__/__init__.cpython-310.pyc

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 5a3705a..21b6d1e 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -38,11 +38,24 @@ def calculate_fdi(high, low, close, window=30):
     return (2 - n1) * 100
 
-def hurst_exponent(ts, max_lag=100):
-    lags = range(2, max_lag)
-    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
-    poly = np.polyfit(np.log(lags), np.log(tau), 1)
-    return poly[0] * 2.0
+def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
+    # Ensure the target column is not in the exclude list
+    exclude_columns = [col for col in exclude_columns if col != target_column]
+
+    # Select columns to consider for correlation
+    columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
+
+    # Calculate the correlation matrix
+    correlation_matrix = df[columns_to_consider + [target_column]].corr()
+
+    # Get correlations with the target column, excluding the target column itself
+    target_correlations = correlation_matrix[target_column].drop(target_column)
+
+    # Sort by absolute correlation value and select top N
+    top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
+
+    return top_correlated
+
 
 async def download_data(ticker, con, start_date, end_date):
     try:
@@ -53,9 +66,9 @@ async def download_data(ticker, con, start_date, end_date):
             #f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
             #f"json/financial-statements/income-statement/quarter/{ticker}.json",
             #f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
-            #f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
-            #f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
-            #f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
+            f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
             #f"json/financial-statements/key-metrics/quarter/{ticker}.json",
             #f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
         ]
@@ -90,26 +103,23 @@ async def download_data(ticker, con, start_date, end_date):
         balance = await load_json_from_file(statements[3])
         balance = await filter_data(balance, ignore_keys)
-
-        income_growth = await load_json_from_file(statements[4])
+        '''
+        income_growth = await load_json_from_file(statements[2])
         income_growth = await filter_data(income_growth, ignore_keys)
 
-        balance_growth = await load_json_from_file(statements[5])
+        balance_growth = await load_json_from_file(statements[3])
         balance_growth = await filter_data(balance_growth, ignore_keys)
 
-        cashflow_growth = await load_json_from_file(statements[6])
+        cashflow_growth = await load_json_from_file(statements[4])
         cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
 
-        owner_earnings = await load_json_from_file(statements[7])
-        owner_earnings = await filter_data(owner_earnings, ignore_keys)
-        '''
 
         # Combine all the data
         combined_data = defaultdict(dict)
 
         # Merge the data based on 'date'
-        for entries in zip(ratios, key_metrics):
+        for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
             for entry in entries:
                 date = entry['date']
                 for key, value in entry.items():
@@ -117,8 +127,6 @@ async def download_data(ticker, con, start_date, end_date):
                     combined_data[date][key] = value
 
     combined_data = list(combined_data.values())
-    #Generate more features
-    #combined_data = calculate_combinations(combined_data)
 
     # Download historical stock data using yfinance
     df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
@@ -386,11 +394,13 @@ async def train_process(tickers, con):
 
     df_train = pd.concat(train_list, ignore_index=True)
     df_test = pd.concat(test_list, ignore_index=True)
-
+    best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
     df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
-    print(df_train)
+    top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
+    print(top_correlated)
+    #print(df_train)
 
     print('======Train Set Datapoints======')
     print(len(df_train))
@@ -405,7 +415,7 @@ async def train_process(tickers, con):
 
 
 async def run():
-    train_mode = False
+    train_mode = True
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
@@ -413,7 +423,7 @@ async def run():
     if train_mode:
         #Train first model
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
-        stock_symbols = [row[0] for row in cursor.fetchall()]
+        stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
        print('Number of Stocks')
        print(len(stock_symbols))
        await train_process(stock_symbols, con)
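Note: a minimal usage sketch for the new find_top_correlated_features() helper, mirroring the call added in train_process(). The DataFrame below is synthetic and its feature columns (grossProfitMargin, debtToEquity) are illustrative only; the import assumes the script runs from the app/ directory with the project's dependencies installed, since importing cron_ai_score pulls in its other modules.

import numpy as np
import pandas as pd

from cron_ai_score import find_top_correlated_features  # helper defined in the hunk above

# Synthetic stand-in for df_train: 'Target' is the binary label, 'date'/'price' are metadata.
rng = np.random.default_rng(0)
df_train = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=200, freq='D'),
    'price': rng.normal(100, 5, 200),
    'grossProfitMargin': rng.normal(0.4, 0.1, 200),
    'debtToEquity': rng.normal(1.2, 0.3, 200),
    'Target': rng.integers(0, 2, 200),
})

# Rank every non-excluded column by absolute correlation with 'Target'.
top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'], top_n=5)
print(top_correlated)  # pandas Series: feature name -> |correlation|, highest first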
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 5fd2d46577b3643814e87d05cdaeea5f128b2944..465580422ecf21e4609a321a035bc3d0daaba02f 100644
GIT binary patch
[binary delta omitted: compiled Python 3.10 bytecode]
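Note: the download_data() hunk in cron_ai_score.py above now zips the ratio, key-metric and growth lists and folds them into one record per fiscal date via defaultdict(dict). A condensed sketch of that merge pattern, with two made-up statement lists whose field names are illustrative only:

from collections import defaultdict

# Toy stand-ins for two of the statement lists (e.g. ratios and key_metrics).
ratios = [{'date': '2024-06-30', 'currentRatio': 1.1}, {'date': '2024-03-31', 'currentRatio': 1.3}]
key_metrics = [{'date': '2024-06-30', 'peRatio': 25.0}, {'date': '2024-03-31', 'peRatio': 22.0}]

combined_data = defaultdict(dict)
for entries in zip(ratios, key_metrics):      # zip pairs the lists positionally and stops at the shortest one
    for entry in entries:
        date = entry['date']
        for key, value in entry.items():
            combined_data[date][key] = value  # grouped by the 'date' field, not by position

combined_data = list(combined_data.values())
print(combined_data)
# [{'date': '2024-06-30', 'currentRatio': 1.1, 'peRatio': 25.0},
#  {'date': '2024-03-31', 'currentRatio': 1.3, 'peRatio': 22.0}]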
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index 2d35fcc..b2f47e7 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -16,7 +16,7 @@ from sklearn.feature_selection import SelectKBest, f_classif
 from tensorflow.keras.backend import clear_session
 from keras import regularizers
 from keras.layers import Layer
-
+from tensorflow.keras import backend as K
 from tqdm import tqdm
 from collections import defaultdict
 
@@ -26,7 +26,31 @@ import aiofiles
 import pickle
 import time
 
-# Based on the paper: https://arxiv.org/pdf/1603.00751
+class SelfAttention(Layer):
+    def __init__(self, **kwargs):
+        super(SelfAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
+                                 initializer='random_normal', trainable=True)
+        super(SelfAttention, self).build(input_shape)
+
+    def call(self, x):
+        # Alignment scores. Pass them through tanh function
+        e = K.tanh(K.dot(x, self.W))
+        # Remove dimension of size 1
+        e = K.squeeze(e, axis=-1)
+        # Compute the weights
+        alpha = K.softmax(e)
+        # Reshape to tensor of same shape as x for multiplication
+        alpha = K.expand_dims(alpha, axis=-1)
+        # Compute the context vector
+        context = x * alpha
+        context = K.sum(context, axis=1)
+        return context, alpha
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
 
 
 class ScorePredictor:
@@ -41,28 +65,30 @@ class ScorePredictor:
         inputs = Input(shape=(139,))
 
         # First dense layer
-        x = Dense(128, activation='leaky_relu')(inputs)
-        x = Dropout(0.5)(x)
+        x = Dense(128, activation='elu')(inputs)
+        x = Dropout(0.2)(x)
         x = BatchNormalization()(x)
 
         # Additional dense layers
         for units in [64,32]:
-            x = Dense(units, activation='leaky_relu')(x)
-            x = Dropout(0.3)(x)
+            x = Dense(units, activation='elu')(x)
+            x = Dropout(0.2)(x)
             x = BatchNormalization()(x)
 
         # Reshape for attention mechanism
         x = Reshape((32, 1))(x)
 
         # Attention mechanism
-        attention = Dense(32, activation='leaky_relu')(x)
-        attention = Dense(1, activation='softmax')(attention)
+        #attention = Dense(32, activation='elu')(x)
+        #attention = Dense(1, activation='softmax')(attention)
 
         # Apply attention
-        x = Multiply()([x, attention])
+        #x = Multiply()([x, attention])
+        x, _ = SelfAttention()(x)
+
         # Global average pooling
-        x = GlobalAveragePooling1D()(x)
+        #x = GlobalAveragePooling1D()(x)
 
         # Output layer (for class probabilities)
         outputs = Dense(2, activation='softmax')(x) # Two neurons for class probabilities with softmax
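Note: a standalone shape check for the SelfAttention layer introduced above, assuming TensorFlow 2.x. It wires the layer the same way build_model() now does: a (batch, 32, 1) tensor goes in and only the context output is kept. Because the reshape leaves a single channel per step, the pooled context comes out as one value per sample before the softmax head.

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input, Layer, Reshape
from tensorflow.keras.models import Model

class SelfAttention(Layer):
    """Condensed version of the layer above: tanh scores -> softmax weights -> weighted sum."""
    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        super().build(input_shape)

    def call(self, x):
        e = K.squeeze(K.tanh(K.dot(x, self.W)), axis=-1)  # (batch, 32) alignment scores
        alpha = K.expand_dims(K.softmax(e), axis=-1)      # (batch, 32, 1) attention weights
        return K.sum(x * alpha, axis=1), alpha            # context: (batch, 1) here, one channel per step

inputs = Input(shape=(139,))
x = Dense(32, activation='elu')(inputs)
x = Reshape((32, 1))(x)
context, _ = SelfAttention()(x)                 # attention weights discarded, as in build_model()
outputs = Dense(2, activation='softmax')(context)
Model(inputs, outputs).summary()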
diff --git a/app/restart_json.py b/app/restart_json.py
index 31adb6b..16263c9 100755
--- a/app/restart_json.py
+++ b/app/restart_json.py
@@ -15,6 +15,7 @@ import re
 import hashlib
 import glob
 from tqdm import tqdm
+from utils.country_list import country_list
 from dotenv import load_dotenv
 import os
 
diff --git a/app/utils/__init__.py b/app/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/utils/__pycache__/__init__.cpython-310.pyc b/app/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..249ad7678866d0949adbf41615379ad7b52a2f01
GIT binary patch
literal 146
[binary literal omitted: compiled Python 3.10 bytecode]
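Note: the import added to restart_json.py resolves only if app/utils/country_list.py exists next to the new __init__.py; that module is not included in this patch. Purely as a hypothetical placeholder (the real contents are unknown), it would just need to expose a country_list iterable, for example:

# app/utils/country_list.py -- hypothetical stub, not part of this patch; the real module may differ.
country_list = [
    'United States',
    'Canada',
    'Germany',
    'Japan',
]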