bugfixing stocks screener

This commit is contained in:
MuslemRahimi 2024-10-01 12:31:52 +02:00
parent 63cddb5caf
commit 8872b5d1c7
6 changed files with 69 additions and 32 deletions

View File

@ -38,11 +38,24 @@ def calculate_fdi(high, low, close, window=30):
    return (2 - n1) * 100

def hurst_exponent(ts, max_lag=100):
    lags = range(2, max_lag)
    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    poly = np.polyfit(np.log(lags), np.log(tau), 1)
    return poly[0] * 2.0
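This is the standard log-log regression estimate of the Hurst exponent: values below 0.5 suggest mean reversion, near 0.5 a random walk, above 0.5 a trending series. A minimal usage sketch of the function above; the synthetic random walk and seed are illustrative only, not from the commit:

import numpy as np

rng = np.random.default_rng(0)
random_walk = np.cumsum(rng.standard_normal(2000))
h = hurst_exponent(random_walk)
print(f"Hurst exponent: {h:.3f}")  # expected to land near 0.5 for a random walk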
def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
    # Ensure the target column is not in the exclude list
    exclude_columns = [col for col in exclude_columns if col != target_column]
    # Select columns to consider for correlation
    columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
    # Calculate the correlation matrix
    correlation_matrix = df[columns_to_consider + [target_column]].corr()
    # Get correlations with the target column, excluding the target column itself
    target_correlations = correlation_matrix[target_column].drop(target_column)
    # Sort by absolute correlation value and select top N
    top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
    return top_correlated
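Note that the helper returns absolute correlations, so the sign of each relationship is discarded. A small sketch of how it behaves on toy data; the column names here are illustrative, not from the screener:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df = pd.DataFrame({
    'price': rng.normal(size=100),
    'feat_a': rng.normal(size=100),
    'feat_b': rng.normal(size=100),
})
df['Target'] = (df['feat_a'] > 0).astype(int)
df['date'] = pd.date_range('2024-01-01', periods=100)

print(find_top_correlated_features(df, 'Target', ['date', 'price'], top_n=2))
# feat_a ranks first because Target is derived from it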
async def download_data(ticker, con, start_date, end_date):
    try:
@ -53,9 +66,9 @@ async def download_data(ticker, con, start_date, end_date):
#f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
#f"json/financial-statements/income-statement/quarter/{ticker}.json",
#f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
#f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
#f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
#f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
#f"json/financial-statements/key-metrics/quarter/{ticker}.json",
#f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
]
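With the three growth files uncommented, their positions in the statements list shift: assuming the two active entries above this hunk are the ratios and key-metrics files, the growth files now sit at indices 2 to 4. That is exactly what the index changes in the next hunk (statements[4..6] becoming statements[2..4]) rely on.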
@ -90,26 +103,23 @@ async def download_data(ticker, con, start_date, end_date):
        balance = await load_json_from_file(statements[3])
        balance = await filter_data(balance, ignore_keys)
        income_growth = await load_json_from_file(statements[4])
        '''
        income_growth = await load_json_from_file(statements[2])
        income_growth = await filter_data(income_growth, ignore_keys)
        balance_growth = await load_json_from_file(statements[5])
        balance_growth = await load_json_from_file(statements[3])
        balance_growth = await filter_data(balance_growth, ignore_keys)
        cashflow_growth = await load_json_from_file(statements[6])
        cashflow_growth = await load_json_from_file(statements[4])
        cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
        owner_earnings = await load_json_from_file(statements[7])
        owner_earnings = await filter_data(owner_earnings, ignore_keys)
        '''
        # Combine all the data
        combined_data = defaultdict(dict)
        # Merge the data based on 'date'
        for entries in zip(ratios, key_metrics):
        for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
            for entry in entries:
                date = entry['date']
                for key, value in entry.items():
@ -117,8 +127,6 @@ async def download_data(ticker, con, start_date, end_date):
                    combined_data[date][key] = value
        combined_data = list(combined_data.values())
        # Generate more features
        #combined_data = calculate_combinations(combined_data)
        # Download historical stock data using yfinance
        df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
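One caveat with the new five-way merge above: zip stops at the shortest input, so if one statement series has fewer quarters, the extra rows of the others are silently dropped. A minimal sketch of the date-keyed merge pattern, with toy records standing in for the real statements:

from collections import defaultdict

ratios = [{'date': '2024-03-31', 'peRatio': 28.1}]
key_metrics = [{'date': '2024-03-31', 'roic': 0.31}]

combined_data = defaultdict(dict)
for entries in zip(ratios, key_metrics):  # zip truncates to the shortest list
    for entry in entries:
        date = entry['date']
        for key, value in entry.items():
            combined_data[date][key] = value

print(list(combined_data.values()))
# [{'date': '2024-03-31', 'peRatio': 28.1, 'roic': 0.31}]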
@ -386,11 +394,13 @@ async def train_process(tickers, con):
    df_train = pd.concat(train_list, ignore_index=True)
    df_test = pd.concat(test_list, ignore_index=True)
    best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
    df_train = df_train.sample(frac=1).reset_index(drop=True)  #df_train.reset_index(drop=True)
    print(df_train)
    top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
    print(top_correlated)
    #print(df_train)
    print('======Train Set Datapoints======')
    print(len(df_train))
@ -405,7 +415,7 @@ async def train_process(tickers, con):
async def run():
    train_mode = False
    train_mode = True
    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
@ -413,7 +423,7 @@ async def run():
    if train_mode:
        # Train first model
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
        stock_symbols = [row[0] for row in cursor.fetchall()]
        stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
        print('Number of Stocks')
        print(len(stock_symbols))
        await train_process(stock_symbols, con)
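For completeness, a hedged sketch of how the coroutine entry point would presumably be invoked; the commit does not show the call site:

import asyncio

if __name__ == '__main__':
    asyncio.run(run())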

View File

@ -16,7 +16,7 @@ from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow.keras.backend import clear_session
from keras import regularizers
from keras.layers import Layer
from tensorflow.keras import backend as K
from tqdm import tqdm
from collections import defaultdict
@ -26,7 +26,31 @@ import aiofiles
import pickle
import time
# Based on the paper: https://arxiv.org/pdf/1603.00751
class SelfAttention(Layer):
    def __init__(self, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        super(SelfAttention, self).build(input_shape)

    def call(self, x):
        # Alignment scores, passed through a tanh
        e = K.tanh(K.dot(x, self.W))
        # Remove the trailing dimension of size 1
        e = K.squeeze(e, axis=-1)
        # Compute the attention weights
        alpha = K.softmax(e)
        # Reshape back to the same rank as x for the multiplication
        alpha = K.expand_dims(alpha, axis=-1)
        # Compute the context vector (weighted sum over timesteps)
        context = x * alpha
        context = K.sum(context, axis=1)
        return context, alpha

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1])
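A minimal sketch of the layer in isolation, assuming the TensorFlow imports above; the shapes are illustrative and match the (32, 1) reshape used in the model below:

import tensorflow as tf

layer = SelfAttention()
x = tf.random.normal((4, 32, 1))  # (batch, timesteps, features)
context, alpha = layer(x)
print(context.shape)  # (4, 1): attention-weighted sum over the 32 timesteps
print(alpha.shape)    # (4, 32, 1): one weight per timestep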
class ScorePredictor:
@ -41,28 +65,30 @@ class ScorePredictor:
        inputs = Input(shape=(139,))

        # First dense layer
        x = Dense(128, activation='leaky_relu')(inputs)
        x = Dropout(0.5)(x)
        x = Dense(128, activation='elu')(inputs)
        x = Dropout(0.2)(x)
        x = BatchNormalization()(x)

        # Additional dense layers
        for units in [64,32]:
            x = Dense(units, activation='leaky_relu')(x)
            x = Dropout(0.3)(x)
            x = Dense(units, activation='elu')(x)
            x = Dropout(0.2)(x)
            x = BatchNormalization()(x)

        # Reshape for attention mechanism
        x = Reshape((32, 1))(x)

        # Attention mechanism
        attention = Dense(32, activation='leaky_relu')(x)
        attention = Dense(1, activation='softmax')(attention)
        #attention = Dense(32, activation='elu')(x)
        #attention = Dense(1, activation='softmax')(attention)

        # Apply attention
        x = Multiply()([x, attention])
        #x = Multiply()([x, attention])
        x, _ = SelfAttention()(x)

        # Global average pooling
        x = GlobalAveragePooling1D()(x)
        #x = GlobalAveragePooling1D()(x)

        # Output layer (for class probabilities)
        outputs = Dense(2, activation='softmax')(x)  # Two neurons for class probabilities with softmax
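The replacement fixes a real degeneracy: in the removed block, Dense(1, activation='softmax') applies softmax over a single logit, which is identically 1.0, so the Multiply was a no-op and every timestep was weighted equally. The new SelfAttention layer also sums over the timestep axis itself, which is presumably why GlobalAveragePooling1D is commented out: its context vector is already pooled. A quick check of the degenerate softmax:

import tensorflow as tf

print(tf.keras.activations.softmax(tf.constant([[2.7]])))
# tf.Tensor([[1.]], shape=(1, 1), dtype=float32): a constant attention weight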

View File

@ -15,6 +15,7 @@ import re
import hashlib
import glob
from tqdm import tqdm
from utils.country_list import country_list
from dotenv import load_dotenv
import os

app/utils/__init__.py Normal file (new, 0 lines)
View File

Binary file not shown.