add warm start to model

MuslemRahimi 2024-10-01 13:49:14 +02:00
parent 8872b5d1c7
commit d0b5cd5aaa
3 changed files with 115 additions and 193 deletions


@@ -38,23 +38,6 @@ def calculate_fdi(high, low, close, window=30):
return (2 - n1) * 100
def find_top_correlated_features(df, target_column, exclude_columns, top_n=10):
# Ensure the target column is not in the exclude list
exclude_columns = [col for col in exclude_columns if col != target_column]
# Select columns to consider for correlation
columns_to_consider = [col for col in df.columns if col not in exclude_columns + [target_column]]
# Calculate the correlation matrix
correlation_matrix = df[columns_to_consider + [target_column]].corr()
# Get correlations with the target column, excluding the target column itself
target_correlations = correlation_matrix[target_column].drop(target_column)
# Sort by absolute correlation value and select top N
top_correlated = target_correlations.abs().sort_values(ascending=False).head(top_n)
return top_correlated
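A quick usage sketch for the helper above (toy frame; the feature names here are made up for illustration, and the helper is assumed importable). It returns a pandas Series of the features most strongly correlated, by absolute value, with the target:

import pandas as pd

# hypothetical toy data for illustration only
df = pd.DataFrame({
    'revenueGrowth': [0.1, 0.2, 0.3, 0.4],
    'grossMargin': [0.5, 0.4, 0.3, 0.2],
    'Target': [0, 0, 1, 1],
})
top = find_top_correlated_features(df, 'Target', exclude_columns=['date', 'price'], top_n=2)
print(top)  # Series of |correlation| with 'Target', largest first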
async def download_data(ticker, con, start_date, end_date):
@@ -63,14 +46,13 @@ async def download_data(ticker, con, start_date, end_date):
statements = [
f"json/financial-statements/ratios/quarter/{ticker}.json",
f"json/financial-statements/key-metrics/quarter/{ticker}.json",
#f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
#f"json/financial-statements/income-statement/quarter/{ticker}.json",
#f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
f"json/financial-statements/income-statement/quarter/{ticker}.json",
f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
#f"json/financial-statements/key-metrics/quarter/{ticker}.json",
#f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
]
# Helper function to load JSON data asynchronously
@@ -91,35 +73,42 @@ async def download_data(ticker, con, start_date, end_date):
ratios = await load_json_from_file(statements[0])
ratios = await filter_data(ratios, ignore_keys)
# Require a minimum of 50 quarterly datapoints before proceeding
if len(ratios) < 50:
return
key_metrics = await load_json_from_file(statements[1])
key_metrics = await filter_data(key_metrics, ignore_keys)
'''
cashflow = await load_json_from_file(statements[1])
cashflow = await load_json_from_file(statements[2])
cashflow = await filter_data(cashflow, ignore_keys)
income = await load_json_from_file(statements[2])
income = await load_json_from_file(statements[3])
income = await filter_data(income, ignore_keys)
balance = await load_json_from_file(statements[3])
balance = await load_json_from_file(statements[4])
balance = await filter_data(balance, ignore_keys)
'''
income_growth = await load_json_from_file(statements[2])
income_growth = await load_json_from_file(statements[5])
income_growth = await filter_data(income_growth, ignore_keys)
balance_growth = await load_json_from_file(statements[3])
balance_growth = await load_json_from_file(statements[6])
balance_growth = await filter_data(balance_growth, ignore_keys)
cashflow_growth = await load_json_from_file(statements[4])
cashflow_growth = await load_json_from_file(statements[7])
cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
owner_earnings = await load_json_from_file(statements[8])
owner_earnings = await filter_data(owner_earnings, ignore_keys)
# Combine all the data
combined_data = defaultdict(dict)
# Merge the data based on 'date'
for entries in zip(ratios, key_metrics, income_growth, balance_growth, cashflow_growth):
for entries in zip(ratios, key_metrics, cashflow, income, balance, income_growth, balance_growth, cashflow_growth, owner_earnings):
for entry in entries:
date = entry['date']
for key, value in entry.items():
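The merge above, shown in isolation: every statement list is a list of dicts keyed by 'date', and entries for the same quarter are folded into one record. A minimal sketch with hypothetical keys (note that zip stops at the shortest list, so quarters missing from any one statement are silently dropped):

from collections import defaultdict

ratios = [{'date': '2024-06-30', 'currentRatio': 1.2}]
key_metrics = [{'date': '2024-06-30', 'marketCap': 3.1e12}]

combined_data = defaultdict(dict)
for entries in zip(ratios, key_metrics):
    for entry in entries:
        for key, value in entry.items():
            combined_data[entry['date']][key] = value

print(dict(combined_data))
# {'2024-06-30': {'date': '2024-06-30', 'currentRatio': 1.2, 'marketCap': 3.1e12}}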
@@ -321,29 +310,6 @@ async def download_data(ticker, con, start_date, end_date):
pass
async def process_symbol(ticker, con, start_date, end_date):
try:
test_size = 0.2
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")
predictor = ScorePredictor()
df = await download_data(ticker, con, start_date, end_date)
split_size = int(len(df) * (1-test_size))
test_data = df.iloc[split_size:]
selected_features = [col for col in df.columns if col not in ['date','price','Target']]
print(f"For the Ticker: {ticker}")
data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
if len(data) != 0:
if data['precision'] >= 50 and data['accuracy'] >= 50:
res = {'score': data['score']}
await save_json(ticker, res)
except Exception as e:
print(e)
async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
# Helper function to divide the tickers into chunks
def chunks(lst, size):
@@ -362,97 +328,84 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
return results
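The body of chunked_gather is elided by the hunk; a minimal sketch of the pattern its name and signature describe, batching coroutines so that at most chunk_size downloads run concurrently (a sketch under that assumption, not the committed body):

import asyncio

async def chunked_gather_sketch(tickers, con, start_date, end_date, chunk_size=10):
    def chunks(lst, size):
        for i in range(0, len(lst), size):
            yield lst[i:i + size]

    results = []
    for batch in chunks(tickers, chunk_size):
        # bound concurrency to one batch of downloads at a time
        results.extend(await asyncio.gather(
            *(download_data(t, con, start_date, end_date) for t in batch)
        ))
    return results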
#Train mode
async def train_process(tickers, con):
tickers = list(set(tickers))
df_train = pd.DataFrame()
df_test = pd.DataFrame()
test_size = 0.2
async def warm_start_training(tickers, con):
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")
df_train = pd.DataFrame()
df_test = pd.DataFrame()
dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
train_list = []
test_list = []
df_train = pd.concat(dfs, ignore_index=True)
df_train = df_train.sample(frac=1).reset_index(drop=True)
for df in dfs:
print('======Warm Start Train Set Datapoints======')
print(len(df_train))
predictor = ScorePredictor()
selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
predictor.warm_start_training(df_train[selected_features], df_train['Target'])
return predictor
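Usage, as wired up in run() below: warm-start once on a small basket of large caps, after which the fitted weights also live on disk for later fine-tuning:

# inside an async context; the basket matches the one used in run()
predictor = await warm_start_training(['META', 'NFLX', 'GOOG', 'TSLA'], con)
# weights are also persisted to ml_models/weights/ai-score/warm_start_weights.keras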
async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
try:
df = await download_data(ticker, con, start_date, end_date)
if df is None or len(df) == 0:
print(f"No data available for {ticker}")
return
test_size = 0.2
split_size = int(len(df) * (1-test_size))
train_data = df.iloc[:split_size]
test_data = df.iloc[split_size:]
# Append to the lists
train_list.append(train_data)
test_list.append(test_data)
except:
pass
# Concatenate all at once outside the loop
df_train = pd.concat(train_list, ignore_index=True)
df_test = pd.concat(test_list, ignore_index=True)
best_features = [col for col in df_train.columns if col not in ['date','price','Target']]
df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
top_correlated = find_top_correlated_features(df_train, 'Target', ['date', 'price'])
print(top_correlated)
#print(df_train)
print('======Train Set Datapoints======')
print(len(df_train))
selected_features = [col for col in df.columns if col not in ['date','price','Target']]
# Fine-tune the model
predictor = ScorePredictor()
#print(selected_features)
selected_features = [col for col in df_train if col not in ['price','date','Target']]
#best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=15)
#print(best_features)
predictor.train_model(df_train[selected_features], df_train['Target'])
predictor.evaluate_model(df_test[selected_features], df_test['Target'])
predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
print(f"Evaluating fine-tuned model for {ticker}")
data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
if len(data) != 0:
if data['precision'] >= 50 and data['accuracy'] >= 50:
res = {'score': data['score']}
await save_json(ticker, res)
print(f"Saved results for {ticker}")
except Exception as e:
print(f"Error processing {ticker}: {e}")
async def run():
train_mode = True
train_mode = True # Set this to False for fine-tuning and evaluation
con = sqlite3.connect('stocks.db')
cursor = con.cursor()
cursor.execute("PRAGMA journal_mode = wal")
if train_mode:
#Train first model
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 300E9 AND symbol NOT LIKE '%.%'")
stock_symbols = ['AAPL','AWR','TSLA','MSFT'] #[row[0] for row in cursor.fetchall()]
print('Number of Stocks')
print(len(stock_symbols))
await train_process(stock_symbols, con)
#Prediction Steps for all stock symbols
if not train_mode:
# Warm start training
warm_start_symbols = ['META', 'NFLX','GOOG','TSLA','AWR','AMD','NVDA']
print('Warm Start Training for:', warm_start_symbols)
predictor = await warm_start_training(warm_start_symbols, con)
else:
# Fine-tuning and evaluation for all stocks
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
stock_symbols = [row[0] for row in cursor.fetchall()]
total_symbols = stock_symbols
stock_symbols = ['NVDA'] #[row[0] for row in cursor.fetchall()]
print(f"Total tickers: {len(total_symbols)}")
print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")
chunk_size = len(total_symbols) // 100 # Divide the list into N chunks
chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
for chunk in chunks:
tasks = []
for ticker in tqdm(chunk):
tasks.append(process_symbol(ticker, con, start_date, end_date))
for ticker in tqdm(stock_symbols):
tasks.append(fine_tune_and_evaluate(ticker, con, start_date, end_date))
await asyncio.gather(*tasks)
con.close()
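A caveat in the chunking arithmetic above: len(total_symbols) // 100 is zero whenever the query returns fewer than 100 symbols, and a zero step makes range() raise. A defensive variant (a sketch, not the committed code):

chunk_size = max(1, len(total_symbols) // 100)  # guard against fewer than 100 symbols
chunks = [total_symbols[i:i + chunk_size]
          for i in range(0, len(total_symbols), chunk_size)]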
if __name__ == "__main__":
try:
asyncio.run(run())
except Exception as e:
print(e)
print(f"Main execution error: {e}")


@@ -56,96 +56,72 @@ class SelfAttention(Layer):
class ScorePredictor:
def __init__(self):
self.scaler = MinMaxScaler()
self.model = self.build_model()
self.model = None
self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.keras'
def build_model(self):
clear_session()
# Input layer
inputs = Input(shape=(139,))
# First dense layer
x = Dense(128, activation='elu')(inputs)
inputs = Input(shape=(335,))
x = Dense(512, activation='elu')(inputs)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
# Additional dense layers
for units in [64, 32]:
x = Dense(units, activation='elu')(x)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
# Reshape for attention mechanism
x = Reshape((32, 1))(x)
# Attention mechanism
#attention = Dense(32, activation='elu')(x)
#attention = Dense(1, activation='softmax')(attention)
# Apply attention
#x = Multiply()([x, attention])
x, _ = SelfAttention()(x)
outputs = Dense(2, activation='softmax')(x)
# Global average pooling
#x = GlobalAveragePooling1D()(x)
# Output layer (for class probabilities)
outputs = Dense(2, activation='softmax')(x) # Two neurons for class probabilities with softmax
# Create the model
model = Model(inputs=inputs, outputs=outputs)
# Optimizer with a lower learning rate
optimizer = Adam(learning_rate=0.01, clipnorm=1.0)
# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
return model
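For orientation, the data path through build_model: a 335-feature row is projected 512 → 64 → 32, reshaped to a (32, 1) sequence so the custom SelfAttention layer has a length-32 axis to attend over, then mapped to two softmax class probabilities. Because the loss is sparse_categorical_crossentropy, labels are integer class ids (0/1), not one-hot vectors, which matches the df['Target'] column used throughout. A smoke test on toy data (assumes ScorePredictor is importable and that SelfAttention collapses the sequence axis; its definition sits above this hunk):

import numpy as np

predictor = ScorePredictor()
model = predictor.build_model()
X = np.random.rand(8, 335).astype('float32')  # 8 toy samples, 335 features
y = np.array([0, 1, 0, 1, 0, 1, 0, 1])        # integer labels, not one-hot
model.fit(X, y, epochs=1, batch_size=4, verbose=0)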
def preprocess_data(self, X):
# X = X.applymap(lambda x: 9999 if x == 0 else x) # Replace 0 with 9999 as suggested in the paper
X = np.where(np.isinf(X), np.nan, X)
X = np.nan_to_num(X)
X = self.scaler.fit_transform(X)
return X
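One property of preprocess_data worth flagging: it calls fit_transform unconditionally, so evaluate_model refits the MinMaxScaler on the test set instead of reusing the training-set fit. If that is not intended, the standard split looks like this (a sketch of the usual pattern, not the committed code):

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn min/max on training data only
X_test_scaled = scaler.transform(X_test)        # reuse them; avoids test-set leakage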
def reshape_for_lstm(self, X):
return X.reshape((X.shape[0], X.shape[1], 1))
def train_model(self, X_train, y_train):
def warm_start_training(self, X_train, y_train):
X_train = self.preprocess_data(X_train)
#X_train = self.reshape_for_lstm(X_train)
self.model = self.build_model()
checkpoint = ModelCheckpoint('ml_models/weights/ai-score/weights.keras',
save_best_only=True, save_freq = 1,
monitor='val_loss', mode='min')
checkpoint = ModelCheckpoint(self.warm_start_model_path, save_best_only=True, save_freq=1, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.001)
self.model.fit(X_train, y_train, epochs=100_000, batch_size=32,
validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
self.model.save('ml_models/weights/ai-score/weights.keras')
self.model.fit(X_train, y_train, epochs=100_000, batch_size=32, validation_split=0.1, callbacks=[checkpoint, early_stopping, reduce_lr])
self.model.save(self.warm_start_model_path)
print("Warm start model saved.")
def fine_tune_model(self, X_train, y_train):
X_train = self.preprocess_data(X_train)
if self.model is None:
self.model = load_model(self.warm_start_model_path, custom_objects={'SelfAttention': SelfAttention})
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.0001)
self.model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping, reduce_lr])
print("Model fine-tuned (not saved).")
def evaluate_model(self, X_test, y_test):
# Preprocess the test data
X_test = self.preprocess_data(X_test)
#X_test = self.reshape_for_lstm(X_test)
# Load the trained model
self.model = load_model('ml_models/weights/ai-score/weights.keras')
if self.model is None:
raise ValueError("Model has not been trained or fine-tuned. Call warm_start_training or fine_tune_model first.")
# Get the model's predictions
test_predictions = self.model.predict(X_test)
print(test_predictions)
# Extract the probabilities for class 1 (index 1 in the softmax output)
class_1_probabilities = test_predictions[:, 1]
# Convert probabilities to binary predictions using a threshold of 0.5
binary_predictions = (class_1_probabilities >= 0.5).astype(int)
# Calculate precision and accuracy using binary predictions
print(test_predictions)
test_precision = precision_score(y_test, binary_predictions)
test_accuracy = accuracy_score(y_test, binary_predictions)
@@ -153,32 +129,25 @@ class ScorePredictor:
print(f"Precision: {round(test_precision * 100)}%")
print(f"Accuracy: {round(test_accuracy * 100)}%")
# Define thresholds and corresponding scores
thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
# Get the last prediction value (class 1 probability) for scoring
last_prediction_prob = class_1_probabilities[-1]
# Initialize score to 0 (or any default value)
score = 0
print(last_prediction_prob)
# Determine the score based on the last prediction probability
print(f"Last prediction probability: {last_prediction_prob}")
for threshold, value in zip(thresholds, scores):
if last_prediction_prob >= threshold:
score = value
break # Exit the loop once the score is determined
break
# Return the evaluation results
return {'accuracy': round(test_accuracy * 100),
'precision': round(test_precision * 100),
'score': score}
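A worked pass through the threshold table: the loop returns the first threshold the probability clears, so a class-1 probability of 0.72 fails 0.8 and 0.75, matches 0.7, and scores 8; anything below 0.2 keeps the default of 0. The same logic as a standalone function:

thresholds = [0.8, 0.75, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.2]
scores = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

def to_score(prob):
    # thresholds are descending, so the first match is the highest score earned
    for threshold, value in zip(thresholds, scores):
        if prob >= threshold:
            return value
    return 0

print(to_score(0.72))  # 8
print(to_score(0.15))  # 0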
def feature_selection(self, X_train, y_train, k=100):
print('feature selection:')
print(X_train.shape, y_train.shape)
print('Feature selection:')
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_train, y_train)
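The hunk (and this excerpt) ends mid-method; a typical completion of a SelectKBest helper, under the assumption that it returns the chosen column names, as the commented-out call best_features = predictor.feature_selection(...) earlier in the diff suggests:

# likely continuation (an assumption, not the committed code)
mask = selector.get_support()  # boolean mask over the input columns
return X_train.columns[mask].tolist()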