reduce features
This commit is contained in:
parent
8521a4a404
commit
60cd644afa
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
+        # Threshold of enough data points needed
         if len(ratios) < 50:
            print('Not enough data points')
            return
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
        # Merge the data based on 'date'
        for entries in zip(ratios, key_metrics, income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
            for entry in entries:
-               date = entry['date']
-               for key, value in entry.items():
-                   if key not in combined_data[date]:
-                       combined_data[date][key] = value
+               try:
+                   date = entry['date']
+                   for key, value in entry.items():
+                       if key not in combined_data[date]:
+                           combined_data[date][key] = value
+               except:
+                   pass
 
        combined_data = list(combined_data.values())
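For context, a minimal sketch of the merge pattern this hunk wraps in a try/except, assuming combined_data is a defaultdict(dict) keyed by date as the subscripting implies; the two sample record lists are invented stand-ins for ratios, key_metrics, and the other sources:

from collections import defaultdict

# Hypothetical sample records standing in for ratios, key_metrics, etc.
ratios = [{'date': '2024-01-01', 'peRatio': 25.0}]
key_metrics = [{'date': '2024-01-01', 'marketCap': 1_000_000}]

combined_data = defaultdict(dict)

for entries in zip(ratios, key_metrics):
    for entry in entries:
        try:
            date = entry['date']
            for key, value in entry.items():
                # First writer wins: keep the value from the earliest source list
                if key not in combined_data[date]:
                    combined_data[date][key] = value
        except (KeyError, TypeError):
            # Skip malformed entries (e.g., missing 'date'), as the bare except above does
            pass

combined_data = list(combined_data.values())
print(combined_data)  # [{'date': '2024-01-01', 'peRatio': 25.0, 'marketCap': 1000000}]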
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
        fundamental_columns = [
-           'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
-           'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-           'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-           'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-           'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-           'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-           'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-           'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-           'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+           'operatingCashFlow', 'cashAndCashEquivalents', 'totalEquity', 'otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+           'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments', 'totalAssets',
        ]
 
        # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
        # Compute combinations for each group of columns
        compute_column_ratios(fundamental_columns, df_combined, new_columns)
        compute_column_ratios(stats_columns, df_combined, new_columns)
-       compute_column_ratios(ta_columns, df_combined, new_columns)
+       #compute_column_ratios(ta_columns, df_combined, new_columns)
 
        # Concatenate the new ratio columns with the original DataFrame
        df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
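The diff never shows the body of compute_column_ratios, only its call sites, so the helper below is an assumption: a plausible pairwise-ratio implementation consistent with "compute combinations within a group" and with the new_columns dict being concatenated afterwards:

from itertools import combinations

import numpy as np
import pandas as pd

def compute_column_ratios(columns, df, new_columns):
    # Assumed behavior: for every pair of columns in the group, store their
    # ratio in new_columns so all ratios can be concatenated in one pass.
    for col_a, col_b in combinations(columns, 2):
        if col_a in df.columns and col_b in df.columns:
            # Replace divide-by-zero results with NaN instead of inf
            new_columns[f'{col_a}_to_{col_b}'] = (
                df[col_a] / df[col_b].replace(0, np.nan))

df_combined = pd.DataFrame({'totalDebt': [10.0, 20.0], 'totalAssets': [100.0, 250.0]})
new_columns = {}
compute_column_ratios(['totalDebt', 'totalAssets'], df_combined, new_columns)
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
print(df_combined)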
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
        if not df_copy.empty:
            with open(file_path, 'wb') as file:
                file.write(orjson.dumps(df_copy.to_dict(orient='records')))
 
-           print(df_copy)
+           return df_copy
 
    except Exception as e:
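As a side note, the save path here relies on orjson.dumps returning bytes rather than str. A minimal self-contained sketch (the file name and sample frame are invented for illustration):

import orjson
import pandas as pd

df_copy = pd.DataFrame({'date': ['2024-01-01'], 'close': [182.5]})
file_path = 'example_records.json'  # hypothetical path

if not df_copy.empty:
    # orjson.dumps returns bytes, so the file must be opened in binary mode
    with open(file_path, 'wb') as file:
        file.write(orjson.dumps(df_copy.to_dict(orient='records')))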
@@ -270,7 +269,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
        # Await the results for the current chunk
        chunk_results = await asyncio.gather(*tasks)
 
-
+       train_list = []
 
        for ticker, df in zip(chunk, chunk_results):
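For reference, the batching pattern chunked_gather relies on, as a standalone sketch; the fetch coroutine is a stand-in for download_data, and the ticker list is invented:

import asyncio

async def fetch(ticker):
    # Stand-in for download_data(ticker, con, start_date, end_date, skip_downloading)
    await asyncio.sleep(0)
    return f'{ticker}-data'

async def chunked_gather(tickers, chunk_size):
    results = []
    # Process tickers in fixed-size batches to cap concurrent requests
    for i in range(0, len(tickers), chunk_size):
        chunk = tickers[i:i + chunk_size]
        tasks = [fetch(ticker) for ticker in chunk]
        results.extend(await asyncio.gather(*tasks))
    return results

print(asyncio.run(chunked_gather(['AAPL', 'MSFT', 'NVDA'], chunk_size=2)))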
@@ -324,23 +323,22 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
            data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
            # Check if the evaluation data meets the criteria
-           '''
-           if (data['precision'] >= 50 and data['accuracy'] >= 50 and
-               data['accuracy'] < 100 and data['precision'] < 100 and
-               data['f1_score'] >= 50 and data['recall_score'] >= 50 and
-               data['roc_auc_score'] >= 50):
-           '''
-               # Save the evaluation data to a JSON file
-               await save_json(ticker, data)
-               print(f"Saved results for {ticker}")
+           # Save the evaluation data to a JSON file
+           await save_json(ticker, data)
+           print(f"Saved results for {ticker}")
        except Exception as e:
            print(e)
            pass
 
 
 async def warm_start_training(tickers, con, skip_downloading):
 
-   dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+   dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
    WHERE marketCap >= 500E6
    AND symbol NOT LIKE '%.%'
    AND symbol NOT LIKE '%-%'
    ORDER BY marketCap DESC;
    """)
-   warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
-
+   warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
    print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
    await warm_start_training(warm_start_symbols, con, skip_downloading)
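The commented-out list comprehension suggests the symbols normally come from the screener query above. A minimal sketch of that path, assuming a sqlite3-style cursor; the in-memory database, table, and rows here are invented:

import sqlite3

con = sqlite3.connect(':memory:')  # hypothetical stand-in for the real database
con.execute('CREATE TABLE stocks (symbol TEXT, marketCap REAL)')
con.execute("INSERT INTO stocks VALUES ('AAPL', 3.0E12), ('BRK.B', 9.0E11), ('TINY', 1.0E6)")

cursor = con.cursor()
cursor.execute("""
    SELECT DISTINCT symbol FROM stocks
    WHERE marketCap >= 500E6
    AND symbol NOT LIKE '%.%'
    AND symbol NOT LIKE '%-%'
    ORDER BY marketCap DESC;
""")
warm_start_symbols = [row[0] for row in cursor.fetchall()]
print(warm_start_symbols)  # ['AAPL'] — dotted and sub-$500M symbols are screened out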
Binary file not shown.
@@ -23,13 +23,13 @@ class ScorePredictor:
        self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = lgb.LGBMClassifier(
-           n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
+           n_estimators=200, # Number of boosting iterations - good balance between performance and training time
            learning_rate=0.005, # Smaller learning rate for better generalization
-           max_depth=12, # Controlled depth to prevent overfitting
-           num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
+           max_depth=5, # Controlled depth to prevent overfitting
+           num_leaves=2**5-1, # Just under 2^max_depth, limits complexity to prevent overfitting
            colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
            subsample=0.8, # Use 80% of data per tree to reduce overfitting
-           min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
+           min_child_samples=5, # Minimum samples per leaf to ensure reliable splits
            random_state=42, # For reproducibility
            reg_alpha=0.1, # L1 regularization
            reg_lambda=0.1, # L2 regularization
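A minimal runnable sketch of the reduced configuration above; only the hyperparameters come from the diff, while the toy dataset stands in for the real feature matrix:

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 10))                 # toy feature matrix
y = (X[:, 0] + rng.normal(size=500) > 0) * 1   # toy binary target

model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.005,
    max_depth=5,
    num_leaves=2**5 - 1,   # 31 leaves: just under 2^max_depth, so depth stays the binding limit
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_samples=5,
    random_state=42,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
model.fit(X, y)
print(model.predict_proba(X[:3]))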
Binary file not shown.
@@ -103,7 +103,7 @@ def generate_ta_features(df):
    df_features = df_features.dropna()
    return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50,200], price_col='close',
                                  high_col='high', low_col='low', volume_col='volume'):
    """
    Generate comprehensive statistical features for financial time series data.
@@ -160,23 +160,7 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
        df_features[f'zscore_{window}'] = (
            (df[price_col] - df[price_col].rolling(window=window).mean()) /
            df[price_col].rolling(window=window).std())
-       df_features[f'norm_price_{window}'] = (
-           df[price_col] / df[price_col].rolling(window=window).mean() - 1)
-
-       # Correlation features
-       if volume_col in df.columns:
-           df_features[f'volume_price_corr_{window}'] = (
-               df[price_col].rolling(window=window).corr(df[volume_col]))
-       df_features[f'high_low_corr_{window}'] = (
-           df[high_col].rolling(window=window).corr(df[low_col]))
-
-
-       # Quantile features
-       for q in [0.25, 0.75]:
-           df_features[f'price_q{int(q*100)}_{window}'] = (
-               df[price_col].rolling(window=window).quantile(q))
 
    # Price dynamics
    df_features['price_acceleration'] = df[price_col].diff().diff()
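For context on the features that survive the cut, a small self-contained sketch of the rolling z-score and price-acceleration computations; the price series is synthetic, and the window matches the smaller of the new default windows [50, 200]:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'close': 100 + rng.normal(size=300).cumsum()})

window = 50
rolling_mean = df['close'].rolling(window=window).mean()
rolling_std = df['close'].rolling(window=window).std()

features = pd.DataFrame(index=df.index)
# How many rolling standard deviations the price sits from its rolling mean
features[f'zscore_{window}'] = (df['close'] - rolling_mean) / rolling_std
# Second difference of price: the change in the day-to-day change
features['price_acceleration'] = df['close'].diff().diff()

print(features.dropna().head())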