reduce features
This commit is contained in:
parent 8521a4a404
commit 60cd644afa
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
         #Threshold of enough datapoints needed!
         if len(ratios) < 50:
+            print('Not enough data points')
             return
 
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Merge the data based on 'date'
         for entries in zip(ratios,key_metrics,income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
             for entry in entries:
-                date = entry['date']
-                for key, value in entry.items():
-                    if key not in combined_data[date]:
-                        combined_data[date][key] = value
+                try:
+                    date = entry['date']
+                    for key, value in entry.items():
+                        if key not in combined_data[date]:
+                            combined_data[date][key] = value
+                except:
+                    pass
 
         combined_data = list(combined_data.values())
 
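A self-contained sketch of the date-keyed merge this hunk wraps in try/except, assuming combined_data behaves like a defaultdict(dict); the sample lists below are invented stand-ins for the real ratios/key_metrics responses:

from collections import defaultdict

# Invented sample data standing in for the downloaded statement lists.
ratios = [{'date': '2024-03-31', 'currentRatio': 1.2}]
key_metrics = [{'date': '2024-03-31', 'marketCap': 3_000_000_000}]

combined_data = defaultdict(dict)
for entries in zip(ratios, key_metrics):
    for entry in entries:
        try:
            date = entry['date']
            for key, value in entry.items():
                if key not in combined_data[date]:
                    combined_data[date][key] = value
        except (KeyError, TypeError):
            pass  # skip malformed entries instead of aborting the ticker

combined_data = list(combined_data.values())
print(combined_data)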
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         fundamental_columns = [
             'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
             'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-            'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-            'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-            'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-            'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-            'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-            'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-            'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+            'operatingCashFlow','cashAndCashEquivalents', 'totalEquity','otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+            'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets',
         ]
 
         # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Compute combinations for each group of columns
         compute_column_ratios(fundamental_columns, df_combined, new_columns)
         compute_column_ratios(stats_columns, df_combined, new_columns)
-        compute_column_ratios(ta_columns, df_combined, new_columns)
+        #compute_column_ratios(ta_columns, df_combined, new_columns)
 
         # Concatenate the new ratio columns with the original DataFrame
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
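compute_column_ratios is not defined in this diff; the snippet below is a hypothetical sketch of a helper with the same calling convention (column list, source DataFrame, dict collecting the new columns), not the repository's implementation:

from itertools import combinations
import numpy as np
import pandas as pd

def compute_column_ratios(columns, df, new_columns):
    # Hypothetical: one ratio feature per column pair, guarding against division by zero.
    for col_a, col_b in combinations(columns, 2):
        if col_a in df.columns and col_b in df.columns:
            new_columns[f'{col_a}_to_{col_b}'] = df[col_a] / df[col_b].replace(0, np.nan)

df_combined = pd.DataFrame({'revenue': [10.0, 12.0], 'netIncome': [2.0, 3.0]})
new_columns = {}
compute_column_ratios(['revenue', 'netIncome'], df_combined, new_columns)
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
print(df_combined)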
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         if not df_copy.empty:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
-
+        print(df_copy)
         return df_copy
 
     except Exception as e:
@@ -324,15 +323,14 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
             data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
             # Check if the evaluation data meets the criteria
-            '''
             if (data['precision'] >= 50 and data['accuracy'] >= 50 and
                 data['accuracy'] < 100 and data['precision'] < 100 and
                 data['f1_score'] >= 50 and data['recall_score'] >= 50 and
                 data['roc_auc_score'] >= 50):
-            '''
-            # Save the evaluation data to a JSON file
-            await save_json(ticker, data)
-            print(f"Saved results for {ticker}")
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
 
         except Exception as e:
             print(e)
             pass
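With the triple-quote markers removed, the metric gate is active again; a toy illustration of the same check against an invented metrics dict:

# Invented values for illustration only.
data = {'precision': 62.0, 'accuracy': 58.0, 'f1_score': 55.0,
        'recall_score': 51.0, 'roc_auc_score': 60.0}

if (data['precision'] >= 50 and data['accuracy'] >= 50 and
        data['accuracy'] < 100 and data['precision'] < 100 and
        data['f1_score'] >= 50 and data['recall_score'] >= 50 and
        data['roc_auc_score'] >= 50):
    print('Metrics pass the gate; results would be saved.')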
@@ -340,7 +338,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
 
 async def warm_start_training(tickers, con, skip_downloading):
 
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
         WHERE marketCap >= 500E6
         AND symbol NOT LIKE '%.%'
         AND symbol NOT LIKE '%-%'
-        ORDER BY marketCap DESC;
     """)
-    warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
+    warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
 
     print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
     await warm_start_training(warm_start_symbols, con, skip_downloading)
Binary file not shown.
@@ -23,13 +23,13 @@ class ScorePredictor:
         self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
         self.model = lgb.LGBMClassifier(
-            n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
+            n_estimators=200, # Number of boosting iterations - good balance between performance and training time
             learning_rate=0.005, # Smaller learning rate for better generalization
-            max_depth=12, # Controlled depth to prevent overfitting
-            num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
+            max_depth=5, # Controlled depth to prevent overfitting
+            num_leaves=2**5-1, # 2^max_depth, prevents overfitting while maintaining model complexity
             colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
             subsample=0.8, # Use 80% of data per tree to reduce overfitting
-            min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
+            min_child_samples=5, # Minimum samples per leaf to ensure reliable splits
             random_state=42, # For reproducibility
             reg_alpha=0.1, # L1 regularization
             reg_lambda=0.1, # L2 regularization
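The reduced configuration can be sanity-checked in isolation; the synthetic data and fit call below are illustrative only, while the hyperparameters mirror the diff (num_leaves=2**5-1 keeps the leaf count at the usual 2^max_depth - 1 cap for max_depth=5):

import lightgbm as lgb
import numpy as np

# Synthetic data for a quick smoke test of the smaller configuration.
rng = np.random.default_rng(42)
X = rng.random((500, 10))
y = (X[:, 0] > 0.5).astype(int)

model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.005,
    max_depth=5,
    num_leaves=2**5 - 1,      # 31 leaves, consistent with max_depth=5
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_samples=5,
    random_state=42,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
model.fit(X, y)
print(model.score(X, y))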
Binary file not shown.
@@ -103,7 +103,7 @@ def generate_ta_features(df):
     df_features = df_features.dropna()
     return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50,200], price_col='close',
                                   high_col='high', low_col='low', volume_col='volume'):
     """
     Generate comprehensive statistical features for financial time series data.
||||||
@ -160,24 +160,8 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
|
|||||||
df_features[f'zscore_{window}'] = (
|
df_features[f'zscore_{window}'] = (
|
||||||
(df[price_col] - df[price_col].rolling(window=window).mean()) /
|
(df[price_col] - df[price_col].rolling(window=window).mean()) /
|
||||||
df[price_col].rolling(window=window).std())
|
df[price_col].rolling(window=window).std())
|
||||||
df_features[f'norm_price_{window}'] = (
|
|
||||||
df[price_col] / df[price_col].rolling(window=window).mean() - 1)
|
|
||||||
|
|
||||||
|
|
||||||
# Correlation features
|
|
||||||
if volume_col in df.columns:
|
|
||||||
df_features[f'volume_price_corr_{window}'] = (
|
|
||||||
df[price_col].rolling(window=window).corr(df[volume_col]))
|
|
||||||
df_features[f'high_low_corr_{window}'] = (
|
|
||||||
df[high_col].rolling(window=window).corr(df[low_col]))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Quantile features
|
|
||||||
for q in [0.25, 0.75]:
|
|
||||||
df_features[f'price_q{int(q*100)}_{window}'] = (
|
|
||||||
df[price_col].rolling(window=window).quantile(q))
|
|
||||||
|
|
||||||
# Price dynamics
|
# Price dynamics
|
||||||
df_features['price_acceleration'] = df[price_col].diff().diff()
|
df_features['price_acceleration'] = df[price_col].diff().diff()
|
||||||
df_features['momentum_change'] = df[price_col].pct_change().diff()
|
df_features['momentum_change'] = df[price_col].pct_change().diff()
|
||||||
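After the removals, only the rolling z-score survives inside the window loop (plus the price-dynamics features below it); a minimal reproduction with a synthetic price series, using the new default windows=[50,200]:

import numpy as np
import pandas as pd

# Synthetic close prices standing in for the real OHLCV frame.
df = pd.DataFrame({'close': np.cumsum(np.random.randn(300)) + 100})
df_features = pd.DataFrame(index=df.index)

for window in [50, 200]:  # new default windows=[50,200]
    rolling_mean = df['close'].rolling(window=window).mean()
    rolling_std = df['close'].rolling(window=window).std()
    df_features[f'zscore_{window}'] = (df['close'] - rolling_mean) / rolling_std

# Price dynamics computed outside the window loop, unchanged by this commit
df_features['price_acceleration'] = df['close'].diff().diff()
df_features['momentum_change'] = df['close'].pct_change().diff()
print(df_features.dropna().head())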