From 60cd644afadb43d0d576f5bb88c0f53833d532de Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Sun, 6 Oct 2024 00:26:22 +0200
Subject: [PATCH] reduce features

---
 app/cron_ai_score.py                        | 43 ++++++++----------
 .../__pycache__/score_model.cpython-310.pyc | Bin 4004 -> 3999 bytes
 app/ml_models/score_model.py                |  8 ++--
 .../feature_engineering.cpython-310.pyc     | Bin 6816 -> 6496 bytes
 app/utils/feature_engineering.py            | 18 +-------
 5 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index bbeaeae..563a355 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
     #Threshold of enough datapoints needed!
     if len(ratios) < 50:
+        print('Not enough data points')
         return
 
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     # Merge the data based on 'date'
     for entries in zip(ratios, key_metrics, income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
         for entry in entries:
-            date = entry['date']
-            for key, value in entry.items():
-                if key not in combined_data[date]:
-                    combined_data[date][key] = value
+            try:
+                date = entry['date']
+                for key, value in entry.items():
+                    if key not in combined_data[date]:
+                        combined_data[date][key] = value
+            except:
+                pass
 
     combined_data = list(combined_data.values())
 
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         fundamental_columns = [
            'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
            'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-           'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-           'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-           'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-           'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-           'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-           'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-           'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+           'operatingCashFlow', 'cashAndCashEquivalents', 'totalEquity', 'otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+           'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments', 'totalAssets',
         ]
 
         # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Compute combinations for each group of columns
         compute_column_ratios(fundamental_columns, df_combined, new_columns)
         compute_column_ratios(stats_columns, df_combined, new_columns)
-        compute_column_ratios(ta_columns, df_combined, new_columns)
+        #compute_column_ratios(ta_columns, df_combined, new_columns)
 
         # Concatenate the new ratio columns with the original DataFrame
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         if not df_copy.empty:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
-
+        print(df_copy)
         return df_copy
 
     except Exception as e:
@@ -270,7 +269,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
         tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
-
+
         train_list = []
 
         for ticker, df in zip(chunk, chunk_results):
@@ -324,23 +323,22 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
             data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
             # Check if the evaluation data meets the criteria
-            '''
+
             if (data['precision'] >= 50 and data['accuracy'] >= 50 and
                 data['accuracy'] < 100 and data['precision'] < 100 and
                 data['f1_score'] >= 50 and data['recall_score'] >= 50 and
                 data['roc_auc_score'] >= 50):
-            '''
-            # Save the evaluation data to a JSON file
-            await save_json(ticker, data)
-            print(f"Saved results for {ticker}")
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
 
         except Exception as e:
             print(e)
             pass
-
+
 
 async def warm_start_training(tickers, con, skip_downloading):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
            WHERE marketCap >= 500E6
            AND symbol NOT LIKE '%.%'
            AND symbol NOT LIKE '%-%'
-           ORDER BY marketCap DESC;
        """)
-        warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
+        warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
        print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
        await warm_start_training(warm_start_symbols, con, skip_downloading)
 
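Note on the feature reduction above: compute_column_ratios (defined in cron_ai_score.py just above these hunks, not itself touched by the patch) combines the columns of each group pairwise, so the number of generated ratio features grows as n*(n-1)/2. The old 42-entry fundamental_columns list therefore implied up to 861 ratio features, while the trimmed 21-entry list implies 210, and commenting out the ta_columns call drops that group entirely. A minimal sketch of such pairwise-ratio generation, assuming the helper simply divides each column by every other (the repository's actual implementation is not shown in this patch):

    from itertools import combinations

    import numpy as np

    def compute_column_ratios(columns, df, new_columns):
        # Sketch only: one ratio feature per unordered column pair;
        # zero denominators become NaN rather than inf.
        for a, b in combinations(columns, 2):
            if a in df.columns and b in df.columns:
                new_columns[f'{a}_to_{b}'] = df[a] / df[b].replace(0, np.nan)

Here new_columns is the plain dict that the hunk above concatenates back via pd.DataFrame(new_columns, index=df_combined.index).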
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index bfb68adbd9cd285da2a09a5da4f9d6dcdb86c545..9846f133fe714b30d3128b6280d2cd12db8692d6 100644
GIT binary patch
[delta 640, delta 645, delta 764: base85-encoded binary .pyc data omitted]

diff --git a/app/utils/feature_engineering.py b/app/utils/feature_engineering.py
index 46f72ca..aa38e2a 100644
--- a/app/utils/feature_engineering.py
+++ b/app/utils/feature_engineering.py
@@ -103,7 +103,7 @@ def generate_ta_features(df):
     df_features = df_features.dropna()
     return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50, 200], price_col='close',
                                   high_col='high', low_col='low', volume_col='volume'):
     """
     Generate comprehensive statistical features for financial time series data.
@@ -160,23 +160,7 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
         df_features[f'zscore_{window}'] = (
             (df[price_col] - df[price_col].rolling(window=window).mean()) /
             df[price_col].rolling(window=window).std())
-        df_features[f'norm_price_{window}'] = (
-            df[price_col] / df[price_col].rolling(window=window).mean() - 1)
-
-        # Correlation features
-        if volume_col in df.columns:
-            df_features[f'volume_price_corr_{window}'] = (
-                df[price_col].rolling(window=window).corr(df[volume_col]))
-        df_features[f'high_low_corr_{window}'] = (
-            df[high_col].rolling(window=window).corr(df[low_col]))
-
-
-
-        # Quantile features
-        for q in [0.25, 0.75]:
-            df_features[f'price_q{int(q*100)}_{window}'] = (
-                df[price_col].rolling(window=window).quantile(q))
 
     # Price dynamics
     df_features['price_acceleration'] = df[price_col].diff().diff()
 
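After this hunk, the only per-window statistical feature left is the rolling z-score, now computed over 50- and 200-bar windows instead of 20 and 50; the normalized-price, correlation, and quantile features are dropped. A self-contained sketch of the surviving computation (the standalone function name and the usage lines are illustrative, not from the repository):

    import pandas as pd

    def rolling_zscore(price: pd.Series, window: int) -> pd.Series:
        # Distance of the current price from its rolling mean,
        # expressed in units of rolling standard deviation.
        mean = price.rolling(window=window).mean()
        std = price.rolling(window=window).std()
        return (price - mean) / std

    # Mirrors the zscore_{window} columns for the new windows=[50, 200]:
    # df_features['zscore_50'] = rolling_zscore(df['close'], 50)
    # df_features['zscore_200'] = rolling_zscore(df['close'], 200)

The longer windows trade responsiveness for stability and leave more leading NaN rows: the 200-bar z-score is undefined for the first 199 observations of each series.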