From 60cd644afadb43d0d576f5bb88c0f53833d532de Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Sun, 6 Oct 2024 00:26:22 +0200
Subject: [PATCH] reduce features

---
 app/cron_ai_score.py                        | 43 ++++++++----------
 .../__pycache__/score_model.cpython-310.pyc | Bin 4004 -> 3999 bytes
 app/ml_models/score_model.py                |  8 ++--
 .../feature_engineering.cpython-310.pyc     | Bin 6816 -> 6496 bytes
 app/utils/feature_engineering.py            | 18 +-------
 5 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index bbeaeae..563a355 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
     #Threshold of enough datapoints needed!
     if len(ratios) < 50:
+        print('Not enough data points')
         return
 
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
     # Merge the data based on 'date'
     for entries in zip(ratios, key_metrics, income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
         for entry in entries:
-            date = entry['date']
-            for key, value in entry.items():
-                if key not in combined_data[date]:
-                    combined_data[date][key] = value
+            try:
+                date = entry['date']
+                for key, value in entry.items():
+                    if key not in combined_data[date]:
+                        combined_data[date][key] = value
+            except:
+                pass
 
     combined_data = list(combined_data.values())
 
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         fundamental_columns = [
            'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
            'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-           'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-           'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-           'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-           'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-           'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-           'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-           'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+           'operatingCashFlow', 'cashAndCashEquivalents', 'totalEquity', 'otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+           'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments', 'totalAssets',
         ]
 
         # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Compute combinations for each group of columns
         compute_column_ratios(fundamental_columns, df_combined, new_columns)
         compute_column_ratios(stats_columns, df_combined, new_columns)
-        compute_column_ratios(ta_columns, df_combined, new_columns)
+        #compute_column_ratios(ta_columns, df_combined, new_columns)
 
         # Concatenate the new ratio columns with the original DataFrame
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         if not df_copy.empty:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
-
+        print(df_copy)
         return df_copy
 
     except Exception as e:
@@ -270,7 +269,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
         tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
-
+
         train_list = []
 
         for ticker, df in zip(chunk, chunk_results):
@@ -324,23 +323,22 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
             data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
             # Check if the evaluation data meets the criteria
-            '''
+
             if (data['precision'] >= 50 and data['accuracy'] >= 50 and
                 data['accuracy'] < 100 and data['precision'] < 100 and
                 data['f1_score'] >= 50 and data['recall_score'] >= 50 and
                 data['roc_auc_score'] >= 50):
-            '''
-            # Save the evaluation data to a JSON file
-            await save_json(ticker, data)
-            print(f"Saved results for {ticker}")
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
 
         except Exception as e:
             print(e)
             pass
-
+
 
 async def warm_start_training(tickers, con, skip_downloading):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
            WHERE marketCap >= 500E6
            AND symbol NOT LIKE '%.%'
            AND symbol NOT LIKE '%-%'
-           ORDER BY marketCap DESC;
        """)
-        warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
+        warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
        print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
        await warm_start_training(warm_start_symbols, con, skip_downloading)
 
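Note on the feature reduction above: compute_column_ratios (defined in cron_ai_score.py just above these hunks, not itself touched by the patch) combines the columns of each group pairwise, so the number of generated ratio features grows as n*(n-1)/2. The old 42-entry fundamental_columns list therefore implied up to 861 ratio features, while the trimmed 21-entry list implies 210, and commenting out the ta_columns call drops that group entirely. A minimal sketch of such pairwise-ratio generation, assuming the helper simply divides each column by every other (the repository's actual implementation is not shown in this patch):

    from itertools import combinations

    import numpy as np

    def compute_column_ratios(columns, df, new_columns):
        # Sketch only: one ratio feature per unordered column pair;
        # zero denominators become NaN rather than inf.
        for a, b in combinations(columns, 2):
            if a in df.columns and b in df.columns:
                new_columns[f'{a}_to_{b}'] = df[a] / df[b].replace(0, np.nan)

Here new_columns is the plain dict that the hunk above concatenates back via pd.DataFrame(new_columns, index=df_combined.index).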
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index bfb68adbd9cd285da2a09a5da4f9d6dcdb86c545..9846f133fe714b30d3128b6280d2cd12db8692d6 100644
GIT binary patch
[delta 640, delta 645, delta 764: base85-encoded binary .pyc data omitted]

diff --git a/app/utils/feature_engineering.py b/app/utils/feature_engineering.py
index 46f72ca..aa38e2a 100644
--- a/app/utils/feature_engineering.py
+++ b/app/utils/feature_engineering.py
@@ -103,7 +103,7 @@ def generate_ta_features(df):
     df_features = df_features.dropna()
     return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50, 200], price_col='close',
                                   high_col='high', low_col='low', volume_col='volume'):
     """
     Generate comprehensive statistical features for financial time series data.
@@ -160,23 +160,7 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
         df_features[f'zscore_{window}'] = (
             (df[price_col] - df[price_col].rolling(window=window).mean()) /
             df[price_col].rolling(window=window).std())
-        df_features[f'norm_price_{window}'] = (
-            df[price_col] / df[price_col].rolling(window=window).mean() - 1)
-
-        # Correlation features
-        if volume_col in df.columns:
-            df_features[f'volume_price_corr_{window}'] = (
-                df[price_col].rolling(window=window).corr(df[volume_col]))
-        df_features[f'high_low_corr_{window}'] = (
-            df[high_col].rolling(window=window).corr(df[low_col]))
-
-
-
-        # Quantile features
-        for q in [0.25, 0.75]:
-            df_features[f'price_q{int(q*100)}_{window}'] = (
-                df[price_col].rolling(window=window).quantile(q))
 
     # Price dynamics
     df_features['price_acceleration'] = df[price_col].diff().diff()
 
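After this hunk, the only per-window statistical feature left is the rolling z-score, now computed over 50- and 200-bar windows instead of 20 and 50; the normalized-price, correlation, and quantile features are dropped. A self-contained sketch of the surviving computation (the standalone function name and the usage lines are illustrative, not from the repository):

    import pandas as pd

    def rolling_zscore(price: pd.Series, window: int) -> pd.Series:
        # Distance of the current price from its rolling mean,
        # expressed in units of rolling standard deviation.
        mean = price.rolling(window=window).mean()
        std = price.rolling(window=window).std()
        return (price - mean) / std

    # Mirrors the zscore_{window} columns for the new windows=[50, 200]:
    # df_features['zscore_50'] = rolling_zscore(df['close'], 50)
    # df_features['zscore_200'] = rolling_zscore(df['close'], 200)

The longer windows trade responsiveness for stability and leave more leading NaN rows: the 200-bar z-score is undefined for the first 199 observations of each series.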