From 3b70c93d284453594d67ffa94c845ff70ec9ce00 Mon Sep 17 00:00:00 2001 From: MuslemRahimi Date: Mon, 30 Sep 2024 11:52:07 +0200 Subject: [PATCH] update ai model --- app/cron_ai_score.py | 92 +++++------- app/main.py | 2 +- .../fundamental_predictor.cpython-310.pyc | Bin 4874 -> 5005 bytes .../__pycache__/score_model.cpython-310.pyc | Bin 0 -> 4941 bytes app/ml_models/fundamental_predictor.py | 74 +++++----- app/ml_models/score_model.py | 137 ++++++++++++++++++ 6 files changed, 218 insertions(+), 87 deletions(-) create mode 100644 app/ml_models/__pycache__/score_model.cpython-310.pyc create mode 100644 app/ml_models/score_model.py diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py index 41914f2..f039203 100644 --- a/app/cron_ai_score.py +++ b/app/cron_ai_score.py @@ -4,7 +4,7 @@ import aiohttp import aiofiles import sqlite3 from datetime import datetime -from ml_models.fundamental_predictor import FundamentalPredictor +from ml_models.score_model import ScorePredictor import yfinance as yf from collections import defaultdict import pandas as pd @@ -22,7 +22,7 @@ import gc gc.enable() async def save_json(symbol, data): - with open(f"json/fundamental-predictor-analysis/{symbol}.json", 'wb') as file: + with open(f"json/ai-score/{symbol}.json", 'wb') as file: file.write(orjson.dumps(data)) @@ -31,11 +31,6 @@ def trend_intensity(close, window=20): std = close.rolling(window=window).std() return ((close - ma) / std).abs().rolling(window=window).mean() -def fisher_transform(high, low, window=10): - value = (high + low) / 2 - norm_value = (2 * ((value - value.rolling(window=window).min()) / - (value.rolling(window=window).max() - value.rolling(window=window).min())) - 1) - return 0.5 * np.log((1 + norm_value) / (1 - norm_value)) def calculate_fdi(high, low, close, window=30): n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) - @@ -185,8 +180,6 @@ async def download_data(ticker, con, start_date, end_date): df['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std() df['fdi'] = calculate_fdi(df['high'], df['low'], df['close']) - #df['hurst'] = df['close'].rolling(window=100).apply(hurst_exponent) - df['fisher'] = fisher_transform(df['high'], df['low']) df['tii'] = trend_intensity(df['close']) @@ -196,7 +189,7 @@ async def download_data(ticker, con, start_date, end_date): 'adi', 'cmf', 'emv', 'fi', 'williams', 'stoch','sma_crossover', 'volatility','daily_return','cumulative_return', 'roc','avg_volume_30d', 'rolling_rsi','rolling_stoch_rsi', 'ema_crossover','ichimoku_a','ichimoku_b', - 'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi','fisher' + 'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi' ] # Match each combined data entry with the closest available stock price in df @@ -229,7 +222,6 @@ async def download_data(ticker, con, start_date, end_date): combined_data = sorted(combined_data, key=lambda x: x['date']) # Convert combined data into a DataFrame df_combined = pd.DataFrame(combined_data).dropna() - key_elements = [ 'revenue', 'costOfRevenue', @@ -275,37 +267,30 @@ async def download_data(ticker, con, start_date, end_date): 'propertyPlantEquipmentNet', 'ownersEarnings', ] - # Compute ratios for all combinations of key elements - + + new_columns = {} + + # Loop over combinations of column pairs for num, denom in combinations(key_elements, 2): - # Compute ratio num/denom + # Compute ratio and reverse ratio + ratio = df_combined[num] / df_combined[denom] + reverse_ratio = df_combined[denom] / df_combined[num] + + # Define column names for both ratios column_name = f'{num}_to_{denom}' - try: - ratio = df_combined[num] / df_combined[denom] - # Check for valid ratio - df_combined[column_name] = np.where((ratio != 0) & - (ratio != np.inf) & - (ratio != -np.inf) & - (~np.isnan(ratio)), - ratio, 0) - except Exception as e: - print(f"Error calculating {column_name}: {e}") - df_combined[column_name] = 0 - - # Compute reverse ratio denom/num reverse_column_name = f'{denom}_to_{num}' - try: - reverse_ratio = df_combined[denom] / df_combined[num] - # Check for valid reverse ratio - df_combined[reverse_column_name] = np.where((reverse_ratio != 0) & - (reverse_ratio != np.inf) & - (reverse_ratio != -np.inf) & - (~np.isnan(reverse_ratio)), - reverse_ratio, 0) - except Exception as e: - print(f"Error calculating {reverse_column_name}: {e}") - df_combined[reverse_column_name] = 0 + + # Store the new columns in the dictionary, replacing invalid values with 0 + new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0) + new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0) + + # Add all new columns to the original DataFrame at once + df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1) + + # To defragment the DataFrame, make a copy + df_combined = df_combined.copy() + # Create 'Target' column based on price change df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int) @@ -314,7 +299,7 @@ async def download_data(ticker, con, start_date, end_date): df_combined = df_combined.dropna() df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0) df_copy = df_combined.copy() - #print(df_copy[['date','revenue','ownersEarnings','revenuePerShare']]) + return df_copy except Exception as e: @@ -327,14 +312,14 @@ async def process_symbol(ticker, con, start_date, end_date): test_size = 0.2 start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") - predictor = FundamentalPredictor() + predictor = ScorePredictor() df = await download_data(ticker, con, start_date, end_date) split_size = int(len(df) * (1-test_size)) test_data = df.iloc[split_size:] best_features = [col for col in df.columns if col not in ['date','price','Target']] data, prediction_list = predictor.evaluate_model(test_data[best_features], test_data['Target']) - + print(data) ''' output_list = [{'date': date, 'price': price, 'prediction': prediction, 'target': target} for (date, price,target), prediction in zip(test_data[['date', 'price','Target']].iloc[-6:].values, prediction_list[-6:])] @@ -380,19 +365,19 @@ async def train_process(tickers, con): print('======Train Set Datapoints======') print(len(df_train)) - predictor = FundamentalPredictor() + predictor = ScorePredictor() #print(selected_features) selected_features = [col for col in df_train if col not in ['price','date','Target']] - best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=100) - print(best_features) - predictor.train_model(df_train[best_features], df_train['Target']) + #best_features = predictor.feature_selection(df_train[selected_features], df_train['Target'],k=5) + #print(best_features) + predictor.train_model(df_train[selected_features], df_train['Target']) predictor.evaluate_model(df_test[best_features], df_test['Target']) async def test_process(con): test_size = 0.2 start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") - predictor = FundamentalPredictor() + predictor = ScorePredictor() df = await download_data('GME', con, start_date, end_date) split_size = int(len(df) * (1-test_size)) test_data = df.iloc[split_size:] @@ -405,25 +390,26 @@ async def run(): #Train first model con = sqlite3.connect('stocks.db') - + cursor = con.cursor() cursor.execute("PRAGMA journal_mode = wal") cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%'") - stock_symbols = [row[0] for row in cursor.fetchall()] #['AAPL','GME','LLY','NVDA'] # + stock_symbols = ['DHR','ABT','TXN','LIN','RIO','FCX','ECL','NVO','GOOGL','NFLX','SAP','UNH','JNJ','ABBV','MRK','PLD','NEE','DUK','AMT','EQIX','META','DOV','NWN','PG','PH','MMM','AWR','YYAI','PPSI','VYX','XP','BWXT','OLED','ROIC','NKE','LMT','PAYX','GME','AMD','AAPL','NVDA','PLTR'] #[row[0] for row in cursor.fetchall()] + stock_symbols = list(set(stock_symbols)) print('Number of Stocks') print(len(stock_symbols)) - await train_process(stock_symbols, con) - + #await train_process(stock_symbols, con) + #Prediction Steps for all stock symbols - ''' + cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9") stock_symbols = [row[0] for row in cursor.fetchall()] total_symbols = ['GME'] #stock_symbols print(f"Total tickers: {len(total_symbols)}") - start_date = datetime(2000, 1, 1).strftime("%Y-%m-%d") + start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") chunk_size = len(total_symbols)# // 100 # Divide the list into N chunks @@ -434,7 +420,7 @@ async def run(): tasks.append(process_symbol(ticker, con, start_date, end_date)) await asyncio.gather(*tasks) - ''' + con.close() try: diff --git a/app/main.py b/app/main.py index 3a81efd..952050e 100755 --- a/app/main.py +++ b/app/main.py @@ -3950,7 +3950,7 @@ async def get_fomc_impact(api_key: str = Security(get_api_key)): compressed_data = gzip.compress(data) redis_client.set(cache_key, compressed_data) - redis_client.expire(cache_key,3600*3600) + redis_client.expire(cache_key,5*60) return StreamingResponse( io.BytesIO(compressed_data), diff --git a/app/ml_models/__pycache__/fundamental_predictor.cpython-310.pyc b/app/ml_models/__pycache__/fundamental_predictor.cpython-310.pyc index 64451da40b035ac7bf72f6fa8b51b28eb6f75824..ab98be12388f44aff2e7037d16e2dfc22368adcd 100644 GIT binary patch delta 1912 zcmZWpO>h)N6rP^hpV{B+E(u8x$RA0VBpU)rAb%vKAjkm}u!I`Gb=aOPBRxAy&uj{) z4Z#YiWw{6F@}Q@1lba>wLA`kOV0p00!<09zvM4WbqrBjIy$~pzsrmZ7e(&}B>G$Td zq2_Sb%Vd%YJWp!>E*CrQWVe&*ozdGVx7+L{3RP%=-TAS5&FwS$+WQFBpY&v&0$HW*lyP_4R^#GaYxNjxtC_0?wC0y=?p8n`l zr)>X{RkOth>SX8Eqs+T#vDa5@ZdL4eJdZim%HG2d?y48aTY<8!G?Wr)C>!cMq%^9W zR_>D%MuexKvSG|GTg>KR>SEntwB(}8Fe(>=gc09n%l8#t6bB;j$8?e<*LR4IQXS%( zNXMEE_HphfpDfPeHXRUww(s&r60X`G+8ANesgG zAw3Nc#(i%&aILFAI2G4(sG){22OGi<<6b>L;_uc`-!E0b0-i^3W|(2N#j6Eqh7z{| zJJid}sZ~Ahif>~JYZ=}TiadwVjxYcas%#(Mid-i`0U?jjh0uoZ8bBB?dv48PwxRPe z+?>Hpay5+Bs}6*fuz~=4*sJnc6ptb-Arzm(3?D~w1Yi(8e+VE(l3JIRCP|`eT_laK zmeX=1t0iGg!&}fI;zp`Vd=WRvZt*B?EE-@JW|p0x6mYBRFGG6wRT)%B&c1Lby@hQx zZr8Y1wtc@ut-v}d&gj?XvC2@cIB(4qp8%d8N7#(EWuAMcQ35X|<_GRskx4uz)8c7j zNk4+8X#9*gk}OUy08@E<>CaEUd-B&JOxhXX!`QM_4m{3H$+%9n#JIEZF;O1&!VaGk zUnggW-$rr=!ghq62w1*QgB)QTtrgpd@iC#N4sL%XycARb4wa*O?tzt>BB@hoj`{KP zVk7-taS=~SKo!c454RI0e7q(Xz({iG`Xnm%AdHHE%mf(|?`F>Tyy7WquUM=OJ=JpN zh+i|Oh%5TDe~_Yhnl1KAeZ1f>xxgGhz;;>u1b81NKJpyJ(d! z`>j5Z9U+^cN8}4VF${$FiT#D0299wUl~!_w`7V^&$ZSHc3iD{FjNiLfv*ED}xqcsVQt(HP421K{PbG_M1x1GI< zvzyw)loKwBhzF*FpgyIK0Uv}?uoQgpNs8d}>Vu%5eJqIRgW!CVnwSXrX60VwRx|jCPked@p`RZ$;atY zug~i9`mKI%z#5Rc1a0^7R$lT+I_Q~}=?z&!Qclrh-mo<+`7|Bz3RXe#h6r)oIxhJZ zI_iyDqms|i6W*9LCV5Oxj9cU4KA28e6Ov9^laijaPD*;pI`twVS<#m610G}@w^!c<8Q9GloY)x zacI~i2}_6qg;Bp`Q-_&2jQ9>MZzDFyHzKb`b)3L!hxo@vj{hCW)l_j-6KhT^e-R@T zpgO`Rz;$#3uj4HxPzfeV5ZP)X>Mcwn4S^WRohWIl<6S9MN4vaAq6f37?K6rU%xIs@ z0R!!`*-NN)@Qk$oxtvOxiB94T1+ot`k|4wR}!WeV>%l5JpHc7%} zh5Ek^xZRw?7x~-G4{?FJ`i#~Jb}GBTztl%2;Wx7M@bb?e-1+mDX*0qu2sw zY+eeUTPafJZoQAugIS?rqdXIz%AW!DDS&o>4ge@lsEVw_YqPI8W|ZYQi$CABzm8#C zLvan@^=)#y0;b^&=kSZhE9mo`)`m`f2L=0rY&j#=l6P&s_oEF919Z&;{@s*NuKd zj>9F87Xbo3W3$& z9j<`ppZnKHxsCzUy z<{p!IhaFEQ-N|Iios#(qdoFq2eLgwio=8r*CzDg|spPbKI+=E-lQZs_~}eX8uGLcvsq~*4fSKL>USKU`-Sq*bsa4$%`&R$Dq+?iz7ot60p zdp)`6UX=PMdn38zUXpqf>$vP*miidGl3a1GNPV1rCwbF-Q|c;v%Y93p+1u{hGQH|v zmFYX~J2IVf=WZ+Ek#ORX5>9Mt?sr3VL1`V`rJI;E zG|ZCsG9Kl{P3Gr$+>RrTn)?0QHx3k-VzkSnAkO0~_3{AiMGbkMu|dvhUmR$4o&}!Y z3kKPS9|S$_2Ww>>LpKZV$EjCfN-yuSxWM{qcjEMp|8M~#MqD%&3arcLVVOsLbs<{q zMQIWHOqh2vtiLrP%pauPULl-2JyyhBwkGVmQNHANBVoM1aPN*VZ$)Vy3F~H-KDc;G z*tdAr&Cp@|hF=6r^BGTk7O(pS_9TwoX4#_8-hB{pzY~3!Wh_oR=v2e1_H*?caTsIY z)Wv7mi}7ygCoNsnzwdLlwoqiQZuwe2dNcS z!YWcb90_Ykop3B{grkpC*k=<+8^@gyDn8^9&6)84*1q?UuHn6jH$MhZD4R-O@zlPu zt?iHoPq3goR_9v=uc2O8d5MIP#a_$?XC%z>sBr3e#M+M){u-J-nVVh8l4v&J{!*O8 zvw4vPD=E%vcF_-3qBNZKyWQD@mFJw#O7NDMneLi!JTFe;!t>71+6)LsZK&(VpWe;P zu+>xBa?skwE4TGE6xx=m6e=C&mb#%98Xae$Z{s+3)X>_}`|75;p%pmRO|`E{-NI2w zEp%=vpH6M)h1NgZt88h8m1ldco+<3Gx>v5@f{d`XrO7c3te}pIqP&6GjKZPW`bMW5 zu`--nwj15+XBO4I+Bd`ImbPIPBe?AG@CbCfZ!OpQb~q6p?Wp0gEnQXm7OwyJd`pc# z#(5AJr3uCg&QcgWV!h6fC;o8zAM1~1Kjju#JjhJ@;%$-Z}`P=oD9us1A zahwXjjHB)R=GXN61%t@^E@4V)EtS7P`V|Oa=UKZ*{D)BDPLjnT=CopR6S=Tq`=oqT z4)y2j9rVESFIrfgOe}@%dK~5{oPfpM6dpnxS&UPkuXzE_@*ExrJ_M*4IyIPnCF{ba zk|_#|CmZ0$NsN;iBQZ&0l*IE8!Va>e8?&h8 z@C#J<78R-wh1mmKb5Ze&g8X5Y^0%q_RT9@pkWrLkhO}28&f(2pg0R%8KB?E#s_N*I zYK?w+Q*Wvby^6Gk-?(mQ>(3oR{{bTm$c~2(=_X!j0Ix$Slx-ES)>n6sE(7j6I?O=X zHmGC+_P%UxTLT>tioc9bEn8@5SD2qIMF=YPE zHr{2}`Ba%|Qy?;JeTRasKF=jK36B7ljnGOliPQP;J0zaIA<-PP@USP3l z08^iN!?Hm-1LODed!^Mn570o`j-JG6%Mgv+e-L?##1OI+fxRO0Z62+{x-u9*24kaW zuuCzlrSl&l5l$D5B}#)xOyHQ%W-!EH%CFD{&)=`_&LQdid+EzB{`B=fuOZ~f-c}pF zDdXHlrLeFLkMVf>=c=+hjS_wp0xPqkZWb)%qPj>N!pmXzqKW0lfEkRjBs8Kz%!uMo z-m>`z)cqt0!VW)0g3MLuKr><)w%W2vB-{51-Ek`HwSiI%_DMtoE{cIi4OvD7!4{+f z>pTppk&)iN(^+1$%9Fdx??Dv!l$2PI9=NY9tD71SqZNcQZA<%w@`VgLeQgKj(BaI; zU45>!RY`IkAWvNe54#0yZ5Vw6_ixUxkGCVg=<#S89J3_)SBbae#BMz}v_PmUPq9E+ z5v}Y9t!x^i1Izfd_KCIuriXbI5M0Cw4o3DXZ51e!=SU>I6X64>*X?G!0FTgSUKeUC z^dMu8RpFG=!yn0LL7`k!=YXYxc)_*LX)+2C)5`kgLq|E-0KOLn8UPh&9($0{GAn=JejIg2-A@;&kgA##|uwIhpEwdyf#GW_>8U9o1G*(U*h2CKP zyj=OusD)hq*fR^Gj(G!uI5)A8Dxd=XG5!B`1iC39(y9`HPZ480PgFeuVvmtt^Ssrb z&&nD;OKTt^bilqCWq(5Amn8OM^4@v#^Qf4{n-ha`ut>*nE;zQMJB`M1S=+iMoV7Lx zA_X07tu3rBeYoaA>xP`JhZ8zPGhdPPa)!P~lSO>u5@3Xhv+b_&+oanMJBwMC7o31D zMu!~+RI;>q8&#ZG45uJpI)LMQE$K6I|IYacjdOHZ3o9Pb8&F*NiN)*GpeDocj0_Yx zr>l?@bXd;lyD~2stR$(Dt`gEqLW@6qlmlc|QrIafdk#W0%fa{fab7ap(Xx0iM3!Mu z21pqj$@7V+;mV%@4WFh72(4lg&M?n-n`NJsd}3zs%_oVsFy%y&b@Q{-=^S+$0geZT zk#i!&!uIpEG>9`Oakf+xUGV!jYlD2}xX?Jj= 0.5] = 1 + test_predictions[test_predictions < 0.5] = 0 + + test_precision = precision_score(y_test, test_predictions) + test_accuracy = accuracy_score(y_test, test_predictions) + + print("Test Set Metrics:") + print(f"Precision: {round(test_precision * 100)}%") + print(f"Accuracy: {round(test_accuracy * 100)}%") + + next_value_prediction = 1 if test_predictions[-1] >= 0.5 else 0 + return {'accuracy': round(test_accuracy * 100), + 'precision': round(test_precision * 100), + 'sentiment': 'Bullish' if next_value_prediction == 1 else 'Bearish'}, test_predictions + + def feature_selection(self, X_train, y_train, k=100): + print('feature selection:') + print(X_train.shape, y_train.shape) + selector = SelectKBest(score_func=f_classif, k=k) + selector.fit(X_train, y_train) + + selector.transform(X_train) + selected_features = [col for i, col in enumerate(X_train.columns) if selector.get_support()[i]] + + return selected_features