From 57a18fbf9eec498d4dc73d1c5088d2cfb5c9d970 Mon Sep 17 00:00:00 2001 From: MuslemRahimi Date: Tue, 8 Oct 2024 10:59:25 +0200 Subject: [PATCH] bugfixing --- app/cron_ai_score.py | 27 +++++---- app/cron_options_gex.py | 2 +- .../__pycache__/score_model.cpython-310.pyc | Bin 4363 -> 8601 bytes app/ml_models/score_model.py | 54 +++--------------- .../feature_engineering.cpython-310.pyc | Bin 6863 -> 6308 bytes app/utils/feature_engineering.py | 27 --------- 6 files changed, 24 insertions(+), 86 deletions(-) diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py index 565c358..d8007b6 100644 --- a/app/cron_ai_score.py +++ b/app/cron_ai_score.py @@ -80,7 +80,7 @@ def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75) selected_features.append(feature) return selected_features -async def download_data(ticker, con, start_date, end_date, skip_downloading): +async def download_data(ticker, con, start_date, end_date, skip_downloading, save_data): file_path = f"ml_models/training_data/ai-score/{ticker}.json" @@ -200,6 +200,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): 'operatingCashFlow','cashAndCashEquivalents', 'totalEquity','otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets', ] + # Function to compute combinations within a group def compute_column_ratios(columns, df, new_columns): @@ -240,7 +241,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): df_copy = df_combined.copy().map(lambda x: round(x, 2) if isinstance(x, float) else x) # Save to a file if there are rows in the DataFrame - if not df_copy.empty: + if not df_copy.empty and save_data == True: with open(file_path, 'wb') as file: file.write(orjson.dumps(df_copy.to_dict(orient='records'))) @@ -251,7 +252,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading): pass -async def chunked_gather(tickers, con, skip_downloading, chunk_size): +async def chunked_gather(tickers, con, skip_downloading, save_data, chunk_size): test_size = 0.2 start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") @@ -267,7 +268,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size): for chunk in tqdm(chunks(tickers, chunk_size)): # Create tasks for each chunk print(f"chunk size: {len(chunk)}") - tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk] + tasks = [download_data(ticker, con, start_date, end_date, skip_downloading, save_data) for ticker in chunk] # Await the results for the current chunk chunk_results = await asyncio.gather(*tasks) @@ -309,18 +310,18 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size): print(f'Overall Evaluation Metrics: {data}') -async def warm_start_training(tickers, con, skip_downloading): +async def warm_start_training(tickers, con, skip_downloading, save_data): - dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100) + dfs = await chunked_gather(tickers, con, skip_downloading, save_data, chunk_size=100) -async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading): +async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data): try: df_train = pd.DataFrame() df_test_dict = {} # Store test data for each ticker all_test_data = [] # Store all test data for overall evaluation - df = await download_data(ticker, con, start_date, end_date, skip_downloading) + df = await download_data(ticker, con, start_date, end_date, skip_downloading, save_data) split_size = int(len(df) * (1 - test_size)) df_train = df.iloc[:split_size] df_test = df.iloc[split_size:] @@ -345,22 +346,24 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, s # Save the evaluation data to a JSON file await save_json(ticker, data) print(f"Saved results for {ticker}") - except: + except Exception as e: + print(e) pass async def run(): train_mode = False # Set this to False for fine-tuning and evaluation skip_downloading = False + save_data = train_mode con = sqlite3.connect('stocks.db') cursor = con.cursor() cursor.execute("PRAGMA journal_mode = wal") if train_mode: # Warm start training - warm_start_symbols = list(set(['APO','UNM','CVS','SAVE','SIRI','EA','TTWO','NTDOY','GRC','ODP','IMAX','YUM','UPS','FI','DE','MDT','INFY','ICE','SNY','HON','BSX','C','ADP','CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO'])) + warm_start_symbols = list(set(['CB','LOW','PFE','RTX','DIS','MS','BHP','BAC','PG','BABA','ACN','TMO','LLY','XOM','JPM','UNH','COST','HD','ASML','BRK-A','BRK-B','CAT','TT','SAP','APH','CVS','NOG','DVN','COP','OXY','MRO','MU','AVGO','INTC','LRCX','PLD','AMT','JNJ','ACN','TSM','V','ORCL','MA','BAC','BA','NFLX','ADBE','IBM','GME','NKE','ANGO','PNW','SHEL','XOM','WMT','BUD','AMZN','PEP','AMD','NVDA','AWR','TM','AAPL','GOOGL','META','MSFT','LMT','TSLA','DOV','PG','KO'])) print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}') - await warm_start_training(warm_start_symbols, con, skip_downloading) + await warm_start_training(warm_start_symbols, con, skip_downloading, save_data) else: start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d") end_date = datetime.today().strftime("%Y-%m-%d") @@ -374,7 +377,7 @@ async def run(): """) stock_symbols = [row[0] for row in cursor.fetchall()] for ticker in tqdm(stock_symbols): - await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading) + await fine_tune_and_evaluate(ticker, con, start_date, end_date, test_size, skip_downloading, save_data) con.close() diff --git a/app/cron_options_gex.py b/app/cron_options_gex.py index 4383e8f..f44274e 100644 --- a/app/cron_options_gex.py +++ b/app/cron_options_gex.py @@ -367,7 +367,7 @@ etf_cursor.execute("PRAGMA journal_mode = wal") etf_cursor.execute("SELECT DISTINCT symbol FROM etfs") etf_symbols = [row[0] for row in etf_cursor.fetchall()] -total_symbols = ['SPY'] #stock_symbols + etf_symbols +total_symbols = stock_symbols + etf_symbols query_template = """ SELECT date, close,change_percent diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc index f11a41b61c63b681e062e2245b556042e16a5468..4bca0ec25617bfb740e6f33aa7f501bd3a14b023 100644 GIT binary patch literal 8601 zcmb_hTZ|;vS+09mS6`;5=f2nW`jRwqX0pU2*2d1Pv(6gO?s(Y?cG|H@HGOJkx~Fe@ z>h!L+sRpAlSO^w7krE)bQBOd?O7Mmd@WLAs61)IH;wS>7Ak4fV5Ik+1nD0MTJ(u2b z9uU>)zs~v3rB0pm|KIEQgmo%1P z?pqpj!?_)^b{hFC^N{z#>E;<|&O(!8d1&%GMyA8r_Z` z)w?{DdE|puE6uqZ=hIYyck6!8ucz5!P_OrSP`{VttGdizZFH^%Z`?rZkRxwiyYvEX zmblH@RYPW83`7uxLVA?A(hXS1>okhPmx;_p{G+EJLN)(2gSpIL=36>e!D9AXTFqh( z%iwOScgf1arG6y3?eqa_b?Rk&QJVgj@Q;=ugf`M+eOHe)--r#S^>to`&WsJEvzT@R zD~eK;dZ5!J*`mwqTQYaK)eXeWuoHE;)L)g}m2PKqoV`~yr4@y(4PL@icp2h>#%FPR z_}t1?w;isud9c-JH&!s#`gSJ__{w@v-wr!$CFu25+AY7`Wnn8?QR|RomV5W4>-&vP zL-_tQ`ijU(^ZGe`Fn{Ewm&cXrIx37WN*nmAmnZ8Nc(JCw0~6o{WU(w}E9|HdKhXIM zRBFVDm_t@(d>=J@pECbVeR=v|Zt~HpMx9E>7p3>7{nNG_jqa z9s0iXe46fli_&@Dzuga7NeQ1vhkS{|DH61thXhRu&qL6XM=pem$GcYERN6WeM=qKX zx}ql_#^(LDwxKhf8E?5TaF{q5_}iKV^M<3^(tL4k?UAV^>!Qx0UxpBxfHm(JBV!k6 zA~j+mvk8h`^MdcmhSK!W*#ZG(sDB`H<^8;+)2yrwA^NyU1PTZPsx(c zRNb{bJYM_obL#rL)x9?{w#{$<7F~a}x_1$|50l)Vt?rS1R6S`&{a(m<1vxd<(&%)g zcO$&r4?Cg}w4`&TL7pP5>*1AKz!2W;b^Ai*z8r}9)>@ah&oi0ngyL?OZ;ve{+vxQA z!e@>416^i)zupR>$oCJ7T0QCvo*B`Di@F7Sada0y&zQV>|2fEG1rGe{6Rn*5E7nFD zfy>yAbxx2o9+>`!0cSS4{BFS6=TM8EfEeWHVX3z2@Y7U4L*-{lD08IBvXH)4Jp@*# z@20Htvs6zRW(6r~ue`zqYLw>GOdo5O7pdY+h_P8Jlhh7OlG)5rn5&{q%}Jt6EtABS zn#&5Ti1KXcjdHshn_{K6v>m;cXVb8avMk=9=yHvRtWhU4orEbHkG@1>_%4Ldb}*H` zE({vQeZp!}SR-3lO=pzZ)%XlEJ352KqU1n(hS|_M(7F3HpA%V@Q8|yfyZU?D2gZ($ zXiBYe&AeY=PMRuqEY!`aQhNFn^B_xrtI~UjpdT2>O{clM%9X`TbCxuio5FnqzBe9U zofR<>K%GCw(1SnE(9&N}eo|iB+)&qltWwnM`1NkP*X_XIBWcz9ERbfs*S~prkh>K% zI-8Z3uHCv&;WS@^Y@)ez?bhJLLAFzR`$;-`(bq;}Lrmh{pfmX`3HC_^-dcx_)i7y2vw7N+(j0 zofFJSvsVvzQK?yYr%@027buf&M1F85XtaX$Rwy(5DD;DNPue}+=m(eW90=5=%sQRq(ADX&|W1)O(&P_&D>_pxP`eqfn{ zCnnbF*sdN4!uyc*Ec*YyE!;Rxxcq0(2dB~U8zcyc`7IJO^)h>O`qF@sFwFx_LZ~li z4-Ydr^CR{cFb18D?UqiCEG!X;*U}nmBVLpNYZXs2%Fq{Dx2w@0~K`K`y|;hAzC= zjWavOF#E`4KYwJht&tmNclA+L^=FMdct5;6wkSsHTBm$mpK&&=Z;m|G&W0wx3-lY> zx3jS=3Nf}2CuzZ~H5Hg{f@tMC*g~$iu(bhH<^UTf@>%V_f0JYR zRw-{%bZ~<{c0@rWfC?01lCVIu2pcL9_N3GA!H5A3#Ps*=|o@uZ71Y4&;fh2wGUlD3Y01eH8*^=H55~P6^70_)8Tg%_pDYiHa3HJU#j} zga_J~H%fX*fxoB1z^NY=ElMSZp^D4c{BHNx;6| z1f5^f0OA1f9UZ==HOKVFX=X+)D3`u%@uxvy43TO;Tg(qo8^i>pjLaz|Odc)g&=Q4a zk+h?!T{5;3vtW)CMyC0fMR_!{tAWI5Lr2WUdThYlGGZ>ypnM*L$5u~S02%vP=Aa&E z;eC7PqR++VQfw1FBx-oy8fL`_R@l*po;WE^q4ntC~HU7JC9_=bK;K09VigpkIDi)2b+@RZ)+sM8TbqEmxo7ucH!wg+ztK1_|tw$UOlR)$DU>eYq^bXCrMU%pRlB(v_K!J1JelG$RQg|ZRNzf3m zqY{csaRL;6n}mw=g#;ZaXydSqcd2FeVB}!L29Q5hS(<6r0z}KrTZ!fNUg7y7T>6*EPyWAut-2{zO{hP@Rt~+xmVf^Neo1 z36wiGfpW(@_6XQf)TM2O0=s8Zu!AE5*pl-(q7*+8>MA0Ocm@DSLCt9r#O3j~uxB&~ z-l$Y$n;u}}0;QY9(Nt^^?I4KhxTHI*L60Sj!4}g^L`IYW0&$;Z)+~06DpqUFIbIX< zBLJTQ$BbA^!O;~_g8#}3P&p3AZT$E-NTJOzDchZykBmM*W za!6Z)0}( zlO*1NxXA~UBu2-74r1@eDfJY@VCFK;SUw~AwEw9nM5(+s$giCL>^?B?U!{7bt8_kD z*+^@Lta_HOQ!hS=3W-nhe@-T+Jf7dDR6v4Uh5rVL--M7;iliMl6#sQ9dWXcjB)(4K zsN3^zkd91g!r%F~NT+=FB2v|S!Vupl?FS^(HpT-W)VF(G>`f=!kpDJSBIo1#ZZ_#z zQ#27irY%gipMQ$l(K8UP;hANVHiIN*^^)NlCyw#0yi4riMB+bZCj4ij^CP}P=aDY{ z%6I-42U4&)?C1I#f1JFByyy|%(S+T^$q06cz1j~M&S+H!J3L(Thn;Lj6C={@mO4B`B|KWksqgYtip=WPaUJ5g1&T&z6AaBG5Trf%a7>KVdUi{JaafBD&q0c z6Eq?=^FGdi2{$mV*?4Ar7Pbkh^BsLSi~Bt8bD(_l;>mb6p4&07f<^3F7KV%QOuVpd zzpL{v#*0IIq3Rg#>bI8<-U6xgJ&{V^0I77$NTs8U=h?#hPC}u62S%)L>JU0%H+O~9 zLlQqvqE6yv2%J`#IMwBEB0VVGL?on~rbym0FhT7HZR-$EP&W^_t->=oJI^c8GMDwG7L0;D3@T@Th9t%kUF zp;GnI!|lD*O*|Rbzpthzd;hk&nVf=uZ&eL;^Kbr|uJ5j9VaPR#-8R^ zklrH-vPbT{NAYpbf#APLf&hHaq!ht+G9&nkG5!!IvlwaeDUU1usj3o@q6-sT8P;`y<9*0o*_b5xSF0JrR(3*tUFHkY@ssjeb8&o?JbvHyictbk)q=heA z{8`f4;JLkf>WD)+cYyHqZB&TFJ)8i6$qEh#)3`(8=)e_G1RVouWc-`dmC<7|OPujC zF+ph#8olaL!mk|gE36*>BkG6no&P?y$dghXg3!muoXU;&i}an7)->V~)$ddN!oeap z!bp}>g996g_n$9{K!Q zPE<#}yw2ithEN8YJVuQ}dSZ)*(N?#`B08zXa!346=zX&L1=7Y+_QT0IR-_f~g9RsH z?4QvnB2)8B7ePyrKROrLCx)PfA6XeX<8&1Ks{*_jo%MFdtB4doriNo>{EE79woM|Tv6FJ`J*y*IXA8e<4TX_LBNu;_Wd#z zs;>i4Ls2?H1+_&<-o{m*rI$OMWG|IVvTDbqcoU45zFRN%MWYoh)0TqM?oV+g*rqC@ z4-U&Xg;CoywUIja@r&)=a&x1AMKqgq@hVVZuX@6T(U9x67kIBq#RRt3a4xI Vqf>Kyg@R4dw>DUhQ7)91{ukK2Y(D@1 literal 4363 zcmaJ_+ix6K8K29}?Ck8tYsYr(>5@yEt%$9NdkA$Bmj=o8Eh^Ib4spKGD^1r|fNJt#@0jUzOFG#2o4^c(=eP?EU!(~SM-Ola%zVn^$ ze6vZtZYy}2AM5>_(~9y5G5a4K%nkffQ&kkEFf~*h6<7)f zv(9XE+&Lbda8AfJJ3JYka!y63ozs%9hG(L)&RI#T;W_6e=u!(?QOjvbSv~wxbly2% zv|n&8NLwyC7bU*rT$1=@=VggsabA)5vUB;i!WwMieT7YIYfhV~%SyXBA`dOKHDNI? z^RpoGMTH>q!^~@|qW<9at;Ntw)1Vjl9Q~%2Gq1ZA#Qi;~n7!x4EQ#JoxSwWwatxn- zbIA|-E1iU=2juo`Zyj@b@zO2y(wBd<2sHIp(ht%s=-%`Dm?;gCxNV4u0r$HBF>cz$ z3Ze$y3&Vo5dRGgo&Xca|t#=Ey;dQ&~-0N=SJXEjU3F156lVyzdxoE(Un0cJ#JgISa z@g`6yd&nZ-D)W2ZdYG}Gn{~-e`yUO=4gAvU0KN<)M`J3}-d7x*>8u1?Vg@sT4OV6r zu*oXS1}?KIs{vcAjxA_F6SC_r_i3escM+bu578EWi}o_Twxa zF&HtTliL66`M>^nr*n5<^eQ-E^#|+j3iiKk(g;LS5P5J%Dk_op#AW^`lu`PwKx`9+-d8=-lyn9FNnQxPj$K0 z=6n>oQGx@N&OfI4vUJ`HuEKmn#|<15yFt_iJ8Y9kI0ui%SX5aFI%A#k@{ip zxx&AJ&M&UduOyK_A8~IbhyqxUbk}0+%6tc=_%WOJ27`SjOFQ%F<^~(Wa@`;fGS@wY zxzg7FEVZU;x~A^E)(g$TTS%-KMS9M^0izFqIY~fU`8aw7LUC!xc<+-}lwy^U9VLV>(k&Fyv zrWjEXd$F(q+$?e9^@v}=BBI`dr5R#0?Ik?obC5mP_}8gPeIUJ=1JX#$AkQHhyGVRA zCWu{RWbaKheM%cQtJ-i>P2IeB$lxP~eWCUI8s_5P254KHau@#&0on+@K;Q-eseR?3 z=KL15PzdZ(dV2rrI0;hcA5fp~0&HIRKe~>4cj;apKNR7+>F#G>hRTkTZ$qZ8YCA|Y ztEFcuznmFFF?Cz}Wn)Xv%$@R3e@4mBx{o`&gm)Xad2C%((hHAORiXD?{4mPesSMSj z_DoZeAa`t%6tijDxwOjkZH$qrth9tfe~rYYWZ=ib90c99&=;jJ@fiOqHRuQrQDW;6qG`Z`ID1~ogQNT) zj{s5LCq0 z4yzw3AIUxIqaOXOx>WNjXuY-dG;+26C#~a5r$$D zszy1|HqlLayLHWl*QR4h6e6qkDwDte+dwM=zC`SBc-0{!jj(=L< zWK7%UCxAwDZI3AFjVQ84bgA=;L{TyvQFa{B{l(t}Ky74=>snEPwz>JdP8}$+c>_Qg zDOHs1I=@XKnQZEXX|A+On6P$Z;0qmThgYfB1c8I|$de@uo~*}A=$pZyJ((vc6A23b z7bfx(;U+=MJa%3xX1esz;9kv_lLc1br(&vu3W;wV=-L<$&TSEO$?sa3v%6LPq-;I# z2{4t54pjfsSbo1oQV(HgEBLTjNkW!Vc}3qVyhjeJ>^2DFkr%G}hf_(zCo!tbJd>XnW&BYj3IoN>U_(|8qKry3NzyE@OPeXKHx%b{ew_MI z2cKB7z?3!WO(iE87ArZx1)C$zB$vOF}g#Tn;Nn=K@ c$&Hfk914sa%q&br>n4AZoWi(na;KCV0MyDLxBvhE delta 673 zcmZ`$O^ee&7=9-+O+W0{cEdK^*0t*=ErNIuK?;i~di3B;&=5M=(rq`XlT<+>J$V$A zh!+tO@DIpdd-CQ<@FENzy!s2gS>I{5ySX^NZB%BJ5W-9Y5)$fsJIsE7k6I7u zq1J$ypd=sW4>Dnvv)3ffCxEQx3F~xHSWa5&$y5E>ir`_9-Ab};l@X{;4#^fz80h#GFPG_l2K==z?kP8uMD>(`#V z$-If<%Id%thqh~t#=dFu zTMsmujrQudh#S4A-@m#;%B(<&e~X!I(jo>m2%}w+TQJ%pU7{^=3qv+@BITWueV$P diff --git a/app/utils/feature_engineering.py b/app/utils/feature_engineering.py index e8f20b5..c3eea7a 100644 --- a/app/utils/feature_engineering.py +++ b/app/utils/feature_engineering.py @@ -177,33 +177,6 @@ def generate_statistical_features(df, windows=[20,50,200], price_col='close', df_features[f'volume_skew_{window}'] = df[volume_col].rolling(window=window).skew() df_features[f'volume_kurt_{window}'] = df[volume_col].rolling(window=window).kurt() - # Price-volume correlations - df_features[f'price_volume_corr_{window}'] = ( - df[price_col].rolling(window=window) - .corr(df[volume_col])) - - # Higher-order moments of returns - returns = df[price_col].pct_change() - df_features[f'returns_skew_{window}'] = returns.rolling(window=window).skew() - df_features[f'returns_kurt_{window}'] = returns.rolling(window=window).kurt() - - # Cross-sectional statistics - df_features['price_acceleration'] = df[price_col].diff().diff() - df_features['returns_acceleration'] = df[price_col].pct_change().diff() - - # Advanced volatility estimators - df_features['parkinson_vol'] = np.sqrt( - 1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2)) - - df_features['garman_klass_vol'] = np.sqrt( - 0.5 * np.log(df[high_col]/df[low_col])**2 - - (2*np.log(2)-1) * np.log(df[price_col]/df['open'])**2 - ) - - # Dispersion measures - df_features['price_range'] = df[high_col] - df[low_col] - df_features['price_range_pct'] = df_features['price_range'] / df[price_col] - # Clean up any NaN values df_features = df_features.dropna()