From 99e00425ac6d1411cbb7691a5f2f7fd5bae3a38c Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Sat, 5 Oct 2024 11:23:48 +0200
Subject: [PATCH] Fix bugs in options cron jobs

---
 app/cron_ai_score.py                          | 40 +++++++-------
 app/cron_historical_price.py                  |  3 +-
 app/cron_options_bubble.py                    |  8 +--
 app/cron_options_gex.py                       | 13 +++--
 .../__pycache__/score_model.cpython-310.pyc   | Bin 3028 -> 3758 bytes
 app/ml_models/score_model.py                  | 49 ++++++++++++++----
 6 files changed, 70 insertions(+), 43 deletions(-)

diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index 510359f..fb747ef 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -77,14 +77,14 @@ def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75)
         selected_features.append(feature)
     return selected_features
 
-async def download_data(ticker, con, start_date, end_date):
+async def download_data(ticker, con, start_date, end_date, skip_downloading):
     file_path = f"ml_models/training_data/ai-score/{ticker}.json"
     if os.path.exists(file_path):
         with open(file_path, 'rb') as file:
             return pd.DataFrame(orjson.loads(file.read()))
-    else:
+    elif not skip_downloading:
         try:
             # Define paths to the statement files
@@ -213,7 +213,7 @@ async def download_data(ticker, con, start_date, end_date):
         combined_data = sorted(combined_data, key=lambda x: x['date'])
         # Convert combined data into a DataFrame
         df_combined = pd.DataFrame(combined_data).dropna()
-        '''
+
         fundamental_columns = [
             'revenue',
             'costOfRevenue',
@@ -262,7 +262,7 @@ async def download_data(ticker, con, start_date, end_date):
         # Compute ratios for all combinations of key elements
         new_columns = {}
         # Loop over combinations of column pairs
-        for columns in [fundamental_columns]:
+        for columns in [fundamental_columns, stats_columns, ta_columns]:
             for num, denom in combinations(columns, 2):
                 # Compute ratio and reverse ratio
                 ratio = df_combined[num] / df_combined[denom]
@@ -278,7 +278,7 @@ async def download_data(ticker, con, start_date, end_date):
 
         # Add all new columns to the original DataFrame at once
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
-        '''
+
         # To defragment the DataFrame, make a copy
         df_combined = df_combined.copy()
         df_combined = df_combined.dropna()
@@ -301,7 +301,7 @@ async def download_data(ticker, con, start_date, end_date):
         pass
 
 
-async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
+async def chunked_gather(tickers, con, start_date, end_date, skip_downloading, chunk_size=10):
     # Helper function to divide the tickers into chunks
     def chunks(lst, size):
         for i in range(0, len(lst), size):
@@ -309,9 +309,9 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
 
     results = []
 
-    for chunk in chunks(tickers, chunk_size):
+    for chunk in tqdm(chunks(tickers, chunk_size)):
         # Create tasks for each chunk
-        tasks = [download_data(ticker, con, start_date, end_date) for ticker in chunk]
+        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
         # Accumulate the results
@@ -319,14 +319,14 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
 
     return results
 
-async def warm_start_training(tickers, con):
+async def warm_start_training(tickers, con, skip_downloading):
     start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
     end_date = datetime.today().strftime("%Y-%m-%d")
     df_train = pd.DataFrame()
     df_test = pd.DataFrame()
     test_size = 0.2
-    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
+    dfs = await chunked_gather(tickers, con, start_date, end_date, skip_downloading, chunk_size=10)
 
     train_list = []
     test_list = []
@@ -359,9 +359,9 @@ async def warm_start_training(tickers, con):
 
     return predictor
 
-async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
+async def fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading):
     try:
-        df = await download_data(ticker,con, start_date, end_date)
+        df = await download_data(ticker, con, start_date, end_date, skip_downloading)
         if df is None or len(df) == 0:
             print(f"No data available for {ticker}")
             return
@@ -371,7 +371,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
         train_data = df.iloc[:split_size]
         test_data = df.iloc[split_size:]
 
-        selected_features = top_uncorrelated_features(train_data,top_n=50) #[col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
+        selected_features = [col for col in train_data if col not in ['price', 'date', 'Target']]  # top_uncorrelated_features(train_data, top_n=20)
         # Fine-tune the model
         predictor = ScorePredictor()
         predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
@@ -380,9 +380,8 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
         data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
         if len(data) != 0:
-            if data['precision'] >= 50 and data['accuracy'] >= 50 and data['accuracy'] < 100 and data['precision'] < 100:
-                res = {'score': data['score']}
-                await save_json(ticker, res)
+            if 50 <= data['precision'] < 100 and 50 <= data['accuracy'] < 100 and data['f1_score'] > 50 and data['recall_score'] > 50 and data['roc_auc_score'] > 50:
+                await save_json(ticker, data)
                 print(f"Saved results for {ticker}")
         gc.collect()
     except Exception as e:
@@ -394,16 +393,17 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
 
 async def run():
     train_mode = True  # Set this to False for fine-tuning and evaluation
+    skip_downloading = False
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
 
     if train_mode:
         # Warm start training
-        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
+        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 500E6 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
         warm_start_symbols = [row[0] for row in cursor.fetchall()]
-        print('Warm Start Training for:', warm_start_symbols)
-        predictor = await warm_start_training(warm_start_symbols, con)
+        print('Warm Start Training')
+        predictor = await warm_start_training(warm_start_symbols, con, skip_downloading)
     else:
         # Fine-tuning and evaluation for all stocks
         cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
@@ -416,7 +416,7 @@ async def run():
 
     tasks = []
     for ticker in tqdm(stock_symbols):
-        await fine_tune_and_evaluate(ticker, con, start_date, end_date)
+        await fine_tune_and_evaluate(ticker, con, start_date, end_date, skip_downloading)
 
     con.close()
diff --git a/app/cron_historical_price.py b/app/cron_historical_price.py
index c5244d1..1f1c9ac 100755
--- a/app/cron_historical_price.py
+++ b/app/cron_historical_price.py
@@ -84,6 +84,7 @@ async def get_historical_data(ticker, query_con, session):
 async def run():
     total_symbols = []
+    chunk_size = 400
     try:
         cursor = con.cursor()
         cursor.execute("PRAGMA journal_mode = wal")
@@ -130,8 +131,6 @@ try:
     start_date_max = datetime(1970, 1, 1).strftime("%Y-%m-%d")
     end_date = end_date.strftime("%Y-%m-%d")
 
-
-    chunk_size = 400
 asyncio.run(run())
 con.close()
 etf_con.close()
diff --git a/app/cron_options_bubble.py b/app/cron_options_bubble.py
index 6093c4c..10bd09c 100755
--- a/app/cron_options_bubble.py
+++ b/app/cron_options_bubble.py
@@ -42,7 +42,7 @@ def options_bubble_data(chunk):
     start_date_str = start_date.strftime('%Y-%m-%d')
 
     res_list = []
-    for page in range(0, 500):
+    for page in range(0, 5000):
         try:
             data = fin.options_activity(company_tickers=company_tickers, page=page, pagesize=1000, date_from=start_date_str, date_to=end_date_str)
             data = ujson.loads(fin.output(data))['option_activity']
@@ -129,11 +129,11 @@ async def main():
 
     print(len(total_symbols))
 
-    chunk_size = len(total_symbols) // 1000 # Divide the list into N chunks
+    chunk_size = max(1, len(total_symbols) // 2000)  # Divide the list into N chunks; max(1, ...) avoids a zero slice step when there are fewer than 2000 symbols
     chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
-
+    print(chunks)
     loop = asyncio.get_running_loop()
-    with ThreadPoolExecutor(max_workers=2) as executor:
+    with ThreadPoolExecutor(max_workers=4) as executor:
         tasks = [loop.run_in_executor(executor, options_bubble_data, chunk) for chunk in chunks]
         for f in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
             await f
diff --git a/app/cron_options_gex.py b/app/cron_options_gex.py
index a56b6d0..a5fbf6b 100644
--- a/app/cron_options_gex.py
+++ b/app/cron_options_gex.py
@@ -335,16 +335,14 @@ def get_options_chain(option_data_list):
 
 def get_data(ticker):
     res_list = []
-    page = 0
-    while True:
+
+    for page in range(0, 5000):
         try:
             data = fin.options_activity(date_from=start_date_str, date_to=end_date_str, company_tickers=ticker, page=page, pagesize=1000)
             data = ujson.loads(fin.output(data))['option_activity']
             filtered_data = [{key: value for key, value in item.items() if key not in ['description_extended', 'updated']} for item in data]
             res_list += filtered_data
-            page += 1
-        except Exception as e:
-            print(f"Error retrieving data for {ticker}: {e}")
+        except Exception:
             break
 
     return res_list
@@ -369,7 +367,7 @@ etf_cursor.execute("PRAGMA journal_mode = wal")
 etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
 etf_symbols = [row[0] for row in etf_cursor.fetchall()]
 
-total_symbols = stock_symbols + etf_symbols
+total_symbols = ['SPY']  # stock_symbols + etf_symbols
 
 query_template = """
     SELECT date, close, change_percent
@@ -385,7 +383,8 @@ for ticker in total_symbols:
     df_price = df_price.rename(columns={"change_percent": "changesPercentage"})
 
     volatility = calculate_volatility(df_price)
-
+    print(df_price)
+    print(volatility)
     ticker_data = get_data(ticker)
     # Group ticker_data by 'date' and collect all items for each date
     grouped_history = defaultdict(list)
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index 6b754b6d96b669d46787cfcd7dc46adb7a8c825d..0974412fbd2d2519905c40c315c69ffcf09a0e68 100644
GIT binary patch
[base85-encoded binary sections (literal 3758, delta 1395) omitted: compiled bytecode for score_model.cpython-310.pyc, not human-readable]
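
Reviewer note (not part of the patch): the chunked_gather change in app/cron_ai_score.py threads the new skip_downloading flag through each batch and awaits one chunk of download_data coroutines at a time, so at most chunk_size downloads are in flight. Below is a minimal, self-contained sketch of that batching pattern; fetch() is a hypothetical stand-in for the repo's download_data, not its actual implementation.

import asyncio

async def fetch(ticker, skip_downloading):
    # Hypothetical stand-in for download_data: honor the skip_downloading early-out.
    if skip_downloading:
        return None
    await asyncio.sleep(0)  # placeholder for the real network/disk I/O
    return {"ticker": ticker}

async def chunked_gather(tickers, skip_downloading, chunk_size=10):
    results = []
    for i in range(0, len(tickers), chunk_size):
        chunk = tickers[i:i + chunk_size]
        # One gather per chunk bounds concurrency at chunk_size tasks.
        results.extend(await asyncio.gather(*(fetch(t, skip_downloading) for t in chunk)))
    return results

if __name__ == "__main__":
    print(asyncio.run(chunked_gather(["AAPL", "MSFT", "NVDA"], skip_downloading=False)))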
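
Reviewer note (not part of the patch): both options scripts now page through fin.options_activity with pagesize=1000 until the API stops returning data. A generic sketch of that fetch-until-exhausted loop follows, with a hypothetical fetch_page callable standing in for the Benzinga client.

def fetch_all(fetch_page, max_pages=5000):
    """Accumulate pages until an empty page or an error ends the stream."""
    results = []
    for page in range(max_pages):
        try:
            batch = fetch_page(page)
        except Exception:
            break  # mirror the scripts: any API error ends pagination
        if not batch:
            break
        results += batch
    return results

# Example with a fake three-page source:
pages = [[1, 2], [3], []]
print(fetch_all(lambda p: pages[p]))  # -> [1, 2, 3]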