From 608dbeeb38e571fcdd65b2b9c5c305848a44bc79 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Sat, 25 Jan 2025 21:12:17 +0100
Subject: [PATCH] bugfixing

---
 app/cron_fda_calendar.py | 134 +++++++++++++++++++++++----------------
 app/primary_cron_job.py  |   7 +-
 2 files changed, 85 insertions(+), 56 deletions(-)

diff --git a/app/cron_fda_calendar.py b/app/cron_fda_calendar.py
index c180711..99d5a1f 100644
--- a/app/cron_fda_calendar.py
+++ b/app/cron_fda_calendar.py
@@ -1,72 +1,100 @@
-import ujson
-import asyncio
-import aiohttp
-import os
+import json
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 import sqlite3
-from tqdm import tqdm
+import ujson
 from dotenv import load_dotenv
-from datetime import datetime
-import requests
+import os
 
-# Load environment variables
 load_dotenv()
 
-today = datetime.today().date()
+url = os.getenv('FDA_CALENDAR')
 
-api_key = os.getenv('UNUSUAL_WHALES_API_KEY')
-
-url = "https://api.unusualwhales.com/api/market/fda-calendar"
-
-headers = {
-    "Accept": "application/json, text/plain",
-    "Authorization": api_key
-}
-
-
-
-async def save_json(data):
-    with open(f"json/fda-calendar/data.json", 'w') as file:
-        ujson.dump(data, file)
-
-
-async def get_data():
+def save_json(data):
+    with open("json/fda-calendar/data.json", 'w') as file:
+        ujson.dump(data, file)
+def main():
+    # Set up Chrome options
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+
+    # Initialize WebDriver
+    service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=service, options=chrome_options)
+
+    # Connect to the database to get stock symbols
     con = sqlite3.connect('stocks.db')
     cursor = con.cursor()
     cursor.execute("PRAGMA journal_mode = wal")
     cursor.execute("SELECT DISTINCT symbol FROM stocks")
-    stock_symbols = [row[0] for row in cursor.fetchall()]
+    stock_symbols = [row[0].strip() for row in cursor.fetchall()]  # Ensure symbols are stripped
     con.close()
+
     try:
-        response = requests.get(url, headers=headers)
-        data = response.json()['data']
-        data = [
-            entry for entry in data
-            if datetime.strptime(entry['start_date'], '%Y-%m-%d').date() >= today
+        # Navigate to FDA calendar
+        driver.get(url)
+
+        # Wait for the table to load
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, "table.flow-full-table"))
+        )
+
+        # Extract table data
+        entries = []
+        rows = driver.find_elements(By.CSS_SELECTOR, "table.flow-full-table tbody tr")
+
+        for row in rows:
+            cols = row.find_elements(By.TAG_NAME, "td")
+            if len(cols) >=6:  # Check for minimum required columns
+                try:
+                    # Extract ticker from the anchor tag, stripping whitespace
+                    ticker_element = cols[0].find_element(By.TAG_NAME, "a")
+                    ticker = ticker_element.text.strip() if ticker_element else ""
+                    ticker = ticker or None  # Set to None if empty after strip
+                except Exception:
+                    ticker = None  # If no anchor tag found
+
+
+                # Extract other fields, converting empty strings to None
+                date = cols[1].text.strip() or None
+                drug = cols[2].text.strip() or None
+                indication = cols[3].text.strip() or None
+                status = cols[4].text.strip() or None
+                description = cols[5].text.strip() or None
+
+                entry = {
+                    "ticker": ticker,
+                    "date": date,
+                    "drug": drug,
+                    "indication": indication,
+                    "status": status,
+                    "description": description
+                }
+                entries.append(entry)
+
+        # Filter entries to include only those with tickers present in the database
+        filtered_entries = [
+            entry for entry in entries
+            if entry['ticker'] is not None and entry['ticker'] in stock_symbols
         ]
-        res_list = []
-        for item in data:
-            try:
-                symbol = item['ticker']
-                if symbol in stock_symbols:
-                    res_list.append({**item})
-            except:
-                pass
+
+        if filtered_entries:
+            save_json(filtered_entries)
+            print("Successfully scraped FDA calendar data")
 
-        return data
     except Exception as e:
-        print(f"Error fetching data: {e}")
-        return []
-
-
-async def run():
-    data = await get_data()
-    if len(data) > 0:
-        await save_json(data)
+        print(f"Error during scraping: {str(e)}")
+    finally:
+        driver.quit()
 
 if __name__ == "__main__":
-    try:
-        asyncio.run(run())
-    except Exception as e:
-        print(f"An error occurred: {e}")
\ No newline at end of file
+    main()
\ No newline at end of file
diff --git a/app/primary_cron_job.py b/app/primary_cron_job.py
index 221367e..7c98d88 100755
--- a/app/primary_cron_job.py
+++ b/app/primary_cron_job.py
@@ -110,8 +110,7 @@ def run_options_jobs():
 def run_fda_calendar():
     now = datetime.now(ny_tz)
     week = now.weekday()
-    hour = now.hour
-    if week <= 4 and 8 <= hour < 20:
+    if week <= 5:
         run_command(["python3", "cron_fda_calendar.py"])
 
 def run_cron_insider_trading():
@@ -378,6 +377,9 @@ schedule.every().day.at("08:00").do(run_threaded, run_cron_insider_trading).tag(
 schedule.every().day.at("08:30").do(run_threaded, run_dividends).tag('dividends_job')
 schedule.every().day.at("09:00").do(run_threaded, run_shareholders).tag('shareholders_job')
 schedule.every().day.at("09:30").do(run_threaded, run_profile).tag('profile_job')
+schedule.every().day.at("10:00").do(run_threaded, run_fda_calendar).tag('fda_job')
+
+
 #schedule.every().day.at("10:30").do(run_threaded, run_sec_filings).tag('sec_filings_job')
 #schedule.every().day.at("11:00").do(run_threaded, run_executive).tag('executive_job')
 
@@ -420,7 +422,6 @@ schedule.every(2).hours.do(run_threaded, run_analyst_rating).tag('analyst_job')
 
 schedule.every(1).hours.do(run_threaded, run_company_news).tag('company_news_job')
 schedule.every(3).hours.do(run_threaded, run_press_releases).tag('press_release_job')
-#schedule.every(1).hours.do(run_threaded, run_fda_calendar).tag('fda_calendar_job')
 
 schedule.every(20).minutes.do(run_threaded, run_options_stats).tag('options_stats_job')