backend/app/cron_fda_calendar.py
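"""Scrape the FDA calendar page (URL taken from the FDA_CALENDAR environment
variable) with headless Chrome, keep only rows whose ticker exists in the
local stocks.db database, and write the result to json/fda-calendar/data.json."""
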
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sqlite3
import ujson
from dotenv import load_dotenv
import os

load_dotenv()
url = os.getenv('FDA_CALENDAR')

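# Note: the json/fda-calendar/ directory must already exist; open() in
# save_json will not create missing parent directories.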
def save_json(data):
    with open("json/fda-calendar/data.json", 'w', encoding='utf-8') as file:
        ujson.dump(data, file, ensure_ascii=False, indent=2)


def main():
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Initialize WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Connect to the database to get stock symbols
    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
    cursor.execute("SELECT DISTINCT symbol FROM stocks")
    stock_symbols = [row[0].strip() for row in cursor.fetchall()]  # Ensure symbols are stripped
    con.close()

    try:
        # Navigate to the FDA calendar
        driver.get(url)

        # Wait for the table to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table.flow-full-table"))
        )

        # Extract table data
        entries = []
        rows = driver.find_elements(By.CSS_SELECTOR, "table.flow-full-table tbody tr")
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, "td")
            if len(cols) >= 6:  # Check for the minimum required columns
                try:
                    # Extract the ticker from the anchor tag, stripping whitespace;
                    # fall back to None if it is empty after the strip
                    ticker_element = cols[0].find_element(By.TAG_NAME, "a")
                    ticker = ticker_element.text.strip() or None
                except NoSuchElementException:
                    ticker = None  # No anchor tag found

                # Extract the other fields, converting empty strings to None
                date = cols[1].text.strip() or None
                drug = cols[2].text.strip() or None
                indication = cols[3].text.strip() or None
                status = cols[4].text.strip() or None
                description = cols[5].text.strip() or None

                entry = {
                    "ticker": ticker,
                    "date": date,
                    "drug": drug,
                    "indication": indication,
                    "status": status,
                    "description": description
                }
                entries.append(entry)

        # Filter entries to include only those with tickers present in the database
        filtered_entries = [
            entry for entry in entries
            if entry['ticker'] is not None and entry['ticker'] in stock_symbols
        ]

        if filtered_entries:
            save_json(filtered_entries)
            print("Successfully scraped FDA calendar data")
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
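
For reference, a minimal sketch of how a downstream job could read the file that save_json writes. The path and the entry keys come from the script above; the load_fda_calendar helper and the printed fields are illustrative assumptions, not part of the repository.

import ujson

def load_fda_calendar(path="json/fda-calendar/data.json"):
    # Hypothetical helper: read back the entries written by save_json;
    # each entry has ticker, date, drug, indication, status, description.
    with open(path, "r", encoding="utf-8") as file:
        return ujson.load(file)

if __name__ == "__main__":
    for entry in load_fda_calendar():
        print(entry["ticker"], entry["date"], entry["status"])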