backend/app/cron_fda_calendar.py
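"""Scrape the FDA calendar page (URL taken from the FDA_CALENDAR environment
variable) with headless Chrome, keep only rows whose ticker exists in the
local stocks.db database, and write the result to json/fda-calendar/data.json."""
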
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sqlite3
import ujson
from dotenv import load_dotenv
import os

load_dotenv()
url = os.getenv('FDA_CALENDAR')

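# Note: the json/fda-calendar/ directory must already exist; open() in
# save_json will not create missing parent directories.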
def save_json(data):
    with open("json/fda-calendar/data.json", 'w', encoding='utf-8') as file:
        ujson.dump(data, file, ensure_ascii=False, indent=2)


def main():
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Initialize WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Connect to the database to get stock symbols
    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
    cursor.execute("SELECT DISTINCT symbol FROM stocks")
    stock_symbols = [row[0].strip() for row in cursor.fetchall()]  # Ensure symbols are stripped
    con.close()

    try:
        # Navigate to the FDA calendar
        driver.get(url)

        # Wait for the table to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table.flow-full-table"))
        )

        # Extract table data
        entries = []
        rows = driver.find_elements(By.CSS_SELECTOR, "table.flow-full-table tbody tr")
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, "td")
            if len(cols) >= 6:  # Check for the minimum required columns
                try:
                    # Extract the ticker from the anchor tag, stripping whitespace;
                    # fall back to None if it is empty after the strip
                    ticker_element = cols[0].find_element(By.TAG_NAME, "a")
                    ticker = ticker_element.text.strip() or None
                except NoSuchElementException:
                    ticker = None  # No anchor tag found

                # Extract the other fields, converting empty strings to None
                date = cols[1].text.strip() or None
                drug = cols[2].text.strip() or None
                indication = cols[3].text.strip() or None
                status = cols[4].text.strip() or None
                description = cols[5].text.strip() or None

                entry = {
                    "ticker": ticker,
                    "date": date,
                    "drug": drug,
                    "indication": indication,
                    "status": status,
                    "description": description
                }
                entries.append(entry)

        # Filter entries to include only those with tickers present in the database
        filtered_entries = [
            entry for entry in entries
            if entry['ticker'] is not None and entry['ticker'] in stock_symbols
        ]

        if filtered_entries:
            save_json(filtered_entries)
            print("Successfully scraped FDA calendar data")
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
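
For reference, a minimal sketch of how a downstream job could read the file that save_json writes. The path and the entry keys come from the script above; the load_fda_calendar helper and the printed fields are illustrative assumptions, not part of the repository.

import ujson

def load_fda_calendar(path="json/fda-calendar/data.json"):
    # Hypothetical helper: read back the entries written by save_json;
    # each entry has ticker, date, drug, indication, status, description.
    with open(path, "r", encoding="utf-8") as file:
        return ujson.load(file)

if __name__ == "__main__":
    for entry in load_fda_calendar():
        print(entry["ticker"], entry["date"], entry["status"])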