# backend/app/cron_ipo_news.py
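"""Scrape IPO news with headless Chrome from stockanalysis.com (or the
IPO_NEWS URL from the environment), keep only items whose tickers exist in
the local stocks.db, and write the result to json/market-news/ipo-news.json."""
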
import os
import sqlite3

import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


def clean_link(url):
    """
    Clean the article link to extract the actual URL if it's wrapped in a redirect.
    """
    if 'url=' in url:
        return url.split('url=')[-1]
    return url
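
# Example (hypothetical URL): clean_link turns a redirect-wrapped link such as
#   https://news.example.com/out?url=https://stockanalysis.com/article/
# into https://stockanalysis.com/article/ -- a naive split on "url=" that
# assumes the target is the last query parameter.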


def main():
    # Load stock symbols from the database (a set gives O(1) membership checks)
    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
    stock_symbols = {row[0] for row in cursor.fetchall()}
    con.close()

    # Optionally load environment variables; the hard-coded URL is the fallback
    load_dotenv()
    url = os.getenv('IPO_NEWS', 'https://stockanalysis.com/ipos/news/')
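    # Example .env entry (optional; the default above is used when unset):
    #   IPO_NEWS=https://stockanalysis.com/ipos/news/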

    # Set up the WebDriver options
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Initialize the WebDriver (webdriver_manager downloads a matching driver)
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    json_file_path = 'json/market-news/ipo-news.json'
    # Create the output directory if it does not exist yet
    os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

    try:
        # Fetch the website
        driver.get(url)
        driver.implicitly_wait(20)

        # Use a flexible selector for news containers
        news_items = driver.find_elements(
            By.CSS_SELECTOR,
            "div[class*='border-gray-300'][class*='bg-white'][class*='p-4']"
        )
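        # Note: the [class*='...'] substring selectors above keep matching even
        # if the site reorders or appends other utility classes on the container.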

        news_data = []
        for item in news_items:
            try:
                # Extract elements using flexible selectors
                title_element = item.find_element(By.CSS_SELECTOR, "h3 a")
                description_element = item.find_element(By.CSS_SELECTOR, "p.overflow-auto")
                timestamp_element = item.find_element(By.CSS_SELECTOR, "div.text-sm.text-faded")
                stocks_elements = item.find_elements(By.CSS_SELECTOR, "a.ticker")
                img_element = item.find_element(By.CSS_SELECTOR, "img.w-full.rounded.object-cover")

                # Use textContent and strip whitespace
                title = title_element.get_attribute("textContent").strip()
                description = description_element.get_attribute("textContent").strip()
                timestamp = timestamp_element.get_attribute("textContent").strip()
                link = title_element.get_attribute("href")
                stocks = [stock.text.strip() for stock in stocks_elements]
                img_link = img_element.get_attribute("src")

                # Skip the news item if the title is empty
                if not title:
                    continue

                # Keep only tickers that exist in the database
                stock_list = [symbol for symbol in stocks if symbol in stock_symbols]

                news_data.append({
                    "title": title,
                    "description": description,
                    "timestamp": timestamp,
                    "link": clean_link(link),
                    "stocks": stock_list,
                    "img": img_link
                })
            except Exception as e:
                # A missing sub-element (e.g. no image) skips the whole item
                print(f"Error extracting news item: {e}")

        # Convert the collected data into a DataFrame and save it to JSON if not empty
        df = pd.DataFrame(news_data)
        if not df.empty:
            df.to_json(json_file_path, orient='records', indent=2)
        else:
            print("No news items were found.")
    finally:
        driver.quit()


if __name__ == '__main__':
    main()
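
# Assumed deployment (the repository snapshot does not specify it): the file
# name suggests a cron job, e.g. an hourly schedule like
#   0 * * * * cd /path/to/backend/app && python cron_ipo_news.py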