# backend/app/cron_ipo_news.py

import os
import pandas as pd
import sqlite3
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from dotenv import load_dotenv

def clean_link(url):
    """
    Unwrap an article link that is embedded in a redirect URL.

    When the link contains a ``url=`` marker, everything after the last
    occurrence of that marker is returned; otherwise the link is returned
    unchanged.
    """
    marker = 'url='
    if marker not in url:
        return url
    # rpartition splits on the LAST marker, matching split(marker)[-1].
    _, _, target = url.rpartition(marker)
    return target

def _load_stock_symbols(db_path='stocks.db'):
    """
    Return the distinct stock symbols stored in the SQLite database.

    The query skips symbols that contain a dot. The result is a set so that
    the per-ticker membership tests done while scraping are O(1).
    """
    con = sqlite3.connect(db_path)
    try:
        cursor = con.cursor()
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
        return {row[0] for row in cursor.fetchall()}
    finally:
        # Release the connection even when the query raises.
        con.close()


def _extract_news_item(item, known_symbols):
    """
    Build the news record (a dict) for one news container element.

    Any exception raised by Selenium for a missing required sub-element
    (title link, description, timestamp, image) propagates to the caller.
    """
    # Locate the relevant elements
    title_element = item.find_element(By.CSS_SELECTOR, "h3 a")
    description_element = item.find_element(By.CSS_SELECTOR, "p")
    timestamp_element = item.find_element(By.CSS_SELECTOR, ".text-sm.text-faded")
    stocks_element = item.find_elements(By.CSS_SELECTOR, ".ticker")
    img_element = item.find_element(By.CSS_SELECTOR, "img.h-full.w-full.rounded.object-cover")

    tickers = [stock.text for stock in stocks_element]

    return {
        "title": title_element.text,
        "description": description_element.text,
        "timestamp": timestamp_element.text,
        "link": clean_link(title_element.get_attribute("href")),
        # Keep only the tickers that exist in the database
        "stocks": [symbol for symbol in tickers if symbol in known_symbols],
        "img": img_element.get_attribute("src"),
    }


def main():
    """
    Scrape the IPO news page and save the articles as a JSON file.

    The page URL is read from the ``IPO_NEWS`` environment variable (after
    loading ``.env``). Each news container on the page becomes one record
    with title, description, timestamp, link, stocks and img fields; the
    records are written to ``json/market-news/ipo-news.json``. Containers
    that fail to parse are reported on stdout and skipped.

    Raises:
        RuntimeError: If ``IPO_NEWS`` is not set.
    """
    # Load environment variables and validate them before starting a browser,
    # so a misconfiguration fails fast with a clear message.
    load_dotenv()
    url = os.getenv('IPO_NEWS')  # IPO news URL
    if not url:
        raise RuntimeError("IPO_NEWS environment variable is not set")

    stock_symbols = _load_stock_symbols()

    # Set up the WebDriver options
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Initialize the WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    json_file_path = 'json/market-news/ipo-news.json'

    try:
        # The implicit wait is a timeout for element lookups (not a page-load
        # wait): give the news containers up to 5 seconds to appear.
        driver.implicitly_wait(5)

        # Fetch the website
        driver.get(url)

        # Find all the news containers
        news_items = driver.find_elements(By.CSS_SELECTOR, ".gap-4.border-gray-300.bg-white.p-4.shadow.last\\:pb-1")

        # The containers are present now. Drop the implicit wait so that a
        # container without tickers (or with a missing element) does not
        # stall the full timeout on every lookup.
        driver.implicitly_wait(0)

        # Extract data from the containers
        news_data = []
        for item in news_items:
            try:
                news_data.append(_extract_news_item(item, stock_symbols))
            except Exception as e:
                # Best effort: one malformed container must not abort the run.
                print(f"Error extracting news item: {e}")

        # Make sure the output directory exists before writing
        os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

        # Convert the data into a DataFrame and save it as JSON
        df = pd.DataFrame(news_data)
        df.to_json(json_file_path, orient='records', indent=2)

    finally:
        # Ensure the WebDriver is closed
        driver.quit()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()