backend/app/cron_ipo_news.py
2025-01-18 14:32:03 +01:00

99 lines
3.4 KiB
Python

import os
import pandas as pd
import sqlite3
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from dotenv import load_dotenv
def clean_link(url):
    """
    Return the target of a redirect-wrapped article link.

    When ``url`` contains the marker ``url=``, everything after the last
    occurrence of that marker is returned. A link without the marker is
    returned unchanged.
    """
    marker = 'url='
    if marker not in url:
        return url
    # rpartition splits on the final marker, so the tail is the wrapped link.
    _, _, target = url.rpartition(marker)
    return target
def _load_stock_symbols(db_path='stocks.db'):
    """Return the set of distinct ``stocks.symbol`` values that contain no dot."""
    con = sqlite3.connect(db_path)
    try:
        cursor = con.cursor()
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
        # A set keeps the per-ticker membership test in the scrape loop O(1).
        return {row[0] for row in cursor.fetchall()}
    finally:
        # Close the connection even when the query fails.
        con.close()


def _extract_news_item(item, known_symbols):
    """Build one news record (dict) from a single news container element.

    Raises whatever Selenium raises when a required child element is
    missing; the caller decides whether to skip the item.
    """
    # Locate the relevant child elements
    title_element = item.find_element(By.CSS_SELECTOR, "h3 a")
    description_element = item.find_element(By.CSS_SELECTOR, "p")
    timestamp_element = item.find_element(By.CSS_SELECTOR, ".text-sm.text-faded")
    # NOTE(review): with an implicit wait active, this lookup may block for
    # the full wait on items that have no ticker element -- confirm whether
    # that delay is acceptable.
    stock_elements = item.find_elements(By.CSS_SELECTOR, ".ticker")
    img_element = item.find_element(By.CSS_SELECTOR, "img.h-full.w-full.rounded.object-cover")

    tickers = [stock.text for stock in stock_elements]
    return {
        "title": title_element.text,
        "description": description_element.text,
        "timestamp": timestamp_element.text,
        "link": clean_link(title_element.get_attribute("href")),
        # Keep only the tickers that exist in the database
        "stocks": [symbol for symbol in tickers if symbol in known_symbols],
        "img": img_element.get_attribute("src"),
    }


def main():
    """Scrape the IPO news page and save the articles to a JSON file.

    Reads the page URL from the ``IPO_NEWS`` environment variable (loaded
    via ``load_dotenv``), collects title, description, timestamp, link,
    related tickers and image for every news container found, and writes
    the records to ``json/market-news/ipo-news.json``.

    Raises:
        RuntimeError: if ``IPO_NEWS`` is not set.
    """
    # Load environment variables and fail fast, before starting a browser
    load_dotenv()
    url = os.getenv('IPO_NEWS')  # IPO news URL
    if not url:
        raise RuntimeError("IPO_NEWS environment variable is not set")

    stock_symbols = _load_stock_symbols()
    json_file_path = 'json/market-news/ipo-news.json'

    # Set up the WebDriver options
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Initialize the WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    news_data = []
    try:
        # Fetch the website
        driver.get(url)
        # Let element lookups poll for up to 5 seconds before giving up;
        # this configures later find_element(s) calls, it does not sleep.
        driver.implicitly_wait(5)
        # Find all the news containers
        news_items = driver.find_elements(By.CSS_SELECTOR, ".gap-4.border-gray-300.bg-white.p-4.shadow.last\\:pb-1")
        for item in news_items:
            try:
                news_data.append(_extract_news_item(item, stock_symbols))
            except Exception as e:
                # Best effort: one malformed item must not abort the run.
                print(f"Error extracting news item: {e}")
    finally:
        # Ensure the WebDriver is closed
        driver.quit()

    if not news_data:
        # Writing now would replace the previous good file with "[]".
        print("No IPO news items extracted; existing file left unchanged")
        return

    # Make sure the target directory exists before writing
    os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
    # Convert the data into a DataFrame and save it as a JSON file
    df = pd.DataFrame(news_data)
    df.to_json(json_file_path, orient='records', indent=2)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()