diff --git a/app/cron_ipo_news.py b/app/cron_ipo_news.py
index 86793cf..0b0de5b 100644
--- a/app/cron_ipo_news.py
+++ b/app/cron_ipo_news.py
@@ -1,17 +1,17 @@
 import os
 import pandas as pd
-import ujson
-import orjson
+import sqlite3
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
 from dotenv import load_dotenv
-import sqlite3
-from datetime import datetime
 
 def clean_link(url):
+    """
+    Clean the article link to extract the actual URL if it's wrapped in a redirect.
+    """
     if 'url=' in url:
         return url.split('url=')[-1]
     return url
@@ -25,7 +25,7 @@ def main():
     con.close()
 
     load_dotenv()
-    url = os.getenv('IPO_NEWS') # IPO news URL
+    url = os.getenv('IPO_NEWS')  # IPO news URL
 
     # Set up the WebDriver options
     options = Options()
@@ -42,8 +42,9 @@ def main():
     try:
         # Fetch the website
         driver.get(url)
-        # Wait for the page to load (if needed, adjust the time)
+        # Wait for the page to load
        driver.implicitly_wait(5)
+
         # Find all the news containers
         news_items = driver.find_elements(By.CSS_SELECTOR, ".gap-4.border-gray-300.bg-white.p-4.shadow.last\:pb-1")
 
@@ -51,28 +52,32 @@ def main():
         news_data = []
         for item in news_items:
             try:
+                # Extract relevant elements
                 title_element = item.find_element(By.CSS_SELECTOR, "h3 a")
                 description_element = item.find_element(By.CSS_SELECTOR, "p")
                 timestamp_element = item.find_element(By.CSS_SELECTOR, ".text-sm.text-faded")
                 stocks_element = item.find_elements(By.CSS_SELECTOR, ".ticker")
+                img_element = item.find_element(By.CSS_SELECTOR, "img.h-full.w-full.rounded.object-cover")
 
+                # Get element data
                 title = title_element.text
                 description = description_element.text
                 timestamp = timestamp_element.text
                 link = title_element.get_attribute("href")
                 stocks = [stock.text for stock in stocks_element]
+                img_link = img_element.get_attribute("src")
 
-                stock_list = []
-                for symbol in stocks:
-                    if symbol in stock_symbols:
-                        stock_list.append(symbol)
+                # Filter stocks that exist in the database
+                stock_list = [symbol for symbol in stocks if symbol in stock_symbols]
 
+                # Add to news data
                 news_data.append({
                     "title": title,
                     "description": description,
                     "timestamp": timestamp,
                     "link": clean_link(link),
-                    "stocks": stock_list
+                    "stocks": stock_list,
+                    "img": img_link
                 })
 
             except Exception as e:
@@ -80,12 +85,10 @@ def main():
 
         # Convert the data into a DataFrame
         df = pd.DataFrame(news_data)
-        print(df)
 
         # Save the DataFrame to a JSON file
         df.to_json(json_file_path, orient='records', indent=2)
 
-
     finally:
         # Ensure the WebDriver is closed
         driver.quit()
diff --git a/app/primary_cron_job.py b/app/primary_cron_job.py
index 5af27ee..37b611a 100755
--- a/app/primary_cron_job.py
+++ b/app/primary_cron_job.py
@@ -180,6 +180,7 @@ def run_cron_market_news():
     week = datetime.today().weekday()
     if week <= 4:
         run_command(["python3", "cron_market_news.py"])
+        run_command(["python3", "cron_ipo_news.py"])
 
 def run_company_news():
     week = datetime.today().weekday()