bugfixing ipo news

This commit is contained in:
MuslemRahimi 2025-04-06 12:54:45 +02:00
parent db3b406646
commit cfe08096ff

View File

@ -17,15 +17,17 @@ def clean_link(url):
return url return url
def main(): def main():
# Load environment variables # Load stock symbols from the database
con = sqlite3.connect('stocks.db') con = sqlite3.connect('stocks.db')
cursor = con.cursor() cursor = con.cursor()
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'") cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
stock_symbols = [row[0] for row in cursor.fetchall()] stock_symbols = [row[0] for row in cursor.fetchall()]
con.close() con.close()
# Optionally load environment variables; you may also hardcode the URL below.
load_dotenv() load_dotenv()
url = os.getenv('IPO_NEWS') # IPO news URL # Use the correct URL for scraping IPO news:
url = os.getenv('IPO_NEWS', 'https://stockanalysis.com/ipos/news/')
# Set up the WebDriver options # Set up the WebDriver options
options = Options() options = Options()
@ -42,35 +44,39 @@ def main():
try: try:
# Fetch the website # Fetch the website
driver.get(url) driver.get(url)
# Wait for the page to load driver.implicitly_wait(20)
driver.implicitly_wait(5)
# Updated selector for news containers # Use a flexible selector for news containers
news_items = driver.find_elements(By.CSS_SELECTOR, "div.gap-4.border-gray-300.bg-white.p-4.shadow") news_items = driver.find_elements(
By.CSS_SELECTOR,
"div[class*='border-gray-300'][class*='bg-white'][class*='p-4']"
)
# Extract data from the containers
news_data = [] news_data = []
for item in news_items: for item in news_items:
try: try:
# Updated selectors # Extract elements using flexible selectors
title_element = item.find_element(By.CSS_SELECTOR, "h3 a") title_element = item.find_element(By.CSS_SELECTOR, "h3 a")
description_element = item.find_element(By.CSS_SELECTOR, "p.overflow-auto") description_element = item.find_element(By.CSS_SELECTOR, "p.overflow-auto")
timestamp_element = item.find_element(By.CSS_SELECTOR, "div.text-sm.text-faded") timestamp_element = item.find_element(By.CSS_SELECTOR, "div.text-sm.text-faded")
stocks_element = item.find_elements(By.CSS_SELECTOR, "a.ticker") stocks_elements = item.find_elements(By.CSS_SELECTOR, "a.ticker")
img_element = item.find_element(By.CSS_SELECTOR, "img.w-full.rounded.object-cover") img_element = item.find_element(By.CSS_SELECTOR, "img.w-full.rounded.object-cover")
# Get element data # Use textContent and strip whitespace
title = title_element.text title = title_element.get_attribute("textContent").strip()
description = description_element.text description = description_element.get_attribute("textContent").strip()
timestamp = timestamp_element.text timestamp = timestamp_element.get_attribute("textContent").strip()
link = title_element.get_attribute("href") link = title_element.get_attribute("href")
stocks = [stock.text for stock in stocks_element] stocks = [stock.text.strip() for stock in stocks_elements]
img_link = img_element.get_attribute("src") img_link = img_element.get_attribute("src")
# Filter stocks that exist in the database # Skip the news item if the title is empty
if not title:
continue
# Filter stocks that exist in your database
stock_list = [symbol for symbol in stocks if symbol in stock_symbols] stock_list = [symbol for symbol in stocks if symbol in stock_symbols]
# Add to news data
news_data.append({ news_data.append({
"title": title, "title": title,
"description": description, "description": description,
@ -83,17 +89,14 @@ def main():
except Exception as e: except Exception as e:
print(f"Error extracting news item: {e}") print(f"Error extracting news item: {e}")
# Convert the data into a DataFrame # Convert the collected data into a DataFrame and save it to a JSON file if not empty
df = pd.DataFrame(news_data) df = pd.DataFrame(news_data)
# Save the DataFrame to a JSON file
if not df.empty: if not df.empty:
df.to_json(json_file_path, orient='records', indent=2) df.to_json(json_file_path, orient='records', indent=2)
else:
print("No news items were found.")
finally: finally:
# Ensure the WebDriver is closed
driver.quit() driver.quit()
if __name__ == '__main__': if __name__ == '__main__':
main() main()