This commit is contained in:
MuslemRahimi 2025-01-18 11:55:31 +01:00
parent e9d9997719
commit bd9e48db03
2 changed files with 95 additions and 77 deletions

95
app/cron_ipo_news.py Normal file
View File

@ -0,0 +1,95 @@
import os
import pandas as pd
import ujson
import orjson
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
import sqlite3
from datetime import datetime
def clean_link(url):
    """Strip a redirect wrapper from *url*.

    Links on the page come through a tracker of the form
    ``...?url=<real-target>``; return everything after the last ``url=``
    marker, or the URL unchanged when no marker is present.
    """
    marker = 'url='
    if marker not in url:
        return url
    return url.rsplit(marker, 1)[-1]
def main():
    """Scrape the IPO news page and persist the headlines as JSON.

    Reads the tracked symbols from ``stocks.db``, loads the target page URL
    from the ``IPO_NEWS`` environment variable, renders the page with
    headless Chrome, and writes the extracted articles to
    ``json/market-news/ipo-news.json``.
    """
    # Read the symbol universe once.  A set gives O(1) membership tests in
    # the per-article ticker filter below (the original list was O(n) per
    # lookup), and closing right after the read avoids leaking the
    # connection if anything later in this function raises.
    con = sqlite3.connect('stocks.db')
    try:
        cursor = con.cursor()
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
        stock_symbols = {row[0] for row in cursor.fetchall()}
    finally:
        con.close()

    load_dotenv()
    url = os.getenv('IPO_NEWS')  # IPO news URL

    # Headless Chrome configuration suitable for unattended/cron runs.
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    json_file_path = 'json/market-news/ipo-news.json'
    try:
        driver.get(url)
        # Give client-side rendering a moment to populate the containers.
        driver.implicitly_wait(5)

        # Each news card on the page matches this utility-class selector.
        news_items = driver.find_elements(By.CSS_SELECTOR, ".gap-4.border-gray-300.bg-white.p-4.shadow.last\\:pb-1")

        news_data = []
        for item in news_items:
            try:
                title_element = item.find_element(By.CSS_SELECTOR, "h3 a")
                description_element = item.find_element(By.CSS_SELECTOR, "p")
                timestamp_element = item.find_element(By.CSS_SELECTOR, ".text-sm.text-faded")
                stocks_element = item.find_elements(By.CSS_SELECTOR, ".ticker")

                title = title_element.text
                description = description_element.text
                timestamp = timestamp_element.text
                link = title_element.get_attribute("href")
                # Keep only tickers that exist in our database.
                stock_list = [s.text for s in stocks_element if s.text in stock_symbols]

                news_data.append({
                    "title": title,
                    "description": description,
                    "timestamp": timestamp,
                    "link": clean_link(link),
                    "stocks": stock_list,
                })
            except Exception as e:
                # A single malformed card must not abort the whole scrape.
                print(f"Error extracting news item: {e}")

        df = pd.DataFrame(news_data)
        print(df)
        # Ensure the target directory exists before writing the snapshot.
        os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
        df.to_json(json_file_path, orient='records', indent=2)
    finally:
        # Always release the browser, even on failure.
        driver.quit()
# Script entry point: run the scrape only when executed directly.
if __name__ == '__main__':
    main()

View File

@ -1,77 +0,0 @@
import aiohttp
import aiofiles
import ujson
import sqlite3
import pandas as pd
import asyncio
import pytz
import time
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
from tqdm import tqdm
import pytz
# Timestamp layout of Benzinga's "created" field (RFC-2822 style).
date_format = "%a, %d %b %Y %H:%M:%S %z"

# Load credentials from .env; BENZINGA_API_KEY authenticates API requests.
load_dotenv()
api_key = os.getenv('BENZINGA_API_KEY')

headers = {"accept": "application/json"}
async def get_latest_wiim(session):
    """Fetch recent WIIM ("Why Is It Moving") headlines from Benzinga.

    Keeps only single-ticker stories, enriches each with the ticker's
    market cap from the local ``json/quote/`` cache, sorts by market cap
    (then recency) and prints the top ten entries.

    Args:
        session: an open aiohttp.ClientSession used for the API request.
    """
    url = "https://api.benzinga.com/api/v2/news"
    # Use a rolling one-day window instead of hard-coded dates so the
    # cron job keeps returning fresh results on every run.
    now = datetime.now()
    querystring = {
        "token": api_key,
        "dateFrom": (now - timedelta(days=1)).strftime('%Y-%m-%d'),
        "dateTo": now.strftime('%Y-%m-%d'),
        "sort": "created:desc",
        "pageSize": 1000,
        "channels": "WIIM",
    }
    try:
        async with session.get(url, params=querystring, headers=headers) as response:
            res_list = []
            data = ujson.loads(await response.text())
            for item in data:
                # Only single-ticker stories are usable here.
                if len(item.get('stocks', [])) != 1:
                    continue
                ticker = item['stocks'][0].get('name', None)
                try:
                    with open(f"json/quote/{ticker}.json", "r") as file:
                        quote_data = ujson.load(file)
                except (FileNotFoundError, ValueError):
                    # No (or unreadable) cached quote for this ticker — skip.
                    continue
                market_cap = quote_data.get('marketCap', None)
                if market_cap is None:
                    # A None market cap would make the tuple sort key below
                    # raise a TypeError against numeric values.
                    continue
                res_list.append({
                    'date': item['created'],
                    'text': item['title'],
                    'marketCap': market_cap,
                    'ticker': ticker,
                })
            # Largest market cap first; ties broken by most recent date.
            res_list = sorted(
                res_list,
                key=lambda entry: (entry['marketCap'],
                                   datetime.strptime(entry['date'], date_format)),
                reverse=True,
            )
            print(res_list[:10])
    except Exception as e:
        print(e)
async def run():
    """Open a single shared HTTP session and pull the latest WIIM headlines."""
    async with aiohttp.ClientSession() as http:
        await get_latest_wiim(http)
# Module-level entry: run the async fetch once; report any failure instead
# of letting the script crash with a traceback.
try:
    asyncio.run(run())
except Exception as e:
    print(e)