bugfixing company_news
This commit is contained in:
parent
b062213410
commit
c7cad764b3
132
app/cron_business_metrics.py
Normal file
132
app/cron_business_metrics.py
Normal file
@ -0,0 +1,132 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
|
||||
SEC Filing Scraper
|
||||
@author: AdamGetbags
|
||||
|
||||
"""
|
||||
|
||||
# import modules
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
# create request header.
# NOTE(review): the SEC requires a descriptive User-Agent containing a real
# contact email -- replace this placeholder before running against EDGAR.
headers = {'User-Agent': "email@address.com"}

# get all companies data (the SEC's ticker -> CIK mapping)
companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers
)

# decode the JSON body once; every repeated .json() call re-parses the
# raw response from scratch
tickerJson = companyTickers.json()

# review response / keys ('0', '1', ... one entry per company)
print(tickerJson.keys())

# format response to dictionary and get first key/value
firstEntry = tickerJson['0']

# parse CIK // without leading zeros
directCik = firstEntry['cik_str']

# dictionary to dataframe (rows indexed by the string keys '0', '1', ...)
companyData = pd.DataFrame.from_dict(tickerJson, orient='index')

# add leading zeros to CIK -- EDGAR endpoints expect a 10-digit CIK
companyData['cik_str'] = companyData['cik_str'].astype(str).str.zfill(10)

# review data
print(companyData[:1])

# padded CIK of the first listed company; positional .iloc replaces the
# original companyData[0:1].cik_str[0], which relied on deprecated
# integer-fallback lookup on a string-labelled index
cik = companyData.cik_str.iloc[0]
|
||||
|
||||
# get company specific filing metadata
filingMetadata = requests.get(
    f'https://data.sec.gov/submissions/CIK{cik}.json',
    headers=headers
)

# decode the JSON body once instead of re-parsing it on every access
filingJson = filingMetadata.json()

# review json
print(filingJson.keys())

# 'filings' -> 'recent' holds the latest filings as a dict of parallel
# lists, one list per column (accessionNumber, reportDate, form, ...)
recentFilings = filingJson['filings']['recent']

# dictionary to dataframe; columns of interest include 'accessionNumber',
# 'reportDate' and 'form' (the original exploration found a 10-Q at row 11)
allForms = pd.DataFrame.from_dict(recentFilings)
|
||||
|
||||
# get company facts data (every XBRL fact the company has reported)
companyFacts = requests.get(
    f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
    headers=headers
)

# decode once; the payload nests facts as taxonomy -> concept -> units
facts = companyFacts.json()['facts']

# filing metadata example from the 'dei' taxonomy: the first reported
# shares-outstanding observation (a dict with 'end', 'val', 'accn',
# 'form', ... keys)
firstSharesObservation = facts['dei'][
    'EntityCommonStockSharesOutstanding']['units']['shares'][0]

# concept data // financial statement line items live under 'us-gaap';
# different amounts of data are available per concept
# (e.g. 'AccountsPayable', 'Revenues', 'Assets')
usGaapConcepts = facts['us-gaap']
|
||||
|
||||
# get company concept data (all reported values for the Assets concept)
companyConcept = requests.get(
    (
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}'
        '/us-gaap/Assets.json'
    ),
    headers=headers
)

# decode the body once; observations are grouped by reporting unit
conceptJson = companyConcept.json()

# assets are reported in USD; each entry is one filing's observation and
# entry['val'] is the reported assets figure for that filing
usdObservations = conceptJson['units']['USD']

# get all filings data into a dataframe (columns: end, val, accn, form, ...)
assetsData = pd.DataFrame.from_dict(usdObservations)

# get assets from 10Q forms and reset index
assets10Q = assetsData[assetsData.form == '10-Q'].reset_index(drop=True)

# review data
print(assets10Q)
|
||||
@ -37,45 +37,19 @@ async def filter_and_deduplicate(data, excluded_domains=None, deduplicate_key='t
|
||||
|
||||
return filtered_data
|
||||
|
||||
async def fetch_news(session, url):
    """GET *url* with the shared aiohttp session and decode the JSON body."""
    async with session.get(url) as response:
        payload = await response.json()
    return payload
|
||||
|
||||
async def save_news(data, symbol):
|
||||
#os.makedirs("json/market-news/companies", exist_ok=True)
|
||||
async def save_quote_as_json(symbol, data):
    # Persist the symbol's news payload to its per-company JSON file.
    target = f"json/market-news/companies/{symbol}.json"
    with open(target, 'w') as out:
        ujson.dump(data, out)
|
||||
|
||||
async def process_symbols(symbols):
|
||||
limit = 200
|
||||
chunk_size = 50 # Adjust this value based on API limitations
|
||||
|
||||
async def get_data(chunk):
    """Fetch up to 2000 news items for the tickers in *chunk*.

    Returns the decoded JSON list on HTTP 200, otherwise an empty list.
    """
    company_tickers = ','.join(chunk)
    async with aiohttp.ClientSession() as session:
        url = f'https://financialmodelingprep.com/api/v3/stock_news?tickers={company_tickers}&page=0&limit=2000&apikey={api_key}'
        async with session.get(url) as response:
            # Guard clause: anything but a 200 yields an empty result.
            if response.status != 200:
                return []
            return await response.json()
|
||||
|
||||
def get_symbols(db_name, table_name):
|
||||
with sqlite3.connect(db_name) as con:
|
||||
@ -90,7 +64,17 @@ async def main():
|
||||
crypto_symbols = get_symbols('crypto.db', 'cryptos')
|
||||
total_symbols = stock_symbols + etf_symbols + crypto_symbols
|
||||
|
||||
await process_symbols(total_symbols)
|
||||
chunk_size = len(total_symbols) // 70 # Divide the list into N chunks
|
||||
chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
|
||||
|
||||
for chunk in tqdm(chunks):
|
||||
data = await get_data(chunk)
|
||||
for symbol in chunk:
|
||||
filtered_data = [item for item in data if item['symbol'] == symbol]
|
||||
filtered_data = await filter_and_deduplicate(filtered_data)
|
||||
if len(filtered_data) > 0:
|
||||
await save_quote_as_json(symbol, filtered_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user