diff --git a/app/cron_business_metrics.py b/app/cron_business_metrics.py
new file mode 100644
index 0000000..26af19f
--- /dev/null
+++ b/app/cron_business_metrics.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+"""
+
+SEC Filing Scraper
+@author: AdamGetbags
+
+"""
+
+# import modules
+import requests
+import pandas as pd
+
+# create request header
+headers = {'User-Agent': "email@address.com"}
+
+# get all companies data
+companyTickers = requests.get(
+    "https://www.sec.gov/files/company_tickers.json",
+    headers=headers
+    )
+
+# review response / keys
+print(companyTickers.json().keys())
+
+# format response to dictionary and get first key/value
+firstEntry = companyTickers.json()['0']
+
+# parse CIK // without leading zeros
+directCik = companyTickers.json()['0']['cik_str']
+
+# dictionary to dataframe
+companyData = pd.DataFrame.from_dict(companyTickers.json(),
+                                     orient='index')
+
+# add leading zeros to CIK
+companyData['cik_str'] = companyData['cik_str'].astype(
+    str).str.zfill(10)
+
+# review data
+print(companyData[:1])
+
+cik = companyData[0:1].cik_str[0]
+
+# get company specific filing metadata
+filingMetadata = requests.get(
+    f'https://data.sec.gov/submissions/CIK{cik}.json',
+    headers=headers
+    )
+
+# review json
+print(filingMetadata.json().keys())
+filingMetadata.json()['filings']
+filingMetadata.json()['filings'].keys()
+filingMetadata.json()['filings']['recent']
+filingMetadata.json()['filings']['recent'].keys()
+
+# dictionary to dataframe
+allForms = pd.DataFrame.from_dict(
+    filingMetadata.json()['filings']['recent']
+    )
+
+# review columns
+allForms.columns
+allForms[['accessionNumber', 'reportDate', 'form']].head(50)
+
+# 10-Q metadata
+allForms.iloc[11]
+
+# get company facts data
+companyFacts = requests.get(
+    f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
+    headers=headers
+    )
+
+#review data
+companyFacts.json().keys()
+companyFacts.json()['facts']
+companyFacts.json()['facts'].keys()
+
+# filing metadata
+companyFacts.json()['facts']['dei'][
+    'EntityCommonStockSharesOutstanding']
+companyFacts.json()['facts']['dei'][
+    'EntityCommonStockSharesOutstanding'].keys()
+companyFacts.json()['facts']['dei'][
+    'EntityCommonStockSharesOutstanding']['units']
+companyFacts.json()['facts']['dei'][
+    'EntityCommonStockSharesOutstanding']['units']['shares']
+companyFacts.json()['facts']['dei'][
+    'EntityCommonStockSharesOutstanding']['units']['shares'][0]
+
+# concept data // financial statement line items
+companyFacts.json()['facts']['us-gaap']
+companyFacts.json()['facts']['us-gaap'].keys()
+
+# different amounts of data available per concept
+companyFacts.json()['facts']['us-gaap']['AccountsPayable']
+companyFacts.json()['facts']['us-gaap']['Revenues']
+companyFacts.json()['facts']['us-gaap']['Assets']
+
+# get company concept data
+companyConcept = requests.get(
+    (
+    f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}'
+    f'/us-gaap/Assets.json'
+    ),
+    headers=headers
+    )
+
+# review data
+companyConcept.json().keys()
+companyConcept.json()['units']
+companyConcept.json()['units'].keys()
+companyConcept.json()['units']['USD']
+companyConcept.json()['units']['USD'][0]
+
+# parse assets from single filing
+companyConcept.json()['units']['USD'][0]['val']
+
+# get all filings data
+assetsData = pd.DataFrame.from_dict((
+    companyConcept.json()['units']['USD']))
+
+# review data
+assetsData.columns
+assetsData.form
+
+# get assets from 10Q forms and reset index
+assets10Q = assetsData[assetsData.form == '10-Q']
+assets10Q = assets10Q.reset_index(drop=True)
+
+print(assets10Q)
\ No newline at end of file
diff --git a/app/cron_company_news.py b/app/cron_company_news.py
index 5b93f55..931a441 100644
--- a/app/cron_company_news.py
+++ b/app/cron_company_news.py
@@ -37,45 +37,19 @@ async def filter_and_deduplicate(data, excluded_domains=None, deduplicate_key='t
     return filtered_data
 
 
-async def fetch_news(session, url):
-    async with session.get(url) as response:
-        return await response.json()
-
-async def save_news(data, symbol):
-    #os.makedirs("json/market-news/companies", exist_ok=True)
+async def save_quote_as_json(symbol, data):
     with open(f"json/market-news/companies/{symbol}.json", 'w') as file:
         ujson.dump(data, file)
 
-async def process_symbols(symbols):
-    limit = 200
-    chunk_size = 50 # Adjust this value based on API limitations
-
+async def get_data(chunk):
+    company_tickers = ','.join(chunk)
     async with aiohttp.ClientSession() as session:
-        for i in tqdm(range(0, len(symbols), chunk_size)):
-            chunk = symbols[i:i+chunk_size]
-            company_tickers = ','.join(chunk)
-            url = f'https://financialmodelingprep.com/api/v3/stock_news?tickers={company_tickers}&limit={limit}&apikey={api_key}'
-
-            data = await fetch_news(session, url)
-
-            custom_domains = ['prnewswire.com', 'globenewswire.com', 'accesswire.com']
-            data = await filter_and_deduplicate(data, excluded_domains=custom_domains)
-
-            grouped_data = {}
-            for item in data:
-                symbol = item['symbol']
-                if symbol in chunk:
-                    if symbol not in grouped_data:
-                        grouped_data[symbol] = []
-                    grouped_data[symbol].append(item)
-
-            # Save the filtered data for each symbol in the chunk
-            tasks = []
-            for symbol in chunk:
-                filtered_data = grouped_data.get(symbol, [])
-                tasks.append(save_news(filtered_data, symbol))
-
-            await asyncio.gather(*tasks)
+        url = f'https://financialmodelingprep.com/api/v3/stock_news?tickers={company_tickers}&page=0&limit=2000&apikey={api_key}'
+        async with session.get(url) as response:
+            if response.status == 200:
+                return await response.json()
+            else:
+                return []
 
 def get_symbols(db_name, table_name):
     with sqlite3.connect(db_name) as con:
@@ -90,7 +64,17 @@ async def main():
     crypto_symbols = get_symbols('crypto.db', 'cryptos')
 
     total_symbols = stock_symbols + etf_symbols + crypto_symbols
-    await process_symbols(total_symbols)
+    chunk_size = len(total_symbols) // 70 # Divide the list into N chunks
+    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
+
+    for chunk in tqdm(chunks):
+        data = await get_data(chunk)
+        for symbol in chunk:
+            filtered_data = [item for item in data if item['symbol'] == symbol]
+            filtered_data = await filter_and_deduplicate(filtered_data)
+            if len(filtered_data) > 0:
+                await save_quote_as_json(symbol, filtered_data)
+
 
 if __name__ == "__main__":
     try:
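
Review note, not part of the patch: the new main() computes chunk_size = len(total_symbols) // 70, which evaluates to 0 whenever fewer than 70 symbols are loaded, and range(0, len(total_symbols), 0) then raises ValueError. A minimal guard, sketched with the same names the diff uses:

    # hypothetical tweak, not in the patch: keep the range step positive for short symbol lists
    chunk_size = max(1, len(total_symbols) // 70)
    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]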