bugfixing company_news
This commit is contained in:
parent b062213410
commit c7cad764b3

132 app/cron_business_metrics.py (new file)
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
"""
SEC Filing Scraper
@author: AdamGetbags
"""

# import modules
import requests
import pandas as pd

# create request header
headers = {'User-Agent': "email@address.com"}

# get all companies data
companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers
)

# review response / keys
print(companyTickers.json().keys())

# format response to dictionary and get first key/value
firstEntry = companyTickers.json()['0']

# parse CIK // without leading zeros
directCik = companyTickers.json()['0']['cik_str']

# dictionary to dataframe
companyData = pd.DataFrame.from_dict(companyTickers.json(),
                                     orient='index')

# add leading zeros to CIK
companyData['cik_str'] = companyData['cik_str'].astype(
    str).str.zfill(10)

# review data
print(companyData[:1])

cik = companyData[0:1].cik_str[0]

# get company specific filing metadata
filingMetadata = requests.get(
    f'https://data.sec.gov/submissions/CIK{cik}.json',
    headers=headers
)

# review json
print(filingMetadata.json().keys())
filingMetadata.json()['filings']
filingMetadata.json()['filings'].keys()
filingMetadata.json()['filings']['recent']
filingMetadata.json()['filings']['recent'].keys()

# dictionary to dataframe
allForms = pd.DataFrame.from_dict(
    filingMetadata.json()['filings']['recent']
)

# review columns
allForms.columns
allForms[['accessionNumber', 'reportDate', 'form']].head(50)

# 10-Q metadata
allForms.iloc[11]

# get company facts data
companyFacts = requests.get(
    f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
    headers=headers
)

# review data
companyFacts.json().keys()
companyFacts.json()['facts']
companyFacts.json()['facts'].keys()

# filing metadata
companyFacts.json()['facts']['dei'][
    'EntityCommonStockSharesOutstanding']
companyFacts.json()['facts']['dei'][
    'EntityCommonStockSharesOutstanding'].keys()
companyFacts.json()['facts']['dei'][
    'EntityCommonStockSharesOutstanding']['units']
companyFacts.json()['facts']['dei'][
    'EntityCommonStockSharesOutstanding']['units']['shares']
companyFacts.json()['facts']['dei'][
    'EntityCommonStockSharesOutstanding']['units']['shares'][0]

# concept data // financial statement line items
companyFacts.json()['facts']['us-gaap']
companyFacts.json()['facts']['us-gaap'].keys()

# different amounts of data available per concept
companyFacts.json()['facts']['us-gaap']['AccountsPayable']
companyFacts.json()['facts']['us-gaap']['Revenues']
companyFacts.json()['facts']['us-gaap']['Assets']

# get company concept data
companyConcept = requests.get(
    (
        f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}'
        f'/us-gaap/Assets.json'
    ),
    headers=headers
)

# review data
companyConcept.json().keys()
companyConcept.json()['units']
companyConcept.json()['units'].keys()
companyConcept.json()['units']['USD']
companyConcept.json()['units']['USD'][0]

# parse assets from single filing
companyConcept.json()['units']['USD'][0]['val']

# get all filings data
assetsData = pd.DataFrame.from_dict((
    companyConcept.json()['units']['USD']))

# review data
assetsData.columns
assetsData.form

# get assets from 10Q forms and reset index
assets10Q = assetsData[assetsData.form == '10-Q']
assets10Q = assets10Q.reset_index(drop=True)

print(assets10Q)
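Note on the new script: each record under the companyconcept endpoint's 'units'['USD'] list carries period and filing fields alongside 'val', so the assets10Q frame built above can be turned into a dated series. A minimal follow-up sketch, assuming the script has already run and that the 'end' (period-end date) and 'val' columns are present in the response, as they typically are for us-gaap/Assets:

# Follow-up sketch (assumes assets10Q from the script above).
# 'end' and 'val' come from the SEC XBRL companyconcept payload.
assetsSeries = assets10Q[['end', 'val']].copy()
assetsSeries['end'] = pd.to_datetime(assetsSeries['end'])
assetsSeries = assetsSeries.sort_values('end').drop_duplicates('end', keep='last')
# quarter-over-quarter change in reported total assets
assetsSeries['qoq_change'] = assetsSeries['val'].pct_change()
print(assetsSeries.tail())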
@@ -37,45 +37,19 @@ async def filter_and_deduplicate(data, excluded_domains=None, deduplicate_key='t
     return filtered_data
 
 
-async def fetch_news(session, url):
-    async with session.get(url) as response:
-        return await response.json()
-
-
-async def save_news(data, symbol):
-    #os.makedirs("json/market-news/companies", exist_ok=True)
+async def save_quote_as_json(symbol, data):
     with open(f"json/market-news/companies/{symbol}.json", 'w') as file:
         ujson.dump(data, file)
 
 
-async def process_symbols(symbols):
-    limit = 200
-    chunk_size = 50  # Adjust this value based on API limitations
-
-    async with aiohttp.ClientSession() as session:
-        for i in tqdm(range(0, len(symbols), chunk_size)):
-            chunk = symbols[i:i+chunk_size]
-            company_tickers = ','.join(chunk)
-            url = f'https://financialmodelingprep.com/api/v3/stock_news?tickers={company_tickers}&limit={limit}&apikey={api_key}'
-            data = await fetch_news(session, url)
-
-            custom_domains = ['prnewswire.com', 'globenewswire.com', 'accesswire.com']
-            data = await filter_and_deduplicate(data, excluded_domains=custom_domains)
-
-            grouped_data = {}
-            for item in data:
-                symbol = item['symbol']
-                if symbol in chunk:
-                    if symbol not in grouped_data:
-                        grouped_data[symbol] = []
-                    grouped_data[symbol].append(item)
-
-            # Save the filtered data for each symbol in the chunk
-            tasks = []
-            for symbol in chunk:
-                filtered_data = grouped_data.get(symbol, [])
-                tasks.append(save_news(filtered_data, symbol))
-
-            await asyncio.gather(*tasks)
+async def get_data(chunk):
+    company_tickers = ','.join(chunk)
+    async with aiohttp.ClientSession() as session:
+        url = f'https://financialmodelingprep.com/api/v3/stock_news?tickers={company_tickers}&page=0&limit=2000&apikey={api_key}'
+        async with session.get(url) as response:
+            if response.status == 200:
+                return await response.json()
+            else:
+                return []
 
 
 def get_symbols(db_name, table_name):
     with sqlite3.connect(db_name) as con:
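The body of filter_and_deduplicate sits outside this hunk; only its (truncated) signature is visible in the hunk header above. For orientation, a plausible sketch of such a helper, assuming it drops items hosted on an excluded domain and keeps the first item per deduplication key. The default key and all internals below are guesses, not the repository's actual implementation:

# Hypothetical sketch only -- not the repository's code.
from urllib.parse import urlparse

async def filter_and_deduplicate(data, excluded_domains=None, deduplicate_key='title'):
    excluded_domains = excluded_domains or []
    seen = set()
    result = []
    for item in data:
        # skip items hosted on an excluded news-wire domain
        domain = urlparse(item.get('url', '')).netloc
        if any(domain.endswith(d) for d in excluded_domains):
            continue
        # keep only the first occurrence of each deduplication key
        key = item.get(deduplicate_key)
        if key in seen:
            continue
        seen.add(key)
        result.append(item)
    return result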
@@ -90,7 +64,17 @@ async def main():
     crypto_symbols = get_symbols('crypto.db', 'cryptos')
     total_symbols = stock_symbols + etf_symbols + crypto_symbols
 
-    await process_symbols(total_symbols)
+    chunk_size = len(total_symbols) // 70  # Divide the list into N chunks
+    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
+
+    for chunk in tqdm(chunks):
+        data = await get_data(chunk)
+        for symbol in chunk:
+            filtered_data = [item for item in data if item['symbol'] == symbol]
+            filtered_data = await filter_and_deduplicate(filtered_data)
+            if len(filtered_data) > 0:
+                await save_quote_as_json(symbol, filtered_data)
 
 
 if __name__ == "__main__":
     try:
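One edge case in the new main() loop: len(total_symbols) // 70 floors to zero when fewer than 70 symbols come back from the databases, and range() with a step of 0 raises ValueError. A defensive variant, as a sketch only (assumption: behavior is otherwise unchanged):

# Sketch: clamp the chunk size so the slicing step is never zero.
chunk_size = max(1, len(total_symbols) // 70)
chunks = [total_symbols[i:i + chunk_size]
          for i in range(0, len(total_symbols), chunk_size)]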