update metrics cron job

MuslemRahimi 2024-12-18 17:50:34 +01:00
parent cd7eb8d857
commit 8f3f1c49c9


@@ -1,468 +1,107 @@
-from edgar import *
-import ast
-import ujson
-from tqdm import tqdm
-from datetime import datetime
-from collections import defaultdict
-import re
-
-# Tell the SEC who you are
-set_identity("Max Mustermann max.mustermann@indigo.com")
-
-# The last quarter's (Q4) result is not shown in any SEC quarterly file.
-# But the 10-K (e.g. https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm)
-# reports the annual total, so Q4 can be obtained by subtracting the three
-# quarterly results from it (clumsy, but it works).
+from datetime import datetime, timedelta
+import orjson
+import time
+import sqlite3
+import asyncio
+import aiohttp
+import random
+from tqdm import tqdm
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+api_key = os.getenv('FMP_API_KEY')
-def format_name(name):
-    # Step 1: Insert spaces at camel-case transitions (lowercase followed by uppercase)
-    formatted_name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)
-    # Step 2: Replace "And" with "&" and drop "Revenue"
-    formatted_name = formatted_name.replace("And", " & ").replace('Revenue', '')
-    return formatted_name
-def add_value_growth(data):
-    """
-    Adds a new key 'valueGrowth' to each entry in the data list.
-
-    Parameters:
-    - data (list): A list of dictionaries containing date and value lists.
-
-    Returns:
-    - list: A new list with the 'valueGrowth' key added to each dictionary.
-    """
-    # Initialize a new list for the output data
-    updated_data = []
-    # Loop through the data from the latest to the oldest
-    for i in range(len(data)):
-        try:
-            current_entry = data[i].copy()  # Create a copy of the current entry
-            current_values = current_entry['value']
-            # Initialize the growth percentages list
-            if i < len(data) - 1:  # Only compute growth if there is a next entry
-                next_values = data[i + 1]['value']
-                growth_percentages = []
-                for j in range(len(current_values)):
-                    # Convert values to integers if they are strings
-                    next_value = int(next_values[j]) if isinstance(next_values[j], (int, str)) else 0
-                    current_value = int(current_values[j]) if isinstance(current_values[j], (int, str)) else 0
-                    # Calculate the growth percentage if next_value is not zero
-                    if next_value != 0:
-                        growth = round(((current_value - next_value) / next_value) * 100, 2)
-                    else:
-                        growth = None  # Cannot calculate growth if the next value is zero
-                    growth_percentages.append(growth)
-                current_entry['valueGrowth'] = growth_percentages  # Add the growth percentages
-            else:
-                current_entry['valueGrowth'] = [None] * len(current_values)  # No growth for the last entry
-            updated_data.append(current_entry)  # Append the updated entry to the output list
-        except:
-            pass
-    return updated_data
-def sort_by_latest_date_and_highest_value(data):
-    # Define a key function that converts the date string to a datetime object
-    # and uses the negative of the integer value for descending order
-    def sort_key(item):
-        date = datetime.strptime(item['date'], '%Y-%m-%d')
-        value = -int(item['value'])  # Negative for descending order
-        return (date, value)
-    # Sort the list
-    sorted_data = sorted(data, key=sort_key, reverse=True)
-    return sorted_data
-def aggregate_other_values(data):
-    aggregated = defaultdict(int)
-    result = []
-    # First pass: aggregate 'Other' values and keep non-'Other' items
-    for item in data:
-        date = item['date']
-        value = int(item['value'])
-        if item['name'] == 'Other':
-            aggregated[date] += value
-        else:
-            result.append(item)
-    # Second pass: add the aggregated 'Other' values
-    for date, value in aggregated.items():
-        result.append({'name': 'Other', 'value': int(value), 'date': date})
-    return sorted(result, key=lambda x: (x['date'], x['name']))
-# Define quarter-end dates for a given year
-def closest_quarter_end(date_str):
-    date = datetime.strptime(date_str, "%Y-%m-%d")
-    year = date.year
-    # Define quarter-end dates for the current year
-    q1 = datetime(year, 3, 31)
-    q2 = datetime(year, 6, 30)
-    q3 = datetime(year, 9, 30)
-    q4 = datetime(year, 12, 31)
-    # If the date is in January, return the last day of Q4 of the previous year
-    if date.month == 1:
-        closest = datetime(year - 1, 12, 31)  # Last quarter of the previous year
-    else:
-        # Adjust to next year's Q4 if the date is in the last quarter of the current year
-        if date >= q4:
-            closest = q4.replace(year=year + 1)  # Next year's last quarter
-        else:
-            # Find the closest quarter date
-            closest = min([q1, q2, q3, q4], key=lambda d: abs(d - date))
-    # Return the closest quarter date in 'YYYY-MM-DD' format
-    return closest.strftime("%Y-%m-%d")
-def compute_q4_results(dataset):
-    # Group data by year and name
-    yearly_data = defaultdict(lambda: defaultdict(dict))
-    for item in dataset:
-        date = datetime.strptime(item['date'], '%Y-%m-%d')
-        year = date.year
-        quarter = (date.month - 1) // 3 + 1
-        yearly_data[year][item['name']][quarter] = item['value']
-    # Calculate Q4 results and update the dataset
-    for year in sorted(yearly_data.keys(), reverse=True):
-        for name, quarters in yearly_data[year].items():
-            if 4 in quarters:  # This is the year-end total
-                total = quarters[4]
-                q1 = quarters.get(1, 0)
-                q2 = quarters.get(2, 0)
-                q3 = quarters.get(3, 0)
-                q4_value = total - (q1 + q2 + q3)
-                # Update the original dataset
-                for item in dataset:
-                    if item['name'] == name and item['date'] == f'{year}-12-31':
-                        item['value'] = q4_value
-                        break
-    return dataset
-def generate_geography_dataset(dataset):
-    country_replacements = {
-        "americas": "United States",
-        "unitedstates": "United States",
-        "videogamebrandsunitedstates": "United States",
-        "greaterchina": "China",
-        "country:us": "United States",
-        "country:cn": "China",
-        "chinaincludinghongkong": "China"
-    }
-    # Custom order for specific countries
-    custom_order = {
-        'United States': 2,
-        'China': 1,
-        'Other': 0
-    }
-    aggregated_data = {}
-    for item in dataset:
-        try:
-            name = item.get('name', '').lower()
-            date = item.get('date')
-            value = int(float(item.get('value', 0)))
-            year = int(date[:4])
-            if year < 2019:
-                continue  # Skip this item if the year is less than 2019
-            # Replace the country name if necessary
-            country_name = country_replacements.get(name, 'Other')
-            # Use (country_name, date) as the key to sum values
-            key = (country_name, date)
-            if key in aggregated_data:
-                aggregated_data[key] += value  # Add the value if the country-date pair exists
-            else:
-                aggregated_data[key] = value  # Initialize the value for a new country-date pair
-        except:
-            pass
-    # Convert the aggregated data back into the desired list format
-    dataset = [{'name': country, 'date': date, 'value': total_value} for (country, date), total_value in aggregated_data.items()]
-    dataset = aggregate_other_values(dataset)
-    dataset = sorted(
-        dataset,
-        key=lambda item: (datetime.strptime(item['date'], '%Y-%m-%d'), custom_order.get(item['name'], 3)),
-        reverse=True
-    )
-    #dataset = compute_q4_results(dataset)
-    result = {}
-    unique_names = sorted(
-        list(set(item['name'] for item in dataset if item['name'] not in {'CloudServiceAgreements'})),
-        key=lambda item: custom_order.get(item, 4),  # Use 4 as default for items not in custom_order
-        reverse=True)
-    result = {}
-    # Iterate through the original data
-    for item in dataset:
-        # Get the date and value
-        date = item['date']
-        value = item['value']
-        # Initialize the dictionary for the date if not already done
-        if date not in result:
-            result[date] = {'date': date, 'value': []}
-        # Append the value to the list
-        result[date]['value'].append(value)
-    # Convert the result dictionary to a list
-    res_list = list(result.values())
-    # Print the final result
-    res_list = add_value_growth(res_list)
-    final_result = {'names': unique_names, 'history': res_list}
-    return final_result
+def convert_to_dict(data):
+    result = {}
+    for entry in data:
+        for date, categories in entry.items():
+            if date not in result:
+                result[date] = {}
+            for category, amount in categories.items():
+                result[date][category] = amount
+    return result
+
+async def save_json(data, symbol):
+    with open(f"json/business-metrics/{symbol}.json", 'wb') as file:
+        file.write(orjson.dumps(data))
-def generate_revenue_dataset(dataset):
-    name_replacements = {
-        "datacenter": "Data Center",
-        "professionalvisualization": "Visualization",
-        "oemandother": "OEM & Other",
-        "automotive": "Automotive",
-        "oemip": "OEM & Other",
-        "gaming": "Gaming",
-        "mac": "Mac",
-        "iphone": "IPhone",
-        "ipad": "IPad",
-        "wearableshomeandaccessories": "Wearables",
-        "hardwareandaccessories": "Hardware & Accessories",
-        "software": "Software",
-        "collectibles": "Collectibles",
-        "automotivesales": "Auto",
-        "automotiveleasing": "Auto Leasing",
-        "energygenerationandstoragesegment": "Energy and Storage",
-        "servicesandother": "Services & Other",
-        "automotiveregulatorycredits": "Regulatory Credits",
-        "intelligentcloud": "Intelligent Cloud",
-        "productivityandbusinessprocesses": "Productivity & Business",
-        "searchandnewsadvertising": "Advertising",
-        "linkedincorporation": "LinkedIn",
-        "morepersonalcomputing": "More Personal Computing",
-        "serviceother": "Service Other",
-        "governmentoperatingsegment": "Government Operating Segment",
-        "internationaldevelopmentallicensedmarketsandcorporate": "License Market",
-        "youtubeadvertisingrevenue": "Youtube Ads",
-        "googleadvertisingrevenue": "Google Ads",
-        "cloudservicesandlicensesupport": "Cloude Services & Support",
-        "infrastructurecloudservicesandlicensesupport": "Infrastructure Cloud",
-        "applicationscloudservicesandlicensesupport": "Application Cloud"
-    }
-    excluded_names = {'government', 'enterpriseembeddedandsemicustom', 'computingandgraphics', 'automotiveleasing ', 'officeproductsandcloudservices', 'serverproductsandcloudservices', 'automotiverevenues', 'automotive', 'computeandnetworking', 'graphics', 'gpu', 'automotivesegment', 'energygenerationandstoragesales', 'energygenerationandstorage', 'automotivesaleswithoutresalevalueguarantee', 'salesandservices', 'compute', 'networking', 'cloudserviceagreements', 'digital', 'allother', 'preownedvideogameproducts'}
-    dataset = [item for item in dataset if item['name'].lower() not in excluded_names]
-    # Find all unique names and dates
-    all_dates = sorted(set(item['date'] for item in dataset))
-    all_names = sorted(set(item['name'] for item in dataset))
-    dataset = [revenue for revenue in dataset if revenue['name'].lower() not in excluded_names]
-    # Check and fill missing combinations at the beginning
-    name_date_map = defaultdict(lambda: defaultdict(lambda: None))
-    for item in dataset:
-        name_date_map[item['name']][item['date']] = item['value']
-    # Ensure all names have entries for all dates
-    for name in all_names:
-        for date in all_dates:
-            if date not in name_date_map[name]:
-                dataset.append({'name': name, 'date': date, 'value': None})
-    # Clean and process the dataset values
-    processed_dataset = []
-    for item in dataset:
-        if item['value'] not in (None, '', 0):
-            processed_dataset.append({
-                'name': item['name'],
-                'date': item['date'],
-                'value': int(float(item['value']))
-            })
-        else:
-            processed_dataset.append({
-                'name': item['name'],
-                'date': item['date'],
-                'value': None
-            })
-    dataset = processed_dataset
-    # If the value on the latest date is null or 0, remove those names from the list
-    dataset = sorted(dataset, key=lambda item: datetime.strptime(item['date'], '%Y-%m-%d'), reverse=True)
-    remember_names = set()  # Use a set for faster membership checks
-    first_date = dataset[0]['date']
-    # Iterate through the dataset to remember names where the date matches first_date and the value is None
-    for item in dataset:
-        if item['date'] == first_date and (item['value'] == None or item['value'] == 0):
-            remember_names.add(item['name'])
-            print(item['name'])
-    # Use a list comprehension to filter out items in remember_names
-    dataset = [{**item} for item in dataset if item['name'] not in remember_names]
-    dataset = [item for item in dataset if datetime.strptime(item['date'], '%Y-%m-%d').year >= 2019]
-    # Group by name and calculate the total value
-    name_totals = defaultdict(int)
-    for item in dataset:
-        name_totals[item['name']] += item['value'] if item['value'] != None else 0
-    # Sort names by total value and get the top 5, ensuring excluded names are not considered
-    top_names = sorted(
-        [(name, total) for name, total in name_totals.items() if name.lower() not in excluded_names],
-        key=lambda x: x[1],
-        reverse=True
-    )[:5]
-    top_names = [name for name, _ in top_names]
-    # Filter the dataset to include only the top 5 names
-    dataset = [item for item in dataset if item['name'] in top_names]
-    # Sort the dataset
-    dataset.sort(key=lambda item: (datetime.strptime(item['date'], '%Y-%m-%d'), item['value'] if item['value'] != None else 0), reverse=True)
-    top_names = [name_replacements.get(name.lower(), format_name(name)) for name in top_names]
-    print(top_names)
-    result = {}
-    for item in dataset:
-        date = item['date']
-        value = item['value']
-        if date not in result:
-            result[date] = {'date': date, 'value': []}
-        result[date]['value'].append(value)
-    # Convert the result dictionary to a list
-    res_list = list(result.values())
-    # Add value growth (assuming the add_value_growth function exists)
-    res_list = add_value_growth(res_list)
-    final_result = {'names': top_names, 'history': res_list}
-    return final_result
+def prepare_dataset(data):
+    data = convert_to_dict(data)
+    res_list = {}
+    revenue_name_list = []
+    revenue_history_list = []
+    index = 0
+    for date, info in data.items():
+        value_list = []
+        for name, val in info.items():
+            if index == 0:
+                revenue_name_list.append(name)
+            if name in revenue_name_list:
+                value_list.append(val)
+        if len(value_list) > 0:
+            revenue_history_list.append({'date': date, 'value': value_list})
+        index += 1
+    revenue_history_list = sorted(revenue_history_list, key=lambda x: datetime.strptime(x['date'], '%Y-%m-%d'))
+    # Initialize 'valueGrowth' as None for all entries
+    for item in revenue_history_list:
+        item['valueGrowth'] = [None] * len(item['value'])
+    # Calculate valueGrowth for each item based on the previous date's value
+    for i in range(1, len(revenue_history_list)):  # Start from the second item
+        current_item = revenue_history_list[i]
+        prev_item = revenue_history_list[i - 1]
+        value_growth = []
+        for cur_value, prev_value in zip(current_item['value'], prev_item['value']):
+            growth = round(((cur_value - prev_value) / prev_value) * 100, 2)
+            value_growth.append(growth)
+        current_item['valueGrowth'] = value_growth
+    revenue_history_list = sorted(revenue_history_list, key=lambda x: datetime.strptime(x['date'], '%Y-%m-%d'), reverse=True)
+    res_list = {'revenue': {'names': revenue_name_list, 'history': revenue_history_list}}
+    return res_list
-def process_filings(filings, symbol):
-    revenue_sources = []
-    geography_sources = []
-    for i in range(0, 17):
-        try:
-            filing_xbrl = filings[i].xbrl()
-            facts = filing_xbrl.facts.data
-            latest_rows = facts.groupby('dimensions').head(1)
-            for index, row in latest_rows.iterrows():
-                dimensions_str = row.get("dimensions", "{}")
-                try:
-                    dimensions_dict = ast.literal_eval(dimensions_str) if isinstance(dimensions_str, str) else dimensions_str
-                except (ValueError, SyntaxError):
-                    dimensions_dict = {}
-                #print(dimensions_dict)
-                for column_name in [
-                    "srt:StatementGeographicalAxis",
-                    "us-gaap:StatementBusinessSegmentsAxis",
-                    "srt:ProductOrServiceAxis",
-                ]:
-                    product_dimension = dimensions_dict.get(column_name) if isinstance(dimensions_dict, dict) else None
-                    if row["namespace"] == "us-gaap" and product_dimension is not None and (
-                        product_dimension.startswith(symbol.lower() + ":") or
-                        product_dimension.startswith("country" + ":") or
-                        product_dimension.startswith("us-gaap" + ":") or
-                        product_dimension.startswith("srt" + ":") or
-                        product_dimension.startswith("goog" + ":")
-                    ):
-                        replacements = {
-                            "Member": "",
-                            "VideoGameAccessories": "HardwareAndAccessories",
-                            "NewVideoGameHardware": "HardwareAndAccessories",
-                            "NewVideoGameSoftware": "Software",
-                            f"{symbol.lower()}:": "",
-                            "goog:": "",
-                            "us-gaap:": "",
-                            "srt:": "",
-                            "SegmentMember": "",
-                        }
-                        name = product_dimension
-                        for old, new in replacements.items():
-                            name = name.replace(old, new)
-                        if symbol in ['ORCL', 'SAVE', 'BA', 'NFLX', 'LLY', 'MSFT', 'META', 'NVDA', 'AAPL', 'GME']:
-                            column_list = ["srt:ProductOrServiceAxis"]
-                        else:
-                            column_list = ["srt:ProductOrServiceAxis", "us-gaap:StatementBusinessSegmentsAxis"]
-                        if column_name in column_list:
-                            revenue_sources.append({"name": name, "value": row["value"], "date": row["end_date"]})
-                        else:
-                            geography_sources.append({"name": name, "value": row["value"], "date": row["end_date"]})
-        except Exception as e:
-            print(e)
-            pass
-    return revenue_sources, geography_sources
+async def get_data(session, total_symbols):
+    for symbol in total_symbols:
+        url = f"https://financialmodelingprep.com/api/v4/revenue-product-segmentation?symbol={symbol}&structure=flat&period=quarter&apikey={api_key}"
+        try:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    if len(data) > 0:
+                        data = prepare_dataset(data)
+                        await save_json(data, symbol)
+        except Exception as e:
+            print(f"Error fetching data for {symbol}: {e}")
-def run(symbol):
-    # First try with 10-Q filings only
-    filings = Company(symbol).get_filings(form=["10-Q"]).latest(20)
-    revenue_sources, geography_sources = process_filings(filings, symbol)
-
-    # If no geography sources were found, try the 10-K filings
-    if not geography_sources:
-        print(f"No geography sources found in 10-Q for {symbol}, checking 10-K filings...")
-        filings_10k = Company(symbol).get_filings(form=["10-K"]).latest(20)
-        _, geography_sources = process_filings(filings_10k, symbol)
-
-    print(revenue_sources)
-    #print(geography_sources)
-    revenue_dataset = generate_revenue_dataset(revenue_sources)
-    geographic_dataset = generate_geography_dataset(geography_sources)
-    final_dataset = {'revenue': revenue_dataset, 'geographic': geographic_dataset}
-    with open(f"json/business-metrics/{symbol}.json", "w") as file:
-        ujson.dump(final_dataset, file)
+async def run():
+    con = sqlite3.connect('stocks.db')
+    cursor = con.cursor()
+    cursor.execute("PRAGMA journal_mode = wal")
+    cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
+    total_symbols = [row[0] for row in cursor.fetchall()]
+    total_symbols = ['AAPL']  # For testing purposes
+    con.close()
+    async with aiohttp.ClientSession() as session:
+        await get_data(session, total_symbols)
-if __name__ == "__main__":
-    for symbol in ['ORCL']:  # ['ORCL','GOOGL','AMD','SAVE','BA','ADBE','NFLX','PLTR','MSFT','META','TSLA','NVDA','AAPL','GME']:
-        run(symbol)
+if __name__ == "__main__":
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(run())
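
For context on the removed EDGAR path: as the comment above compute_q4_results notes, SEC quarterly filings never report Q4 directly, so the job derived it by subtracting Q1-Q3 from the annual 10-K total. A minimal worked sketch of that arithmetic; the figures are invented for illustration, not taken from any filing:

# Hypothetical segment revenue in millions; key 4 holds the FY total,
# mirroring how compute_q4_results receives the year-end row.
quarters = {1: 7_000, 2: 6_500, 3: 7_200, 4: 26_000}

total = quarters[4]
q4_value = total - (quarters.get(1, 0) + quarters.get(2, 0) + quarters.get(3, 0))
print(q4_value)  # 5300 -> stored as the 'value' of the YYYY-12-31 entry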
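On the new FMP path, convert_to_dict implies the revenue-product-segmentation endpoint returns a list of single-key objects, each mapping a date to its category/value pairs, and prepare_dataset pivots that into the names/history layout with quarter-over-quarter valueGrowth. Below is a condensed, self-contained sketch of that round trip under those assumptions; the payload shape is inferred from convert_to_dict, the sample figures are invented, and a division-by-zero guard is added here that the committed code omits:

from datetime import datetime

# Assumed response shape for revenue-product-segmentation (sample values invented)
payload = [
    {"2024-09-28": {"iPhone": 46222, "Mac": 7744, "iPad": 6950}},
    {"2024-06-29": {"iPhone": 39296, "Mac": 7009, "iPad": 7162}},
]

# Flatten to {date: {category: value}}, as convert_to_dict does
flat = {}
for entry in payload:
    for date, categories in entry.items():
        flat.setdefault(date, {}).update(categories)

# Category names come from the first entry, as in prepare_dataset
first_date, first_categories = next(iter(payload[0].items()))
names = list(first_categories)

history = sorted(
    ({"date": d, "value": list(v.values())} for d, v in flat.items()),
    key=lambda x: datetime.strptime(x["date"], "%Y-%m-%d"),
)

# Quarter-over-quarter growth against the previous date's values
history[0]["valueGrowth"] = [None] * len(history[0]["value"])
for prev, cur in zip(history, history[1:]):
    cur["valueGrowth"] = [
        round((c - p) / p * 100, 2) if p else None  # guard: the committed code divides unconditionally
        for c, p in zip(cur["value"], prev["value"])
    ]

print({"revenue": {"names": names, "history": history[::-1]}})  # newest first, as saved to JSON

Running this prints one history entry per quarter, e.g. the 2024-09-28 row carries valueGrowth of [17.63, 10.49, -2.96] against 2024-06-29, which matches the shape the cron job writes to json/business-metrics/{symbol}.json.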