From 064b7ad10d37c2d7ac1ee63e82d55f2c5ba4fa66 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Tue, 6 Aug 2024 22:24:50 +0200
Subject: [PATCH] add corporate lobbying cron job

---
 app/cron_corporate_lobbying.py | 404 +++++++++++++++++++++++++++++++++
 requirements.txt               |   4 +-
 2 files changed, 407 insertions(+), 1 deletion(-)
 create mode 100644 app/cron_corporate_lobbying.py

diff --git a/app/cron_corporate_lobbying.py b/app/cron_corporate_lobbying.py
new file mode 100644
index 0000000..c42d041
--- /dev/null
+++ b/app/cron_corporate_lobbying.py
@@ -0,0 +1,404 @@
# Standard-library and third-party imports.
from csv import DictWriter
from datetime import datetime
import json
from math import ceil
import os
from time import sleep

import pandas as pd
from dotenv import load_dotenv
import requests


BASE_SESSION = requests.Session()

BASE_API_URL = "https://lda.senate.gov/api/v1"

load_dotenv()
API_KEY = os.getenv('SENATE_API_KEY')


LDA_API_ENDPOINTS = dict(
    filing_types=f"{BASE_API_URL}/constants/filing/filingtypes/",
    filings=f"{BASE_API_URL}/filings/",
)

# Results per page. The Senate has at times capped page_size at 25 for
# unauthenticated requests; authenticated requests can use larger pages.
RESULTS_PER_PAGE = 250

TIME_PERIOD_SLUGS = dict(
    Q1="first_quarter",
    Q2="second_quarter",
    Q3="third_quarter",
    Q4="fourth_quarter",
    MY="mid_year",
    YE="year_end",
)

TIME_PERIOD_PREFIXES = dict(
    Q1="1st Quarter",
    Q2="2nd Quarter",
    Q3="3rd Quarter",
    Q4="4th Quarter",
    MY="Mid-Year",
    YE="Year-End",
)


def parse_safe_query_dict(raw_dict):
    """Serialize a dict into a querystring; values are assumed to be URL-safe."""
    return "&".join([f"{k}={v}" for k, v in raw_dict.items()])


def querystring_to_dict(raw_url):
    """Parse the query portion of a URL into a flat dict of strings."""
    return dict(
        pair.split("=", 1)
        for pair in raw_url.split("?")[1].split("&")
    )


with open("json/corporate-lobbying/self_lobbying_overrides.json", "r") as overrides_file:
    SELF_LOBBYING_OVERRIDES = json.load(overrides_file)


def get_types_for_quarter(time_period, common_session=None):
    """Fetch the filing-type constants that belong to the given time period."""
    session = BASE_SESSION if common_session is None else common_session

    rq = requests.Request(
        "GET",
        LDA_API_ENDPOINTS["filing_types"],
        headers={
            "Accept-Encoding": "gzip,deflate,br",
            "Accept": "application/json",
            "Authorization": f'Token {API_KEY}',
        },
    ).prepare()

    request_result = session.send(rq)

    if 200 <= request_result.status_code <= 299:
        all_types = json.loads(request_result.text)

        return [
            type_dict
            for type_dict in all_types
            if type_dict["name"].startswith(TIME_PERIOD_PREFIXES[time_period])
        ]

    return []


def get_filings_page(time_config, common_session=None, extra_fetch_params=None):
    """Fetch one page of filings; returns a dict with range/status/headers/body."""
    session = BASE_SESSION if common_session is None else common_session

    query_dict = dict(
        **time_config,
        # ordering="-dt_posted,id",
        ordering="dt_posted,id",
        page_size=RESULTS_PER_PAGE,
        **(extra_fetch_params or {}),
    )

    rq = requests.Request(
        "GET",
        f"{LDA_API_ENDPOINTS['filings']}?{parse_safe_query_dict(query_dict)}",
        headers={
            "Accept-Encoding": "gzip,deflate,br",
            "Accept": "application/json",
            "Authorization": f'Token {API_KEY}',
        },
    ).prepare()

    request_result = session.send(rq)

    if 200 <= request_result.status_code <= 299:
        return dict(
            range=200,
            status=request_result.status_code,
            headers=request_result.headers,
            body=json.loads(request_result.text),
        )
    elif 400 <= request_result.status_code <= 499:
        return dict(
            range=400,
            status=request_result.status_code,
            headers=request_result.headers,
            body=None,
        )

    # Anything else (redirects, 5xx) falls through to a generic failure payload.
    return dict(
        range=500,
        status=request_result.status_code,
        headers=request_result.headers,
        body=None,
    )
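

# Illustrative usage of get_filings_page() (hypothetical filter values; assumes
# a valid SENATE_API_KEY in the environment):
#
#     page = get_filings_page(
#         dict(filing_year=2023, filing_period="first_quarter"),
#     )
#     if page["range"] == 200:
#         print(page["body"]["count"], "matching filings")
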

def commonize(raw_value):
    """Normalize an organization name so registrant and client strings compare."""
    formatted_value = (
        raw_value.lower()
        .replace(".", "")  # also collapses "u.s." to "us"
        .replace(",", "")
        .replace("(", "")
        .replace(")", "")
        .replace(" u s a ", " usa ")
        .replace(" u s ", " us ")
        .replace("  ", " ")
        .strip()
    )

    if formatted_value.endswith(" us a"):
        formatted_value = f"{formatted_value[:-5]} usa"
    elif formatted_value.endswith(" u s"):
        formatted_value = f"{formatted_value[:-4]} us"

    if formatted_value.startswith("the "):
        return formatted_value[4:]
    elif formatted_value.startswith("u s "):
        return f"us {formatted_value[4:]}"

    return formatted_value


def process_result(raw_result, type_dict):
    """Flatten one filing from the API into a row for the CSV report."""
    posting_date = datetime.fromisoformat(raw_result["dt_posted"])

    registrant_name = raw_result["registrant"]["name"]
    client_name = raw_result["client"]["name"]

    amount_reported = raw_result["income"]
    amount_type = "income"

    # Registrants lobbying on their own behalf report expenses, not income.
    if all(
        [
            raw_result["income"] is None,
            commonize(registrant_name) == commonize(client_name),
        ]
    ):
        amount_reported = raw_result["expenses"]
        amount_type = "expenses"

    if amount_type == "income" and raw_result["income"] is None:
        matching_overrides = [
            override_dict
            for override_dict in SELF_LOBBYING_OVERRIDES
            if override_dict["registrantName"] == registrant_name
            and override_dict["clientName"] == client_name
        ]

        if matching_overrides:
            amount_reported = raw_result["expenses"]
            amount_type = "expenses*"

    return dict(
        UUID=raw_result["filing_uuid"],
        RegistrantName=registrant_name,
        ClientName=client_name,
        FilingType=type_dict[raw_result["filing_type"]].replace(" - ", " "),
        AmountReported=amount_reported,
        DatePosted=posting_date.strftime("%Y-%m-%d"),
        FilingYear=raw_result["filing_year"],
        AmountType=amount_type,
    )


def collect_filings(time_config, type_dict, session):
    """Walk every page of filings for one filing type and flatten the results."""
    current_page = get_filings_page(time_config, session)

    if current_page["body"] is None:
        raise RuntimeError(
            f"Filings request failed with HTTP {current_page['status']}."
        )

    results_count = current_page["body"]["count"]
    results_lang = "filings" if results_count != 1 else "filing"

    page_count = ceil(results_count / RESULTS_PER_PAGE)
    page_lang = "pages" if page_count != 1 else "page"

    print(f" ### {results_count} {results_lang} / {page_count} {page_lang}")

    all_filings = [
        process_result(result, type_dict)
        for result in current_page["body"]["results"]
    ]

    print(" - PAGE 1")

    while current_page["body"]["next"] is not None:
        next_query_dict = querystring_to_dict(current_page["body"]["next"])

        next_query_diff = {
            k: v
            for k, v in next_query_dict.items()
            if k not in [*time_config.keys(), "ordering", "page_size"]
        }

        # Be polite to the API between page fetches.
        sleep(1)

        current_page = get_filings_page(time_config, session, next_query_diff)

        print(f" - PAGE {next_query_diff['page']}")

        all_filings.extend(
            [
                process_result(result, type_dict)
                for result in current_page["body"]["results"]
            ]
        )

    return all_filings
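

# Illustrative round trip for the pagination helper above (hypothetical URL,
# not a real API response):
#
#     querystring_to_dict(
#         "https://lda.senate.gov/api/v1/filings/?page=2&page_size=25"
#     )
#     # -> {"page": "2", "page_size": "25"}
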

def scrape_lda_filings(year, time_period, common_session=None):
    """Download every filing for one year/quarter and write it to a CSV report."""
    session = BASE_SESSION if common_session is None else common_session

    types_for_period = get_types_for_quarter(time_period, session)

    type_dict = {
        filing_type["value"]: filing_type["name"]
        for filing_type in types_for_period
    }

    all_filings = {}

    for filing_type in types_for_period:
        print("")
        print(f"{filing_type['name']} ({filing_type['value']}):")

        time_config = dict(
            filing_year=year,
            filing_period=TIME_PERIOD_SLUGS[time_period],
            filing_type=filing_type["value"],
        )

        all_filings[filing_type["value"]] = collect_filings(
            time_config,
            type_dict,
            session,
        )

    print("")

    with open(f"json/corporate-lobbying/reports/{year}-{time_period.lower()}.csv", "w") as output_file:
        writer = DictWriter(
            output_file,
            fieldnames=[
                "UUID",
                "RegistrantName",
                "ClientName",
                "FilingType",
                "AmountReported",
                "DatePosted",
                "FilingYear",
                "AmountType",
            ],
        )
        writer.writeheader()

        for filings_for_type in all_filings.values():
            for filing in filings_for_type:
                writer.writerow(filing)

    return all_filings


def get_historical_data():
    """Backfill one CSV report per quarter from 2015 through the current year."""
    current_year = datetime.now().year
    year_list = list(range(2015, current_year + 1))
    quarter_list = ['Q1', 'Q2', 'Q3', 'Q4']
    print(year_list)

    for year in year_list:
        for quarter in quarter_list:
            file_name = f"{year}-{quarter.lower()}"
            print(file_name)
            if not os.path.exists(f"json/corporate-lobbying/reports/{file_name}.csv"):
                scrape_lda_filings(year, quarter)
            else:
                print(f"Skipping {file_name}, file already exists.")


def get_current_quarter_and_year():
    current_date = datetime.now()
    current_month = current_date.month
    current_year = current_date.year

    if 1 <= current_month <= 3:
        quarter = "Q1"
    elif 4 <= current_month <= 6:
        quarter = "Q2"
    elif 7 <= current_month <= 9:
        quarter = "Q3"
    else:
        quarter = "Q4"

    return current_year, quarter


def update_latest_quarter():
    year, quarter = get_current_quarter_and_year()
    print(year, quarter)
    scrape_lda_filings(year, quarter)


def save_json(symbol, data):
    with open(f"json/corporate-lobbying/companies/{symbol}.json", 'w') as file:
        json.dump(data, file)


def create_dataset():
    """Fuzzy-match CSV client names to listed companies and total spend by year."""
    from fuzzywuzzy import process
    import sqlite3
    import math
    from collections import defaultdict

    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
    cursor.execute("SELECT DISTINCT symbol,name FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
    stock_data = [{'symbol': row[0], 'name': row[1]} for row in cursor.fetchall()]
    con.close()

    # Set a threshold for similarity (0-100).
    threshold = 95

    # Get all CSV files in the reports folder.
    reports_folder = 'json/corporate-lobbying/reports'
    csv_files = [f for f in os.listdir(reports_folder) if f.endswith('.csv')]

    for stock in stock_data:
        all_res_list = []
        print(stock['name'])
        for csv_file in csv_files:
            # Read the CSV file into a list of row dicts.
            print(csv_file)
            df = pd.read_csv(os.path.join(reports_folder, csv_file))
            df_list = df.to_dict(orient='records')

            for item in df_list:
                company_name = item['ClientName']

                # With a single candidate, extractOne simply scores the pair.
                best_match, score = process.extractOne(stock['name'].lower(), [company_name.lower()])
                if score >= threshold:
                    all_res_list.append({'amount': item['AmountReported'], 'year': item['FilingYear']})

        # Drop rows whose amounts are missing or not numeric.
        all_res_list = [item for item in all_res_list if isinstance(item.get("amount"), (int, float)) and not math.isnan(item["amount"])]

        # Group amounts by year.
        year_totals = defaultdict(float)
        for item in all_res_list:
            year_totals[item['year']] += item['amount']

        all_res_list = [{'year': year, 'amount': amount} for year, amount in sorted(year_totals.items())]

        if len(all_res_list) > 0:
            save_json(stock['symbol'], all_res_list)
            print(f"Saved data for {stock['symbol']} ({len(all_res_list)} matches)")
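

# Illustrative behavior of the fuzzy match in create_dataset() (hypothetical
# names; fuzzywuzzy's default processor strips case and punctuation before
# scoring):
#
#     from fuzzywuzzy import process
#     process.extractOne("apple inc", ["apple inc.", "alphabet inc."])
#     # -> ("apple inc.", 100)   # (best candidate, similarity score 0-100)
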

if __name__ == '__main__':
    # Uncomment to backfill all quarters since 2015, or to refresh only the
    # current quarter, before rebuilding the per-company dataset.
    # get_historical_data()
    # update_latest_quarter()
    create_dataset()

diff --git a/requirements.txt b/requirements.txt
index c05185a..1f821ff 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,4 +37,6 @@ finnhub-python
 intrinio_sdk
 openai
 slowapi
-praw
\ No newline at end of file
+praw
+fuzzywuzzy
+python-Levenshtein
\ No newline at end of file
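
Note: python-Levenshtein is an optional accelerator for fuzzywuzzy; without it,
fuzzywuzzy falls back to a slower pure-Python matcher and emits a warning.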