From 064b7ad10d37c2d7ac1ee63e82d55f2c5ba4fa66 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Tue, 6 Aug 2024 22:24:50 +0200
Subject: [PATCH] add corporate lobbying cron job

---
 app/cron_corporate_lobbying.py | 404 +++++++++++++++++++++++++++++++++
 requirements.txt               |   4 +-
 2 files changed, 407 insertions(+), 1 deletion(-)
 create mode 100644 app/cron_corporate_lobbying.py

diff --git a/app/cron_corporate_lobbying.py b/app/cron_corporate_lobbying.py
new file mode 100644
index 0000000..c42d041
--- /dev/null
+++ b/app/cron_corporate_lobbying.py
@@ -0,0 +1,404 @@
# Standard-library and third-party imports.
from csv import DictWriter
from datetime import datetime
import json
from math import ceil
import os
from time import sleep

import pandas as pd
from dotenv import load_dotenv
import requests


BASE_SESSION = requests.Session()

BASE_API_URL = "https://lda.senate.gov/api/v1"

load_dotenv()
API_KEY = os.getenv('SENATE_API_KEY')


LDA_API_ENDPOINTS = dict(
    filing_types=f"{BASE_API_URL}/constants/filing/filingtypes/",
    filings=f"{BASE_API_URL}/filings/",
)

# Results per page. The Senate has at times capped page_size at 25 for
# unauthenticated requests; authenticated requests can use larger pages.
RESULTS_PER_PAGE = 250

TIME_PERIOD_SLUGS = dict(
    Q1="first_quarter",
    Q2="second_quarter",
    Q3="third_quarter",
    Q4="fourth_quarter",
    MY="mid_year",
    YE="year_end",
)

TIME_PERIOD_PREFIXES = dict(
    Q1="1st Quarter",
    Q2="2nd Quarter",
    Q3="3rd Quarter",
    Q4="4th Quarter",
    MY="Mid-Year",
    YE="Year-End",
)


def parse_safe_query_dict(raw_dict):
    """Serialize a dict into a querystring; values are assumed to be URL-safe."""
    return "&".join([f"{k}={v}" for k, v in raw_dict.items()])


def querystring_to_dict(raw_url):
    """Parse the query portion of a URL into a flat dict of strings."""
    return dict(
        pair.split("=", 1)
        for pair in raw_url.split("?")[1].split("&")
    )


with open("json/corporate-lobbying/self_lobbying_overrides.json", "r") as overrides_file:
    SELF_LOBBYING_OVERRIDES = json.load(overrides_file)


def get_types_for_quarter(time_period, common_session=None):
    """Fetch the filing-type constants that belong to the given time period."""
    session = BASE_SESSION if common_session is None else common_session

    rq = requests.Request(
        "GET",
        LDA_API_ENDPOINTS["filing_types"],
        headers={
            "Accept-Encoding": "gzip,deflate,br",
            "Accept": "application/json",
            "Authorization": f'Token {API_KEY}',
        },
    ).prepare()

    request_result = session.send(rq)

    if 200 <= request_result.status_code <= 299:
        all_types = json.loads(request_result.text)

        return [
            type_dict
            for type_dict in all_types
            if type_dict["name"].startswith(TIME_PERIOD_PREFIXES[time_period])
        ]

    return []


def get_filings_page(time_config, common_session=None, extra_fetch_params=None):
    """Fetch one page of filings; returns a dict with range/status/headers/body."""
    session = BASE_SESSION if common_session is None else common_session

    query_dict = dict(
        **time_config,
        # ordering="-dt_posted,id",
        ordering="dt_posted,id",
        page_size=RESULTS_PER_PAGE,
        **(extra_fetch_params or {}),
    )

    rq = requests.Request(
        "GET",
        f"{LDA_API_ENDPOINTS['filings']}?{parse_safe_query_dict(query_dict)}",
        headers={
            "Accept-Encoding": "gzip,deflate,br",
            "Accept": "application/json",
            "Authorization": f'Token {API_KEY}',
        },
    ).prepare()

    request_result = session.send(rq)

    if 200 <= request_result.status_code <= 299:
        return dict(
            range=200,
            status=request_result.status_code,
            headers=request_result.headers,
            body=json.loads(request_result.text),
        )
    elif 400 <= request_result.status_code <= 499:
        return dict(
            range=400,
            status=request_result.status_code,
            headers=request_result.headers,
            body=None,
        )

    # Anything else (redirects, 5xx) falls through to a generic failure payload.
    return dict(
        range=500,
        status=request_result.status_code,
        headers=request_result.headers,
        body=None,
    )
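

# Illustrative usage of get_filings_page() (hypothetical filter values; assumes
# a valid SENATE_API_KEY in the environment):
#
#     page = get_filings_page(
#         dict(filing_year=2023, filing_period="first_quarter"),
#     )
#     if page["range"] == 200:
#         print(page["body"]["count"], "matching filings")
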

def commonize(raw_value):
    """Normalize an organization name so registrant and client strings compare."""
    formatted_value = (
        raw_value.lower()
        .replace(".", "")  # also collapses "u.s." to "us"
        .replace(",", "")
        .replace("(", "")
        .replace(")", "")
        .replace(" u s a ", " usa ")
        .replace(" u s ", " us ")
        .replace("  ", " ")
        .strip()
    )

    if formatted_value.endswith(" us a"):
        formatted_value = f"{formatted_value[:-5]} usa"
    elif formatted_value.endswith(" u s"):
        formatted_value = f"{formatted_value[:-4]} us"

    if formatted_value.startswith("the "):
        return formatted_value[4:]
    elif formatted_value.startswith("u s "):
        return f"us {formatted_value[4:]}"

    return formatted_value


def process_result(raw_result, type_dict):
    """Flatten one filing from the API into a row for the CSV report."""
    posting_date = datetime.fromisoformat(raw_result["dt_posted"])

    registrant_name = raw_result["registrant"]["name"]
    client_name = raw_result["client"]["name"]

    amount_reported = raw_result["income"]
    amount_type = "income"

    # Registrants lobbying on their own behalf report expenses, not income.
    if all(
        [
            raw_result["income"] is None,
            commonize(registrant_name) == commonize(client_name),
        ]
    ):
        amount_reported = raw_result["expenses"]
        amount_type = "expenses"

    if amount_type == "income" and raw_result["income"] is None:
        matching_overrides = [
            override_dict
            for override_dict in SELF_LOBBYING_OVERRIDES
            if override_dict["registrantName"] == registrant_name
            and override_dict["clientName"] == client_name
        ]

        if matching_overrides:
            amount_reported = raw_result["expenses"]
            amount_type = "expenses*"

    return dict(
        UUID=raw_result["filing_uuid"],
        RegistrantName=registrant_name,
        ClientName=client_name,
        FilingType=type_dict[raw_result["filing_type"]].replace(" - ", " "),
        AmountReported=amount_reported,
        DatePosted=posting_date.strftime("%Y-%m-%d"),
        FilingYear=raw_result["filing_year"],
        AmountType=amount_type,
    )


def collect_filings(time_config, type_dict, session):
    """Walk every page of filings for one filing type and flatten the results."""
    current_page = get_filings_page(time_config, session)

    if current_page["body"] is None:
        raise RuntimeError(
            f"Filings request failed with HTTP {current_page['status']}."
        )

    results_count = current_page["body"]["count"]
    results_lang = "filings" if results_count != 1 else "filing"

    page_count = ceil(results_count / RESULTS_PER_PAGE)
    page_lang = "pages" if page_count != 1 else "page"

    print(f" ### {results_count} {results_lang} / {page_count} {page_lang}")

    all_filings = [
        process_result(result, type_dict)
        for result in current_page["body"]["results"]
    ]

    print(" - PAGE 1")

    while current_page["body"]["next"] is not None:
        next_query_dict = querystring_to_dict(current_page["body"]["next"])

        next_query_diff = {
            k: v
            for k, v in next_query_dict.items()
            if k not in [*time_config.keys(), "ordering", "page_size"]
        }

        # Be polite to the API between page fetches.
        sleep(1)

        current_page = get_filings_page(time_config, session, next_query_diff)

        print(f" - PAGE {next_query_diff['page']}")

        all_filings.extend(
            [
                process_result(result, type_dict)
                for result in current_page["body"]["results"]
            ]
        )

    return all_filings
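

# Illustrative round trip for the pagination helper above (hypothetical URL,
# not a real API response):
#
#     querystring_to_dict(
#         "https://lda.senate.gov/api/v1/filings/?page=2&page_size=25"
#     )
#     # -> {"page": "2", "page_size": "25"}
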

def scrape_lda_filings(year, time_period, common_session=None):
    """Download every filing for one year/quarter and write it to a CSV report."""
    session = BASE_SESSION if common_session is None else common_session

    types_for_period = get_types_for_quarter(time_period, session)

    type_dict = {
        filing_type["value"]: filing_type["name"]
        for filing_type in types_for_period
    }

    all_filings = {}

    for filing_type in types_for_period:
        print("")
        print(f"{filing_type['name']} ({filing_type['value']}):")

        time_config = dict(
            filing_year=year,
            filing_period=TIME_PERIOD_SLUGS[time_period],
            filing_type=filing_type["value"],
        )

        all_filings[filing_type["value"]] = collect_filings(
            time_config,
            type_dict,
            session,
        )

    print("")

    with open(f"json/corporate-lobbying/reports/{year}-{time_period.lower()}.csv", "w") as output_file:
        writer = DictWriter(
            output_file,
            fieldnames=[
                "UUID",
                "RegistrantName",
                "ClientName",
                "FilingType",
                "AmountReported",
                "DatePosted",
                "FilingYear",
                "AmountType",
            ],
        )
        writer.writeheader()

        for filings_for_type in all_filings.values():
            for filing in filings_for_type:
                writer.writerow(filing)

    return all_filings


def get_historical_data():
    """Backfill one CSV report per quarter from 2015 through the current year."""
    current_year = datetime.now().year
    year_list = list(range(2015, current_year + 1))
    quarter_list = ['Q1', 'Q2', 'Q3', 'Q4']
    print(year_list)

    for year in year_list:
        for quarter in quarter_list:
            file_name = f"{year}-{quarter.lower()}"
            print(file_name)
            if not os.path.exists(f"json/corporate-lobbying/reports/{file_name}.csv"):
                scrape_lda_filings(year, quarter)
            else:
                print(f"Skipping {file_name}, file already exists.")


def get_current_quarter_and_year():
    current_date = datetime.now()
    current_month = current_date.month
    current_year = current_date.year

    if 1 <= current_month <= 3:
        quarter = "Q1"
    elif 4 <= current_month <= 6:
        quarter = "Q2"
    elif 7 <= current_month <= 9:
        quarter = "Q3"
    else:
        quarter = "Q4"

    return current_year, quarter


def update_latest_quarter():
    year, quarter = get_current_quarter_and_year()
    print(year, quarter)
    scrape_lda_filings(year, quarter)


def save_json(symbol, data):
    with open(f"json/corporate-lobbying/companies/{symbol}.json", 'w') as file:
        json.dump(data, file)


def create_dataset():
    """Fuzzy-match CSV client names to listed companies and total spend by year."""
    from fuzzywuzzy import process
    import sqlite3
    import math
    from collections import defaultdict

    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
    cursor.execute("SELECT DISTINCT symbol,name FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
    stock_data = [{'symbol': row[0], 'name': row[1]} for row in cursor.fetchall()]
    con.close()

    # Set a threshold for similarity (0-100).
    threshold = 95

    # Get all CSV files in the reports folder.
    reports_folder = 'json/corporate-lobbying/reports'
    csv_files = [f for f in os.listdir(reports_folder) if f.endswith('.csv')]

    for stock in stock_data:
        all_res_list = []
        print(stock['name'])
        for csv_file in csv_files:
            # Read the CSV file into a list of row dicts.
            print(csv_file)
            df = pd.read_csv(os.path.join(reports_folder, csv_file))
            df_list = df.to_dict(orient='records')

            for item in df_list:
                company_name = item['ClientName']

                # With a single candidate, extractOne simply scores the pair.
                best_match, score = process.extractOne(stock['name'].lower(), [company_name.lower()])
                if score >= threshold:
                    all_res_list.append({'amount': item['AmountReported'], 'year': item['FilingYear']})

        # Drop rows whose amounts are missing or not numeric.
        all_res_list = [item for item in all_res_list if isinstance(item.get("amount"), (int, float)) and not math.isnan(item["amount"])]

        # Group amounts by year.
        year_totals = defaultdict(float)
        for item in all_res_list:
            year_totals[item['year']] += item['amount']

        all_res_list = [{'year': year, 'amount': amount} for year, amount in sorted(year_totals.items())]

        if len(all_res_list) > 0:
            save_json(stock['symbol'], all_res_list)
            print(f"Saved data for {stock['symbol']} ({len(all_res_list)} matches)")
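

# Illustrative behavior of the fuzzy match in create_dataset() (hypothetical
# names; fuzzywuzzy's default processor strips case and punctuation before
# scoring):
#
#     from fuzzywuzzy import process
#     process.extractOne("apple inc", ["apple inc.", "alphabet inc."])
#     # -> ("apple inc.", 100)   # (best candidate, similarity score 0-100)
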

if __name__ == '__main__':
    # Uncomment to backfill all quarters since 2015, or to refresh only the
    # current quarter, before rebuilding the per-company dataset.
    # get_historical_data()
    # update_latest_quarter()
    create_dataset()

diff --git a/requirements.txt b/requirements.txt
index c05185a..1f821ff 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,4 +37,6 @@ finnhub-python
 intrinio_sdk
 openai
 slowapi
-praw
\ No newline at end of file
+praw
+fuzzywuzzy
+python-Levenshtein
\ No newline at end of file
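
Note: python-Levenshtein is an optional accelerator for fuzzywuzzy; without it,
fuzzywuzzy falls back to a slower pure-Python matcher and emits a warning.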