"""Export institutional-holder ("hedge fund") data from SQLite into JSON files.

Reads the ``institutes`` table of ``institute.db`` and the ``stocks`` table of
``stocks.db`` and writes:

* ``json/hedge-funds/all-hedge-funds.json`` - filtered overview of all funds
* ``json/hedge-funds/companies/<cik>.json`` - one detail file per fund
"""
import os
import sqlite3
import time
from bisect import bisect_right
from collections import Counter

try:
    import ujson
except ImportError:
    # ujson is only used for dump/loads; the stdlib module is a drop-in
    # replacement for those calls when ujson is not installed.
    import json as ujson

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is missing: iterate without a progress bar."""
        return iterable


# Fields of a holding that are copied into the per-fund JSON file.
keys_to_keep = [
    "type", "securityName", "symbol", "weight", "changeInSharesNumberPercentage",
    "sharesNumber", "marketValue", "avgPricePaid", "putCallShare"
]

# Words that are emitted verbatim (not title-cased, never stripped).
_PRESERVED_WORDS = frozenset({'FMR', 'MCF'})
# Tags that are cut out of a word wherever they occur.
_STATE_TAGS = ('/DE/', '/MD/')
# Legal-form tokens that are dropped only when they make up the WHOLE word.
_LEGAL_SUFFIXES = frozenset({'LLC', 'LP', 'LTD', 'INC', 'PLC'})
# Punctuation removed from every word before the suffix check.
_PUNCTUATION_TABLE = str.maketrans('', '', ',.')


def format_company_name(company_name):
    """Return a display name: legal suffixes removed, words title-cased.

    Each whitespace-separated word is processed in a fixed order: state tags
    are cut out, commas and periods are stripped, and the word is dropped if
    nothing is left or if it is exactly a legal-form token (``LLC``, ``LP``,
    ``LTD``, ``INC``, ``PLC``). Matching is case-sensitive.

    Suffixes are matched against whole words only, so names such as
    ``PRINCIPAL`` or ``ALPINE`` are no longer mangled by substring removal,
    and the result no longer depends on set iteration order.

    Args:
        company_name: Raw company name; ``None`` or empty yields ``''``.

    Returns:
        The cleaned name, words joined by single spaces.
    """
    formatted_words = []
    for word in (company_name or '').split():
        if word in _PRESERVED_WORDS:
            formatted_words.append(word)
            continue

        cleaned = word
        for tag in _STATE_TAGS:
            cleaned = cleaned.replace(tag, '')
        cleaned = cleaned.translate(_PUNCTUATION_TABLE)

        # Skip words that vanished entirely so no stray spaces are produced.
        if not cleaned or cleaned in _LEGAL_SUFFIXES:
            continue
        formatted_words.append(cleaned.title())

    return ' '.join(formatted_words)


def _dump_json(path, payload):
    """Serialize ``payload`` to ``path``, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as file:
        ujson.dump(payload, file)


def all_hedge_funds(con):
    """Write the overview of all funds to ``json/hedge-funds/all-hedge-funds.json``.

    Only funds with at least 3 stocks and an absolute 3-year performance
    below 500 are kept. Rows where either of those columns is NULL are
    skipped instead of aborting the export with a ``TypeError``. The list is
    sorted by ``marketValue`` in descending order, with NULL treated as 0.

    Args:
        con: Open ``sqlite3`` connection to a database with an ``institutes`` table.
    """
    cursor = con.cursor()
    cursor.execute(
        "SELECT cik, name, numberOfStocks, marketValue, winRate, turnover, "
        "performancePercentage3year FROM institutes"
    )

    funds = []
    for cik, name, number_of_stocks, market_value, win_rate, turnover, performance_3y in cursor.fetchall():
        if number_of_stocks is None or performance_3y is None:
            continue
        if number_of_stocks < 3 or abs(performance_3y) >= 500:
            continue
        funds.append({
            'cik': cik,
            'name': format_company_name(name),
            'numberOfStocks': number_of_stocks,
            'marketValue': market_value,
            'winRate': win_rate,
            'turnover': turnover,
            'performancePercentage3year': performance_3y,
        })

    funds.sort(key=lambda fund: fund['marketValue'] or 0, reverse=True)
    _dump_json("json/hedge-funds/all-hedge-funds.json", funds)


def _quarter_end_prices(trading_days, closes, quarter_ends):
    """Map each quarter end to the close of the nearest trading day on or before it.

    All dates are ``YYYY-MM-DD`` strings, which sort chronologically. Quarter
    ends that precede the first trading day are skipped.
    """
    close_by_day = {}
    for day, close in zip(trading_days, closes):
        # Keep the first close seen for a day if the input has duplicates.
        close_by_day.setdefault(day, close)
    ordered_days = sorted(close_by_day)

    data = []
    for quarter_end in quarter_ends:
        position = bisect_right(ordered_days, quarter_end)
        if position == 0:
            # No trading day on or before this date; nothing to report.
            continue
        matched_day = ordered_days[position - 1]
        data.append({'date': quarter_end, 'price': round(close_by_day[matched_day], 2)})
    return data


def spy_performance():
    """Return SPY closing prices at every quarter end since 1993.

    Daily prices are downloaded with yfinance. For a quarter end that is not
    a trading day, the most recent earlier trading day is used.

    NOTE(review): this relies on the download exposing flat ``Date`` and
    ``Adj Close`` columns - confirm against the installed yfinance version.

    Returns:
        List of ``{'date': <quarter end>, 'price': <close rounded to 2 dp>}``.
        Previously the list was built and then discarded.
    """
    import pandas as pd
    import yfinance as yf
    from datetime import datetime

    start_date = '1993-01-01'
    today = datetime.today()

    # Quarter-end dates from the start date up to today.
    quarter_ends = pd.date_range(
        start=start_date, end=today.strftime('%Y-%m-%d'), freq='QE'
    ).strftime('%Y-%m-%d').tolist()

    df = yf.download('SPY', start=start_date, end=today, interval="1d").reset_index()
    df = df.rename(columns={'Adj Close': 'close', 'Date': 'date'})
    df['date'] = df['date'].dt.strftime('%Y-%m-%d')

    return _quarter_end_prices(df['date'].tolist(), df['close'].tolist(), quarter_ends)


def _sector_by_symbol(stock_sectors):
    """Return a ``symbol -> sector`` dict; the first entry for a symbol wins.

    Accepts either a list of ``{'symbol': ..., 'sector': ...}`` dicts or a
    mapping that was already built, which is returned unchanged.
    """
    if isinstance(stock_sectors, dict):
        return stock_sectors
    lookup = {}
    for item in stock_sectors:
        lookup.setdefault(item['symbol'], item['sector'])
    return lookup


def get_data(cik, stock_sectors, con=None):
    """Write the detail file ``json/hedge-funds/companies/<cik>.json`` for one fund.

    The holdings are reduced to the fields in ``keys_to_keep``, and
    ``topSectors`` lists the five most common sectors among the holdings
    as percentages of all holdings with a known sector.

    Args:
        cik: Identifier of the fund in the ``institutes`` table.
        stock_sectors: List of ``{'symbol', 'sector'}`` dicts, or a pre-built
            ``symbol -> sector`` dict.
        con: Optional ``sqlite3`` connection. When omitted, the module-level
            ``cursor`` created by the script entry point is used.

    Returns:
        None. Nothing is written when the fund does not exist.
    """
    db_cursor = con.cursor() if con is not None else cursor
    db_cursor.execute(
        "SELECT cik, name, numberOfStocks, performancePercentage3year, "
        "performancePercentage5year, performanceSinceInceptionPercentage, "
        "averageHoldingPeriod, turnover, marketValue, winRate, holdings, summary "
        "FROM institutes WHERE cik = ?",
        (cik,),
    )
    row = db_cursor.fetchone()  # only the first matching row is used
    if row is None:
        return None

    holdings = [
        {key: holding[key] for key in keys_to_keep}
        for holding in ujson.loads(row[10])
    ]
    res = {
        'cik': row[0],
        'name': row[1],
        'numberOfStocks': row[2],
        'performancePercentage3year': row[3],
        'performancePercentage5year': row[4],
        'performanceSinceInceptionPercentage': row[5],
        'averageHoldingPeriod': row[6],
        'turnover': row[7],
        'marketValue': row[8],
        'winRate': row[9],
        'holdings': holdings,
        'summary': ujson.loads(row[11]),
    }

    # Count holdings per sector; symbols without a (non-empty) sector are ignored.
    sector_lookup = _sector_by_symbol(stock_sectors)
    sector_counts = Counter()
    for holding in holdings:
        sector = sector_lookup.get(holding['symbol'])
        if sector:
            sector_counts[sector] += 1

    total_holdings = sum(sector_counts.values())
    # most_common() is empty when nothing was counted, so no division by zero.
    res['topSectors'] = [
        {sector: round((count / total_holdings) * 100, 2)}
        for sector, count in sector_counts.most_common(5)
    ]

    _dump_json(f"json/hedge-funds/companies/{cik}.json", res)
    return None


if __name__ == '__main__':
    con = sqlite3.connect('institute.db')
    try:
        cursor = con.cursor()
        cursor.execute("PRAGMA journal_mode = wal")
        cursor.execute("SELECT DISTINCT cik FROM institutes")
        cik_symbols = [row[0] for row in cursor.fetchall()]

        stock_con = sqlite3.connect('stocks.db')
        try:
            stock_cursor = stock_con.cursor()
            try:
                stock_cursor.execute("SELECT DISTINCT symbol, sector FROM stocks")
                stock_sectors = [
                    {'symbol': row[0], 'sector': row[1]}
                    for row in stock_cursor.fetchall()
                ]
            finally:
                stock_cursor.close()
        finally:
            stock_con.close()

        # Build the lookup once instead of scanning the list for every holding.
        sector_lookup = _sector_by_symbol(stock_sectors)

        all_hedge_funds(con)
        spy_performance()

        for cik in tqdm(cik_symbols):
            try:
                get_data(cik, sector_lookup, con=con)
            except Exception as e:
                # Best effort: one broken fund must not stop the export.
                print(e)
    finally:
        con.close()