"""Split pipe-delimited fail-to-deliver reports into one JSON file per symbol.

The script reads every report found in ``json/fail-to-deliver/csv``, merges
them into a single table, and writes the rows of each symbol that is known
to ``stocks.db`` or ``etf.db`` to ``json/fail-to-deliver/companies``.
"""
from contextlib import closing
from datetime import timedelta
import json
import os
from pathlib import Path
import sqlite3
import time

import pandas as pd

try:
    import ujson
except ImportError:
    # Keep the module importable where ujson is not installed; the stdlib
    # encoder exposes a compatible ``dump`` and still writes valid JSON.
    ujson = json


# Columns every merged table is guaranteed to expose (before renaming).
_EXPECTED_COLUMNS = ["SETTLEMENT DATE", "SYMBOL", "QUANTITY (FAILS)", "PRICE"]

# Report column name -> name used in the JSON output.
_RENAMED_COLUMNS = {
    "SETTLEMENT DATE": "date",
    "SYMBOL": "Ticker",
    "QUANTITY (FAILS)": "failToDeliver",
    "PRICE": "price",
}


def save_json(symbol, data):
    """Write *data* to ``json/fail-to-deliver/companies/<symbol>.json``.

    The output directory is created on demand so a fresh checkout does not
    fail with ``FileNotFoundError`` on the first write.
    """
    os.makedirs("json/fail-to-deliver/companies", exist_ok=True)
    with open(f"json/fail-to-deliver/companies/{symbol}.json", 'w') as file:
        ujson.dump(data, file)


def _read_report(file):
    """Read one pipe-delimited report and normalise its columns."""
    df = pd.read_csv(file, sep='|', quotechar='"', engine='python')
    # These two columns are not part of the output; they may be absent.
    df = df.drop(columns=["CUSIP", "DESCRIPTION"], errors="ignore")
    if "SETTLEMENT DATE" in df.columns:
        # Values that are not YYYYMMDD dates become NaT instead of raising.
        df["SETTLEMENT DATE"] = pd.to_datetime(
            df["SETTLEMENT DATE"], format="%Y%m%d", errors="coerce"
        )
    return df


def get_total_data(files_available, limit=24):
    """Combine the first *limit* report files into one de-duplicated table.

    Args:
        files_available: Sequence of paths to pipe-delimited report files.
        limit: Maximum number of files (taken from the front) to read.

    Returns:
        A DataFrame sorted by ``date`` with the columns ``date``, ``Ticker``,
        ``failToDeliver`` (int, 0 where missing or non-numeric), ``price``,
        any extra columns found in the files, and ``T+35 Date`` (settlement
        date plus 35 days). Dates are strings; rows without a ticker are
        dropped. Files that cannot be read are reported and skipped.
    """
    frames = []
    for file in files_available[:limit]:
        print(f"Processing file: {file}")
        try:
            df = _read_report(file)
        except pd.errors.ParserError as e:
            print(f"Error reading {file}: {e}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Unexpected error with {file}: {e}")
        else:
            # Empty frames add no rows but can degrade the dtypes that
            # concat picks for the result, so leave them out.
            if not df.empty:
                frames.append(df)

    # Concatenate once instead of once per file: re-copying the growing
    # table on every iteration is quadratic, and seeding it with an empty
    # frame relies on deprecated concat dtype rules.
    if frames:
        combined_df = pd.concat(frames).drop_duplicates()
    else:
        combined_df = pd.DataFrame(columns=_EXPECTED_COLUMNS)

    # Guarantee the expected columns exist and come first.
    extras = [c for c in combined_df.columns if c not in _EXPECTED_COLUMNS]
    combined_df = combined_df.reindex(columns=[*_EXPECTED_COLUMNS, *extras])

    # Derive both date strings from the datetime values directly rather
    # than formatting to text and parsing that text back.
    settlement = pd.to_datetime(combined_df["SETTLEMENT DATE"], errors="coerce")
    combined_df["SETTLEMENT DATE"] = settlement.astype(str)
    combined_df = combined_df.rename(columns=_RENAMED_COLUMNS)
    combined_df["T+35 Date"] = (settlement + timedelta(days=35)).astype(str)

    fails = pd.to_numeric(combined_df["failToDeliver"], errors='coerce')
    combined_df["failToDeliver"] = fails.fillna(0).astype(int)

    combined_df = combined_df[combined_df["Ticker"].notna()]
    combined_df = combined_df.sort_values(by="date")
    print(combined_df)

    return combined_df


def filter_by_ticker(combined_df):
    """Return ``{ticker: DataFrame}`` with an independent copy per ticker."""
    return {
        ticker: group.copy()
        for ticker, group in combined_df.groupby('Ticker')
    }


def _fetch_symbols(db_path, query):
    """Run *query* against the SQLite file and return its first column."""
    # closing() releases the connection even if a statement raises.
    with closing(sqlite3.connect(db_path)) as con:
        cursor = con.cursor()
        cursor.execute("PRAGMA journal_mode = wal")
        cursor.execute(query)
        return [row[0] for row in cursor.fetchall()]


def main():
    """Regenerate the per-symbol JSON files from the downloaded reports."""
    stock_symbols = _fetch_symbols(
        'stocks.db',
        "SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'",
    )
    etf_symbols = _fetch_symbols('etf.db', "SELECT DISTINCT symbol FROM etfs")
    # A set keeps the per-ticker membership test below O(1).
    total_symbols = set(stock_symbols) | set(etf_symbols)

    directory_path = 'json/fail-to-deliver/csv'
    # Oldest first by modification time; sub-directories are not reports.
    files_available = sorted(
        (entry for entry in Path(directory_path).iterdir() if entry.is_file()),
        key=os.path.getmtime,
    )

    combined_df = get_total_data(files_available, limit=1000)

    for ticker, df in filter_by_ticker(combined_df).items():
        if ticker not in total_symbols:
            continue
        # The ticker is already the file name and T+35 is not exported.
        data = df.drop(columns=['Ticker', 'T+35 Date']).to_dict('records')
        save_json(ticker, data)


if __name__ == '__main__':
    main()