backend/app/stats.py
2024-05-26 22:28:08 +02:00

337 lines
13 KiB
Python
Executable File

import quantstats as qs
from datetime import datetime
import pandas as pd
import sqlite3
from math import sqrt, ceil
from dateutil.relativedelta import relativedelta
import time
import json
from tqdm import tqdm
import concurrent.futures
import numpy as np
import argparse
pd.set_option('display.max_rows', 150)
def parse_args(argv=None):
    """Parse the command-line options that select which asset database to process.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is what existing callers get.

    Returns:
        argparse.Namespace with ``db`` (one of ``stocks``, ``etf``, ``crypto``)
        and ``table`` (one of ``stocks``, ``etfs``, ``cryptos``). Both options
        are required; argparse exits the process when one is missing or invalid.
    """
    parser = argparse.ArgumentParser(description='Process stock, etf or crypto data.')
    parser.add_argument(
        '--db',
        choices=['stocks', 'etf', 'crypto'],
        required=True,
        # Help text lists every accepted choice, including crypto.
        help='Database name (stocks, etf or crypto)',
    )
    parser.add_argument(
        '--table',
        choices=['stocks', 'etfs', 'cryptos'],
        required=True,
        help='Table name (stocks, etfs or cryptos)',
    )
    return parser.parse_args(argv)
# Define a function to get the ticker from the database
def get_ticker_data_from_database(database_path, sp500_ticker, start_date, end_date):
    """Load one ticker's closing prices and convert them to row-over-row returns.

    Args:
        database_path: Path of the SQLite database file to open.
        sp500_ticker: Name of the table to read; it is also used as the name
            of the returns column in the result.
        start_date: Lower bound bound to the ``date BETWEEN ? AND ?`` filter.
        end_date: Upper bound bound to the same filter.

    Returns:
        Tuple ``(sp500_ticker, df)``. ``df`` is indexed by ``Date`` (parsed with
        ``pd.to_datetime``) and has a single column, named after the ticker,
        holding ``close.pct_change()``; its first row is therefore NaN.

    Raises:
        Whatever ``pandas.read_sql_query`` raises, for example when the table
        does not exist. The connection is closed in that case as well.
    """
    # A table name cannot be bound as a SQL parameter, so it is interpolated.
    # Doubling embedded double quotes keeps it a single quoted identifier.
    quoted_table = sp500_ticker.replace('"', '""')
    query = f'SELECT date, close FROM "{quoted_table}" WHERE date BETWEEN ? AND ?'

    con_etf = sqlite3.connect(database_path)
    try:
        df = pd.read_sql_query(query, con_etf, params=(start_date, end_date))
    finally:
        # Close even when the query fails so the handle is never leaked.
        con_etf.close()

    df['date'] = pd.to_datetime(df['date'])
    df = df.rename(columns={'date': 'Date'})
    df[sp500_ticker] = df['close'].pct_change()
    df.set_index("Date", inplace=True)
    df.drop(columns=['close'], inplace=True)
    return sp500_ticker, df
class Quant_Stats:
    """Assemble a table of QuantStats metrics for a ticker and a benchmark."""

    def get_trading_periods(self):
        """Return ``(periods_per_year, half_year)``, i.e. ``(252, 126)``."""
        periods_per_year = 252
        half_year = ceil(periods_per_year / 2)
        return periods_per_year, half_year

    @staticmethod
    def _drawdown_table(returns):
        """Return the drawdown periods of one returns series, deepest first.

        The result keeps four columns renamed to ``Started``, ``Recovered``,
        ``Drawdown`` and ``Days``, sorted ascending by drawdown value.
        """
        drawdowns = qs.stats.to_drawdown_series(returns)
        details = qs.stats.drawdown_details(drawdowns).sort_values(by="max drawdown", ascending=True)
        details = details[["start", "end", "max drawdown", "days"]]
        details.columns = ["Started", "Recovered", "Drawdown", "Days"]
        return details

    def get_data(self, df, ticker, benchmark="SPY"):
        """Compute the metric table for ``ticker`` alongside ``benchmark``.

        Args:
            df: Returns DataFrame with a date-like index whose last label is
                treated as "today". It must contain the ``ticker`` and
                ``benchmark`` columns. Two-element lists assigned below are
                ordered ``[ticker, benchmark]``, so this presumably expects
                exactly those two columns in that order (TODO confirm).
            ticker: Name of the column holding the asset's returns.
            benchmark: Name of the column holding the benchmark's returns.
                Defaults to ``"SPY"``, the value that used to be hard-coded.

        Returns:
            The transposed metrics DataFrame: one row per metric name.
        """
        compounded = True
        rf = 0
        today = df.index[-1]
        comp_func = qs.stats.comp
        win_year, _ = self.get_trading_periods()

        # NOTE: assignment order defines the row order of the final table and
        # the first assignment fixes the frame's index, so keep it stable.
        metrics = pd.DataFrame()
        metrics['Expected Daily %'] = round(qs.stats.expected_return(df, compounded=True) * 100, 2)
        metrics['Expected Monthly %'] = round(qs.stats.expected_return(df, compounded=True, aggregate="M") * 100, 2)
        metrics['Expected Yearly %'] = round(qs.stats.expected_return(df, compounded=True, aggregate="A") * 100, 2)
        metrics["Cumulative Return %"] = round(qs.stats.comp(df) * 100, 2)
        metrics["CAGR %"] = round(qs.stats.cagr(df, rf, compounded) * 100, 2)
        metrics["Sharpe"] = qs.stats.sharpe(df, rf, win_year, compounded)
        metrics["Sortino"] = qs.stats.sortino(df, rf, win_year, True)
        metrics["Volatility (ann.) %"] = round(qs.stats.volatility(df, win_year, True) * 100, 2)
        metrics["Calmar"] = round(qs.stats.calmar(df), 2)
        metrics["Skew"] = qs.stats.skew(df, prepare_returns=False)
        metrics["Kurtosis"] = qs.stats.kurtosis(df, prepare_returns=False)
        metrics["Kelly Criterion %"] = round(qs.stats.kelly_criterion(df, prepare_returns=False) * 100, 2)
        # NOTE(review): labelled "%" but not multiplied by 100 like its
        # neighbours. Left as-is; confirm the intended scale.
        metrics["Risk of Ruin %"] = round(qs.stats.risk_of_ruin(df, prepare_returns=False), 2)
        metrics["Daily Value-at-Risk %"] = -abs(qs.stats.var(df, prepare_returns=False) * 100)
        metrics["Expected Shortfall (cVaR) %"] = -abs(qs.stats.cvar(df, prepare_returns=False) * 100)
        metrics["Max Consecutive Wins"] = qs.stats.consecutive_wins(df)
        metrics["Max Consecutive Losses"] = qs.stats.consecutive_losses(df)
        metrics["Gain/Pain Ratio"] = qs.stats.gain_to_pain_ratio(df, rf)
        metrics["Gain/Pain (1M)"] = qs.stats.gain_to_pain_ratio(df, rf, "M")
        metrics["Payoff Ratio"] = qs.stats.payoff_ratio(df, prepare_returns=False)
        metrics["Profit Factor"] = qs.stats.profit_factor(df, prepare_returns=False)
        metrics["Common Sense Ratio"] = qs.stats.common_sense_ratio(df, prepare_returns=False)
        metrics["CPC Index"] = qs.stats.cpc_index(df, prepare_returns=False)
        metrics["Tail Ratio"] = qs.stats.tail_ratio(df, prepare_returns=False)
        metrics["Outlier Win Ratio"] = qs.stats.outlier_win_ratio(df, prepare_returns=False)
        metrics["Outlier Loss Ratio"] = qs.stats.outlier_loss_ratio(df, prepare_returns=False)

        # Yearly return is included since eoy (end of year) is True.
        ticker_monthly_returns = round(qs.stats.monthly_returns(df[ticker], eoy=True, compounded=True) * 100, 2)
        benchmark_monthly_returns = round(qs.stats.monthly_returns(df[benchmark], eoy=True, compounded=True) * 100, 2)
        metrics['Monthly Return'] = [
            ticker_monthly_returns.T.to_dict('list'),
            benchmark_monthly_returns.T.to_dict('list'),
        ]

        # Trailing-window returns, each measured back from the last index label.
        metrics["MTD %"] = round(comp_func(df[df.index >= datetime(today.year, today.month, 1)]) * 100, 2)
        d = today - relativedelta(months=3)
        metrics["3M %"] = comp_func(df[df.index >= d]) * 100
        d = today - relativedelta(months=6)
        metrics["6M %"] = comp_func(df[df.index >= d]) * 100
        metrics["YTD %"] = comp_func(df[df.index >= datetime(today.year, 1, 1)]) * 100
        d = today - relativedelta(years=1)
        metrics["1Y %"] = comp_func(df[df.index >= d]) * 100
        # NOTE(review): the 3Y/5Y windows span 35/59 months, not 36/60.
        # Kept unchanged; confirm this offset is intentional.
        d = today - relativedelta(months=35)
        metrics["3Y (ann.) %"] = qs.stats.cagr(df[df.index >= d], 0.0, compounded) * 100
        d = today - relativedelta(months=59)
        metrics["5Y (ann.) %"] = qs.stats.cagr(df[df.index >= d], 0.0, compounded) * 100
        d = today - relativedelta(years=10)
        metrics["10Y (ann.) %"] = qs.stats.cagr(df[df.index >= d], 0.0, compounded) * 100
        metrics["All-time (ann.) %"] = qs.stats.cagr(df, 0.0, compounded) * 100

        metrics["Best Day %"] = qs.stats.best(df, compounded=compounded, prepare_returns=False) * 100
        metrics["Worst Day %"] = qs.stats.worst(df, prepare_returns=False) * 100
        metrics["Best Month %"] = qs.stats.best(df, compounded=compounded, aggregate="M", prepare_returns=False) * 100
        metrics["Worst Month %"] = qs.stats.worst(df, aggregate="M", prepare_returns=False) * 100
        metrics["Best Year %"] = qs.stats.best(df, compounded=compounded, aggregate="A", prepare_returns=False) * 100
        metrics["Worst Year %"] = qs.stats.worst(df, compounded=compounded, aggregate="A", prepare_returns=False) * 100

        # One drawdown table per series, ordered [ticker, benchmark]. The
        # ticker's table is reused for the worst-10 list rather than rebuilt.
        drawdown_tables = [self._drawdown_table(df[name]) for name in (ticker, benchmark)]
        metrics["Max Drawdown"] = [round(table['Drawdown'].min(), 2) for table in drawdown_tables]
        metrics["Avg. Drawdown"] = [round(table['Drawdown'].mean(), 2) for table in drawdown_tables]
        metrics["Longest DD Days"] = [round(table['Days'].max()) for table in drawdown_tables]
        metrics["Avg. Drawdown Days"] = [round(table['Days'].mean()) for table in drawdown_tables]

        worst_ten = drawdown_tables[0][0:10]
        worst_dd_list = list(worst_ten.T.to_dict().values())
        metrics['Worst 10 Drawdowns'] = [worst_dd_list, '-']

        metrics["Recovery Factor"] = qs.stats.recovery_factor(df)
        metrics["Ulcer Index"] = qs.stats.ulcer_index(df)
        metrics["Serenity Index"] = qs.stats.serenity_index(df, rf)
        metrics["Avg. Up Month %"] = qs.stats.avg_win(df, compounded=compounded, aggregate="M", prepare_returns=False) * 100
        metrics["Avg. Down Month %"] = qs.stats.avg_loss(df, compounded=compounded, aggregate="M", prepare_returns=False) * 100
        metrics["Win Days %"] = qs.stats.win_rate(df, prepare_returns=False) * 100
        metrics["Win Month %"] = qs.stats.win_rate(df, compounded=compounded, aggregate="M", prepare_returns=False) * 100
        metrics["Win Quarter %"] = qs.stats.win_rate(df, compounded=compounded, aggregate="Q", prepare_returns=False) * 100
        metrics["Win Year %"] = qs.stats.win_rate(df, compounded=compounded, aggregate="A", prepare_returns=False) * 100

        # Benchmark-relative figures only apply to the ticker, so the
        # benchmark slot is filled with the "-" placeholder.
        greeks = qs.stats.greeks(df[ticker], df[benchmark], win_year, prepare_returns=False)
        metrics["Beta"] = [round(greeks["beta"], 2), "-"]
        metrics["Alpha"] = [round(greeks["alpha"], 2), "-"]
        metrics["Correlation"] = [round(df[benchmark].corr(df[ticker]) * 100, 2), "-"]
        metrics["Treynor Ratio"] = [round(qs.stats.treynor_ratio(df[ticker], df[benchmark], win_year, rf) * 100, 2), "-"]
        metrics["R^2"] = [qs.stats.r_squared(df[ticker], df[benchmark], prepare_returns=False).round(2), "-"]

        metrics["Start Period"] = df.index[0].strftime("%Y-%m-%d")
        metrics['End Period'] = df.index[-1].strftime("%Y-%m-%d")

        return metrics.T
def create_quantstats_column(con, table=None):
    """Create the 'quantStats' TEXT column if it doesn't exist in the db table.

    Args:
        con: Open sqlite3 connection to the database holding the table.
        table: Name of the table to alter. Defaults to the module-level
            ``table_name``, which is what existing callers rely on.

    Raises:
        sqlite3.OperationalError: If the ALTER fails and the column is still
            missing afterwards (for example the table does not exist).
    """
    if table is None:
        table = table_name
    # Identifiers cannot be bound as parameters; quote and escape instead.
    quoted = '"' + table.replace('"', '""') + '"'

    def has_column():
        rows = con.execute(f"PRAGMA table_info({quoted})").fetchall()
        # Index 1 of each table_info row is the column name.
        return any(row[1] == 'quantStats' for row in rows)

    if has_column():
        return
    try:
        con.execute(f"ALTER TABLE {quoted} ADD COLUMN quantStats TEXT")
    except sqlite3.OperationalError:
        # Another writer may have added the column between the check and the
        # ALTER. Only propagate the error when the column is really absent.
        if not has_column():
            raise
    con.commit()
def update_database_with_stats(stats_dict, symbol, con, table=None):
    """Store the calculated statistics for a symbol as JSON in ``quantStats``.

    Args:
        stats_dict: JSON-serialisable dictionary of statistics.
        symbol: Value matched against the table's ``symbol`` column.
        con: Open sqlite3 connection to the database holding the table.
        table: Name of the table to update. Defaults to the module-level
            ``table_name``, which is what existing callers rely on.

    Raises:
        TypeError: If ``stats_dict`` cannot be serialised by ``json.dumps``.
    """
    if table is None:
        table = table_name
    # Identifiers cannot be bound as parameters; quote and escape instead.
    quoted = '"' + table.replace('"', '""') + '"'
    # Serialise first so a bad payload never opens a transaction.
    stats_json = json.dumps(stats_dict)
    # The connection context manager commits on success and rolls back if
    # the UPDATE raises, so no half-open transaction is left behind.
    with con:
        con.execute(f"UPDATE {quoted} SET quantStats = ? WHERE symbol = ?", (stats_json, symbol))
def process_symbol(ticker, sp500_ticker, sp500_df):
    """Compute the QuantStats metrics for one symbol and store them in the db.

    Reads the symbol's ``date``/``close`` rows, turns them into returns, aligns
    them with the benchmark returns and writes the resulting metrics as JSON
    into the symbol's ``quantStats`` column. Any failure is printed and
    swallowed so one bad symbol does not abort the whole batch.

    Args:
        ticker: Symbol to process; also the name of its price table.
        sp500_ticker: Column name of the benchmark returns in ``sp500_df``.
        sp500_df: Benchmark returns indexed by ``Date``.

    Relies on the module-level ``db_name``, ``start_date`` and ``end_date``.
    """
    worker_con = None
    try:
        # This function runs inside pool worker processes. SQLite connections
        # must not be used across fork(), so each call opens its own
        # connection to the same database file, not the parent's.
        worker_con = sqlite3.connect(f'backup_db/{db_name}.db')

        # Table names cannot be bound as parameters; escape embedded quotes.
        quoted_table = ticker.replace('"', '""')
        query = f'SELECT date, close FROM "{quoted_table}" WHERE date BETWEEN ? AND ?'
        prices = pd.read_sql_query(query, worker_con, params=(start_date, end_date))

        prices['date'] = pd.to_datetime(prices['date'])
        prices = prices.rename(columns={'date': 'Date'})
        prices[ticker] = prices['close'].pct_change()
        prices.set_index("Date", inplace=True)
        prices.drop(columns=['close'], inplace=True)

        # Keep only dates where both series have a return, ticker column first.
        combined_df = pd.concat([sp500_df, prices], axis=1)
        df = combined_df.dropna()
        df = df[[ticker, sp500_ticker]]

        stats = Quant_Stats().get_data(df, ticker)
        stats_dict = stats.to_dict()

        create_quantstats_column(worker_con)
        update_database_with_stats(stats_dict, ticker, worker_con)
    except Exception as e:
        # Deliberate best-effort: report and move on to the next symbol.
        print(e)
        print(f"Failed create quantStats for {ticker}")
    finally:
        if worker_con is not None:
            worker_con.close()
#Production Code
# The setup below stays at module level on purpose: the worker function reads
# these names as globals, and worker processes started with the spawn or
# forkserver methods re-import this module and must be able to rebuild them.
args = parse_args()
db_name = args.db
table_name = args.table
start_date = datetime(1970, 1, 1)
end_date = datetime.today()
con = sqlite3.connect(f'backup_db/{db_name}.db')

# Load the S&P 500 ticker from the database
sp500_ticker, sp500_df = get_ticker_data_from_database('backup_db/etf.db', "SPY", start_date, end_date)

symbol_query = f"SELECT DISTINCT symbol FROM {table_name}"
symbol_cursor = con.execute(symbol_query)
symbols = [symbol[0] for symbol in symbol_cursor.fetchall()]

# Number of concurrent workers
num_processes = 4  # You can adjust this based on your system's capabilities

# Only the process that was launched as the script may start the pool.
# Without this guard a re-imported main module would try to create a new
# pool inside every worker.
if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_processes) as executor:
        futures = [
            executor.submit(process_symbol, symbol, sp500_ticker, sp500_df)
            for symbol in symbols
        ]
        # Use tqdm to wrap around the futures for progress tracking
        for _ in tqdm(concurrent.futures.as_completed(futures), total=len(symbols), desc="Processing"):
            pass
    con.close()
#Test Code
# The triple-quoted string below is a bare expression statement, so nothing in
# it is executed. It holds an ad-hoc manual check that reads the IVV and SPY
# price tables from backup_db/etf.db, converts each to pct_change returns and
# concatenates them column-wise; the metric calls at the end are commented out.
'''
con = sqlite3.connect('backup_db/etf.db')
start_date = datetime(1970, 1, 1)
end_date = datetime.today()
query_template = """
SELECT
date, close
FROM
"{ticker}"
WHERE
date BETWEEN ? AND ?
"""
ticker_list = ['IVV','SPY']
combined_df = pd.DataFrame()
for ticker in ticker_list:
query = query_template.format(ticker=ticker)
df = pd.read_sql_query(query, con, params=(start_date, end_date))
print(df)
df['date'] = pd.to_datetime(df['date'])
df = df.rename(columns={'date': 'Date'})
df[ticker] = df['close'].pct_change()
df.set_index("Date", inplace=True)
df.drop(columns=['close'], inplace=True)
combined_df = pd.concat([combined_df, df], axis=1)
df = combined_df.dropna()
#monthly_returns = round(qs.stats.monthly_returns(df[ticker], eoy = False, compounded = True) * 100,2)
#yearly_returns = round(qs.stats.monthly_returns(df[ticker], eoy = True, compounded = True) * 100,2)
#print(yearly_returns)
#stats = Quant_Stats().get_data(df, ticker)
#print(stats)
con.close()
'''