bugfixing iv

2025-02-11 00:10:38 +01:00 · 2025-02-11 00:10:38 +01:00 · 3c3b7a6e45
commit 3c3b7a6e45
parent abbf7dc579
3 changed files with 203 additions and 164 deletions
--- a/app/cron_implied_volatility.py
+++ b/app/cron_implied_volatility.py
@ -62,11 +62,89 @@ def save_json(data, symbol):
        file.write(orjson.dumps(serializable_data))


+def is_outlier(value, values, n_sigmas=3):
+    """
+    Detect if a value is an outlier using the z-score method
+    
+    Args:
+        value: The value to check
+        values: List of values to compare against
+        n_sigmas: Number of standard deviations to use as threshold (default: 3)
+    
+    Returns:
+        bool: True if the value is an outlier, False otherwise
+    """
+    if value is None or not values:
+        return False
+    
+    values = [v for v in values if v is not None]
+    if not values:
+        return False
+
+    mean = np.mean(values)
+    std = np.std(values)
+    
+    if std == 0:
+        return False
+        
+    z_score = abs((value - mean) / std)
+    return z_score > n_sigmas
+
+def clean_iv_data(data):
+    """
+    Clean IV data by handling outliers
+    
+    Args:
+        data: List of dictionaries containing IV values
+        
+    Returns:
+        List of dictionaries with cleaned IV values
+    """
+    # Extract IV values
+    iv_values = [item.get('iv') for item in data]
+    
+    # Create a copy of the data to modify
+    cleaned_data = []
+    
+    window_size = 20  # Rolling window size for outlier detection
+    
+    for i, item in enumerate(data):
+        cleaned_item = item.copy()
+        iv = item.get('iv')
+        
+        if iv is not None:
+            # Get a window of IV values centered around the current point
+            start_idx = max(0, i - window_size // 2)
+            end_idx = min(len(data), i + window_size // 2)
+            window_values = [data[j].get('iv') for j in range(start_idx, end_idx)]
+            
+            # Check if the current IV is an outlier
+            if is_outlier(iv, window_values):
+                # Replace outlier with the median of nearby non-outlier values
+                non_outlier_values = [
+                    v for v in window_values 
+                    if v is not None and not is_outlier(v, window_values)
+                ]
+                
+                if non_outlier_values:
+                    cleaned_item['iv'] = round(np.median(non_outlier_values), 2)
+                else:
+                    cleaned_item['iv'] = None
+            else:
+                cleaned_item['iv'] = round(iv, 2)
+                
+        cleaned_data.append(cleaned_item)
+    
+    return cleaned_data
+
 def compute_realized_volatility(data, window_size=20):
    """
    Compute the realized volatility of stock prices over a rolling window.
    Realized volatility is the annualized standard deviation of log returns of stock prices.
    """
+    # First clean the IV data
+    data = clean_iv_data(data)
+    
    # Sort data by date (oldest first)
    data = sorted(data, key=lambda x: x['date'])
    
@ -143,6 +221,7 @@ if __name__ == '__main__':
    directory_path = "json/implied-volatility"
    total_symbols = stocks_symbols + etf_symbols + index_symbols

+
    for symbol in tqdm(total_symbols):
        try:
            with open(f"json/options-historical-data/companies/{symbol}.json", "r") as file:
--- a/app/cron_options_historical_volume.py
+++ b/app/cron_options_historical_volume.py
@ -1,12 +1,9 @@
-import requests
 import orjson
-import re
 from datetime import datetime,timedelta
 from dotenv import load_dotenv
 import os
 import sqlite3
 import pandas as pd
-import time
 from tqdm import tqdm
 from collections import defaultdict

@ -36,8 +33,57 @@ def safe_round(value, decimals=2):


 def aggregate_data_by_date(symbol):
-    data_by_date = defaultdict(lambda: {
-        "date": "",
+    # Pre-load price data and create lookup dictionary for better performance
+    with open(f"json/historical-price/max/{symbol}.json", "r") as file:
+        price_list = {p['time']: p['close'] for p in orjson.loads(file.read())}
+    
+    # Use dict instead of defaultdict for better performance
+    data_by_date = {}
+    
+    today = datetime.today().date()
+    one_year_ago = today - timedelta(days=365)
+    one_year_ago_str = one_year_ago.strftime('%Y-%m-%d')
+    
+    contract_dir = f"json/all-options-contracts/{symbol}"
+    contract_list = get_contracts_from_directory(contract_dir)
+
+    if not contract_list:
+        return []
+
+    for item in tqdm(contract_list):
+        try:
+            file_path = os.path.join(contract_dir, f"{item}.json")
+            with open(file_path, "r") as file:
+                data = orjson.loads(file.read())
+            
+            option_type = data.get('optionType')
+            if option_type not in ['call', 'put']:
+                continue
+            
+            is_call = option_type == 'call'
+            
+            for entry in data.get('history', []):
+                date = entry.get('date')
+                #if date < one_year_ago_str:
+                #    continue
+                
+                spot_price = price_list.get(date)
+                if not spot_price:
+                    continue
+
+                volume = entry.get('volume', 0) or 0
+                open_interest = entry.get('open_interest', 0) or 0
+                total_premium = entry.get('total_premium', 0) or 0
+                implied_volatility = entry.get('implied_volatility', 0) or 0
+                gamma = entry.get('gamma', 0) or 0
+                delta = entry.get('delta', 0) or 0
+
+                gex = open_interest * gamma * spot_price
+                dex = open_interest * delta * spot_price
+
+                if date not in data_by_date:
+                    data_by_date[date] = {
+                        "date": date,
                        "call_volume": 0,
                        "put_volume": 0,
                        "call_open_interest": 0,
@ -48,105 +94,43 @@ def aggregate_data_by_date(symbol):
                        "put_gex": 0,
                        "call_dex": 0,
                        "put_dex": 0,
-        "iv": 0.0,  # Sum of implied volatilities
-        "iv_count": 0,  # Count of entries for IV
-    })
-    
-    # Calculate cutoff date (1 year ago)
-    today = datetime.today().date()
-    one_year_ago = today - timedelta(days=365)
-    one_year_ago_str = one_year_ago.strftime('%Y-%m-%d')
-    
-    contract_dir = f"json/all-options-contracts/{symbol}"
-    contract_list = get_contracts_from_directory(contract_dir)
-
-    with open(f"json/historical-price/max/{symbol}.json","r") as file:
-        price_list = orjson.loads(file.read())
-
-    if len(contract_list) > 0:
-        for item in tqdm(contract_list):
-            try:
-                file_path = os.path.join(contract_dir, f"{item}.json")
-                with open(file_path, "r") as file:
-                    data = orjson.loads(file.read())
-                
-                option_type = data.get('optionType', None)
-                if option_type not in ['call', 'put']:
-                    continue
-                
-                for entry in data.get('history', []):
-                    date = entry.get('date')
-
-                    # Skip entries older than one year
-                    if date < one_year_ago_str:
-                        continue
-                    
-                    volume = entry.get('volume', 0) or 0
-                    open_interest = entry.get('open_interest', 0) or 0
-                    total_premium = entry.get('total_premium', 0) or 0
-                    implied_volatility = entry.get('implied_volatility', 0) or 0
-                    gamma = entry.get('gamma',0) or 0
-                    delta = entry.get('delta',0) or 0
-
-                    # Find the matching date in price_list
-                    matching_price = next((p for p in price_list if p.get('time') == date), 0)
-
-                    if matching_price:
-                        spot_price = matching_price['close']
-                    else:
-                        continue
-
-                    gex = open_interest * gamma * spot_price
-                    dex = open_interest * delta * spot_price
-
+                        "iv": [],
+                        "iv_count": 0,
+                    }

                daily_data = data_by_date[date]
-                    daily_data["date"] = date
                
-                    if option_type == 'call':
-                        daily_data["call_volume"] += int(volume)
-                        daily_data["call_open_interest"] += int(open_interest)
-                        daily_data["call_premium"] += int(total_premium)
-                        daily_data["call_gex"] += round(gex,2)
-                        daily_data["call_dex"] += round(dex,2)
-                    elif option_type == 'put':
-                        daily_data["put_volume"] += int(volume)
-                        daily_data["put_open_interest"] += int(open_interest)
-                        daily_data["put_premium"] += int(total_premium)
-                        daily_data["put_gex"] += round(gex,2)
-                        daily_data["put_dex"] += round(dex,2)
+                # Use conditional indexing instead of if-else
+                type_prefix = 'call_' if is_call else 'put_'
+                daily_data[f"{type_prefix}volume"] += int(volume)
+                daily_data[f"{type_prefix}open_interest"] += int(open_interest)
+                daily_data[f"{type_prefix}premium"] += int(total_premium)
+                daily_data[f"{type_prefix}gex"] += round(gex, 2)
+                daily_data[f"{type_prefix}dex"] += round(dex, 2)
                
-                    # Aggregate IV for both calls and puts
-                    daily_data["iv"] += round(implied_volatility, 2)
+                daily_data["iv"].append(round(implied_volatility, 2))
                daily_data["iv_count"] += 1
                
-                    # Calculate put/call ratio
                try:
                    daily_data["putCallRatio"] = round(daily_data["put_volume"] / daily_data["call_volume"], 2)
                except ZeroDivisionError:
                    daily_data["putCallRatio"] = None
        
-            except Exception as e:
-                print(f"Error processing {item}: {e}")
+        except:
            continue

-        # Convert to list and calculate average IV
-        data = []
-        for date, daily in data_by_date.items():
-            if daily['iv_count'] > 0:
-                daily['iv'] = round(daily['iv'] / daily['iv_count'], 2)
-            else:
-                daily['iv'] = None
-            data.append(daily)
+    # Convert to list and calculate median IV
+    data = list(data_by_date.values())
+    
+    # Use vectorized operations with pandas for IV calculations
+    df = pd.DataFrame(data)
+    df['iv'] = df.apply(lambda x: round(float(pd.Series(x['iv']).median()), 2) if x['iv_count'] > 0 else None, axis=1)
    
    # Sort and calculate IV Rank
+    data = df.to_dict('records')
    data = sorted(data, key=lambda x: x['date'])
    data = calculate_iv_rank_for_all(data)
-        data = sorted(data, key=lambda x: x['date'], reverse=True)
-
-        return data
-    else:
-        return []
+    return sorted(data, key=lambda x: x['date'], reverse=True)

 def calculate_iv_rank_for_all(data):
    if not data:
@ -199,87 +183,64 @@ def calculate_iv_rank_for_all(data):


 def prepare_data(data, symbol):
-    
+    # Filter data first to reduce processing
    data = [entry for entry in data if entry['call_volume'] != 0 or entry['put_volume'] != 0]
+    if not data:
+        return
    
    start_date_str = data[-1]['date']
    end_date_str = data[0]['date']

-    query = query_template.format(ticker=symbol)
-    if symbol in stocks_symbols:
-        query_con = con
-    elif symbol in etf_symbols:
-        query_con = etf_con
-    else:
-        query_con = index_con
+    # Determine query connection
+    query_con = (con if symbol in stocks_symbols else 
+                etf_con if symbol in etf_symbols else 
+                index_con)
+
+    # Use pandas efficient reading and processing
+    df_price = pd.read_sql_query(
+        query_template.format(ticker=symbol),
+        query_con,
+        params=(start_date_str, end_date_str)
+    ).round(2)

-    df_price = pd.read_sql_query(query, query_con, params=(start_date_str, end_date_str)).round(2)
    df_price = df_price.rename(columns={"change_percent": "changesPercentage"})
-
-    # Convert the DataFrame to a dictionary for quick lookups by date
-    df_change_dict = df_price.set_index('date')['changesPercentage'].to_dict()
-    df_close_dict = df_price.set_index('date')['close'].to_dict()
+    price_lookup = df_price.set_index('date').to_dict('index')

    res_list = []
-
    for item in data:
        try:
-            # Round numerical and numerical-string values
            new_item = {
                key: safe_round(value) if isinstance(value, (int, float, str)) else value
                for key, value in item.items()
            }

-            # Add parsed fields
-            new_item['volume'] = round(new_item['call_volume'] + new_item['put_volume'], 2)
-            new_item['putCallRatio'] = round(new_item['put_volume']/new_item['call_volume'],2)
-            #new_item['avgVolumeRatio'] = round(new_item['volume'] / (round(new_item['avg_30_day_call_volume'] + new_item['avg_30_day_put_volume'], 2)), 2)
-            new_item['total_premium'] = round(new_item['call_premium'] + new_item['put_premium'], 2)
-            #new_item['net_premium'] = round(new_item['net_call_premium'] - new_item['net_put_premium'],2)
-            new_item['total_open_interest'] = round(new_item['call_open_interest'] + new_item['put_open_interest'], 2)
+            # Calculate derived fields
+            new_item.update({
+                'volume': new_item['call_volume'] + new_item['put_volume'],
+                'putCallRatio': round(new_item['put_volume'] / new_item['call_volume'], 2),
+                'total_premium': new_item['call_premium'] + new_item['put_premium'],
+                'total_open_interest': new_item['call_open_interest'] + new_item['put_open_interest']
+            })

-            
-            #bearish_premium = float(item['bearish_premium'])
-            #bullish_premium = float(item['bullish_premium'])
-            #neutral_premium = calculate_neutral_premium(item)
-            '''
-            new_item['premium_ratio'] = [
-                safe_round(bearish_premium),
-                neutral_premium,
-                safe_round(bullish_premium)
-            ]
-            '''
-
-            # Add changesPercentage if the date exists in df_change_dict
-            if item['date'] in df_change_dict:
-                new_item['changesPercentage'] = float(df_change_dict[item['date']])
+            # Get price data from lookup
+            if price_data := price_lookup.get(item['date']):
+                new_item['changesPercentage'] = float(price_data['changesPercentage'])
+                new_item['price'] = float(price_data['close'])
            else:
                new_item['changesPercentage'] = None
-
-            if item['date'] in df_close_dict:
-                new_item['price'] = float(df_close_dict[item['date']])
-            else:
                new_item['price'] = None

            res_list.append(new_item)
        except:
-            pass
+            continue

+    # Calculate OI changes using vectorized operations
+    df = pd.DataFrame(res_list)
+    df = df.sort_values('date')
+    df['changeOI'] = df['total_open_interest'].diff()
+    df['changesPercentageOI'] = (df['total_open_interest'].pct_change() * 100).round(2)
    
-    res_list = sorted(res_list, key=lambda x: x['date'])
-
-    for i in range(1, len(res_list)):
-        try:
-            current_open_interest = res_list[i]['total_open_interest']
-            previous_open_interest = res_list[i-1]['total_open_interest']
-            changes_percentage_oi = round((current_open_interest/previous_open_interest -1)*100,2)
-            res_list[i]['changesPercentageOI'] = changes_percentage_oi
-            res_list[i]['changeOI'] = current_open_interest-previous_open_interest
-        except:
-            res_list[i]['changesPercentageOI'] = None
-            res_list[i]['changeOI'] = None
-
-    res_list = sorted(res_list, key=lambda x: x['date'],reverse=True)
+    res_list = df.sort_values('date', ascending=False).to_dict('records')
    
    if res_list:
        save_json(res_list, symbol)
--- a/app/cron_options_single_contract.py
+++ b/app/cron_options_single_contract.py
@ -57,7 +57,6 @@ intrinio.ApiClient().allow_retries(True)

 after = (datetime.today()- timedelta(days=365)).strftime('%Y-%m-%d')
 before = '2100-12-31'
-N_year_ago = datetime.now() - timedelta(days=365)
 include_related_symbols = False
 page_size = 5000
 MAX_CONCURRENT_REQUESTS = 100  # Adjust based on API rate limits