update analyst cron job
This commit is contained in:
parent 5e802c117a
commit 144a79f00a
@@ -12,12 +12,9 @@ from tqdm import tqdm
import concurrent.futures
import re
from itertools import combinations

from ta.momentum import *
from ta.trend import *
from ta.volatility import *
from ta.volume import *
import os
import gc
from utils.feature_engineering import *
#Enable automatic garbage collection
gc.enable()

@@ -25,288 +22,262 @@ async def save_json(symbol, data):
with open(f"json/ai-score/companies/{symbol}.json", 'wb') as file:
file.write(orjson.dumps(data))

def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75):
# Drop the columns to exclude from the DataFrame
df_filtered = df.drop(columns=['date','price'])

def trend_intensity(close, window=20):
ma = close.rolling(window=window).mean()
std = close.rolling(window=window).std()
return ((close - ma) / std).abs().rolling(window=window).mean()
# Compute the correlation matrix
correlation_matrix = df_filtered.corr()

# Get the correlations with the target column, sorted by absolute value
correlations_with_target = correlation_matrix[target_col].drop(target_col).abs().sort_values(ascending=False)

def calculate_fdi(high, low, close, window=30):
n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) -
np.log(close.rolling(window=window).max() - close.rolling(window=window).min())) / np.log(2)
return (2 - n1) * 100
# Initialize the list of selected features
selected_features = []

# Iteratively select the most correlated features while minimizing correlation with each other
for feature in correlations_with_target.index:
# If we already have enough features, break
if len(selected_features) >= top_n:
break

# Check correlation of this feature with already selected features
is_uncorrelated = True
for selected in selected_features:
if abs(correlation_matrix.loc[feature, selected]) > threshold:
is_uncorrelated = False
break

# If it's uncorrelated with the selected features, add it to the list
if is_uncorrelated:
selected_features.append(feature)
return selected_features

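For orientation, the greedy selection above keeps the features most correlated with Target while skipping any candidate whose pairwise correlation with an already chosen feature exceeds the threshold. A minimal, self-contained sketch of how it might be exercised; the toy DataFrame and the feature names other than date, price, and Target are illustrative, not part of this commit:

import numpy as np
import pandas as pd

# Toy frame mimicking the expected layout: 'date' and 'price' are dropped
# inside the helper, 'Target' is the binary label, the rest are candidate features.
rng = np.random.default_rng(0)
n = 300
df_demo = pd.DataFrame({
    'date': pd.date_range('2020-01-01', periods=n).strftime('%Y-%m-%d'),
    'price': rng.normal(100, 5, n),
    'feat_a': rng.normal(size=n),
})
df_demo['feat_b'] = df_demo['feat_a'] * 0.95 + rng.normal(scale=0.1, size=n)  # nearly collinear with feat_a
df_demo['feat_c'] = rng.normal(size=n)                                        # independent
df_demo['Target'] = (df_demo['feat_a'] + df_demo['feat_c'] > 0).astype(int)

# Of the two nearly collinear features, only one should survive the threshold check.
print(top_uncorrelated_features(df_demo, target_col='Target', top_n=2, threshold=0.75))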
async def download_data(ticker, con, start_date, end_date):
|
||||
try:
|
||||
# Define paths to the statement files
|
||||
statements = [
|
||||
f"json/financial-statements/ratios/quarter/{ticker}.json",
|
||||
f"json/financial-statements/key-metrics/quarter/{ticker}.json",
|
||||
f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
|
||||
f"json/financial-statements/income-statement/quarter/{ticker}.json",
|
||||
f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
|
||||
f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
|
||||
f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
|
||||
f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
|
||||
f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
|
||||
]
|
||||
|
||||
# Helper function to load JSON data asynchronously
|
||||
async def load_json_from_file(path):
|
||||
async with aiofiles.open(path, 'r') as f:
|
||||
content = await f.read()
|
||||
return orjson.loads(content)
|
||||
file_path = f"ml_models/training_data/ai-score/{ticker}.json"
|
||||
|
||||
# Helper function to filter data based on keys and year
|
||||
async def filter_data(data, ignore_keys, year_threshold=2000):
|
||||
return [{k: v for k, v in item.items() if k not in ignore_keys} for item in data if int(item["date"][:4]) >= year_threshold]
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'rb') as file:
|
||||
return pd.DataFrame(orjson.loads(file.read()))
|
||||
else:
|
||||
|
||||
# Define keys to ignore
|
||||
ignore_keys = ["symbol", "reportedCurrency", "calendarYear", "fillingDate", "acceptedDate", "period", "cik", "link", "finalLink","pbRatio","ptbRatio"]
|
||||
try:
|
||||
# Define paths to the statement files
|
||||
statements = [
|
||||
f"json/financial-statements/ratios/quarter/{ticker}.json",
|
||||
f"json/financial-statements/key-metrics/quarter/{ticker}.json",
|
||||
f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
|
||||
f"json/financial-statements/income-statement/quarter/{ticker}.json",
|
||||
f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
|
||||
f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
|
||||
f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
|
||||
f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
|
||||
f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
|
||||
]
|
||||
|
||||
# Load and filter data for each statement type
|
||||
# Helper function to load JSON data asynchronously
|
||||
async def load_json_from_file(path):
|
||||
async with aiofiles.open(path, 'r') as f:
|
||||
content = await f.read()
|
||||
return orjson.loads(content)
|
||||
|
||||
ratios = await load_json_from_file(statements[0])
|
||||
ratios = await filter_data(ratios, ignore_keys)
|
||||
# Helper function to filter data based on keys and year
|
||||
async def filter_data(data, ignore_keys, year_threshold=2000):
|
||||
return [{k: v for k, v in item.items() if k not in ignore_keys} for item in data if int(item["date"][:4]) >= year_threshold]
|
||||
|
||||
#Threshold of enough datapoints needed!
|
||||
if len(ratios) < 50:
|
||||
return
|
||||
# Define keys to ignore
|
||||
ignore_keys = ["symbol", "reportedCurrency", "calendarYear", "fillingDate", "acceptedDate", "period", "cik", "link", "finalLink","pbRatio","ptbRatio"]
|
||||
|
||||
key_metrics = await load_json_from_file(statements[1])
|
||||
key_metrics = await filter_data(key_metrics, ignore_keys)
|
||||
# Load and filter data for each statement type
|
||||
|
||||
ratios = await load_json_from_file(statements[0])
|
||||
ratios = await filter_data(ratios, ignore_keys)
|
||||
|
||||
#Threshold of enough datapoints needed!
|
||||
if len(ratios) < 50:
|
||||
return
|
||||
|
||||
key_metrics = await load_json_from_file(statements[1])
|
||||
key_metrics = await filter_data(key_metrics, ignore_keys)
|
||||
|
||||
|
||||
cashflow = await load_json_from_file(statements[2])
|
||||
cashflow = await filter_data(cashflow, ignore_keys)
|
||||
cashflow = await load_json_from_file(statements[2])
|
||||
cashflow = await filter_data(cashflow, ignore_keys)
|
||||
|
||||
income = await load_json_from_file(statements[3])
|
||||
income = await filter_data(income, ignore_keys)
|
||||
income = await load_json_from_file(statements[3])
|
||||
income = await filter_data(income, ignore_keys)
|
||||
|
||||
balance = await load_json_from_file(statements[4])
|
||||
balance = await filter_data(balance, ignore_keys)
|
||||
balance = await load_json_from_file(statements[4])
|
||||
balance = await filter_data(balance, ignore_keys)
|
||||
|
||||
income_growth = await load_json_from_file(statements[5])
|
||||
income_growth = await filter_data(income_growth, ignore_keys)
|
||||
income_growth = await load_json_from_file(statements[5])
|
||||
income_growth = await filter_data(income_growth, ignore_keys)
|
||||
|
||||
balance_growth = await load_json_from_file(statements[6])
|
||||
balance_growth = await filter_data(balance_growth, ignore_keys)
|
||||
balance_growth = await load_json_from_file(statements[6])
|
||||
balance_growth = await filter_data(balance_growth, ignore_keys)
|
||||
|
||||
|
||||
cashflow_growth = await load_json_from_file(statements[7])
|
||||
cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
|
||||
cashflow_growth = await load_json_from_file(statements[7])
|
||||
cashflow_growth = await filter_data(cashflow_growth, ignore_keys)
|
||||
|
||||
owner_earnings = await load_json_from_file(statements[8])
|
||||
owner_earnings = await filter_data(owner_earnings, ignore_keys)
|
||||
owner_earnings = await load_json_from_file(statements[8])
|
||||
owner_earnings = await filter_data(owner_earnings, ignore_keys)
|
||||
|
||||
|
||||
# Combine all the data
|
||||
combined_data = defaultdict(dict)
|
||||
# Combine all the data
|
||||
combined_data = defaultdict(dict)
|
||||
|
||||
# Merge the data based on 'date'
|
||||
for entries in zip(ratios,key_metrics,income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
|
||||
for entry in entries:
|
||||
date = entry['date']
|
||||
for key, value in entry.items():
|
||||
if key not in combined_data[date]:
|
||||
combined_data[date][key] = value
|
||||
# Merge the data based on 'date'
|
||||
for entries in zip(ratios,key_metrics,income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
|
||||
for entry in entries:
|
||||
date = entry['date']
|
||||
for key, value in entry.items():
|
||||
if key not in combined_data[date]:
|
||||
combined_data[date][key] = value
|
||||
|
||||
combined_data = list(combined_data.values())
|
||||
combined_data = list(combined_data.values())
|
||||
|
||||
# Download historical stock data using yfinance
|
||||
df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
|
||||
df = df.rename(columns={'Adj Close': 'close', 'Date': 'date', 'Open': 'open', 'High': 'high', 'Low': 'low', 'Volume': 'volume'})
|
||||
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
|
||||
|
||||
df['sma_50'] = df['close'].rolling(window=50).mean()
|
||||
df['sma_200'] = df['close'].rolling(window=200).mean()
|
||||
df['sma_crossover'] = ((df['sma_50'] > df['sma_200']) & (df['sma_50'].shift(1) <= df['sma_200'].shift(1))).astype(int)
|
||||
|
||||
df['ema_50'] = EMAIndicator(close=df['close'], window=50).ema_indicator()
|
||||
df['ema_200'] = EMAIndicator(close=df['close'], window=200).ema_indicator()
|
||||
df['ema_crossover'] = ((df['ema_50'] > df['ema_200']) & (df['ema_50'].shift(1) <= df['ema_200'].shift(1))).astype(int)
|
||||
|
||||
ichimoku = IchimokuIndicator(high=df['high'], low=df['low'])
|
||||
df['ichimoku_a'] = ichimoku.ichimoku_a()
|
||||
df['ichimoku_b'] = ichimoku.ichimoku_b()
|
||||
df['atr'] = AverageTrueRange(high=df['high'], low=df['low'], close=df['close']).average_true_range()
|
||||
bb = BollingerBands(close=df['close'])
|
||||
df['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / df['close']
|
||||
|
||||
df['volatility'] = df['close'].rolling(window=30).std()
|
||||
df['daily_return'] = df['close'].pct_change()
|
||||
df['cumulative_return'] = (1 + df['daily_return']).cumprod() - 1
|
||||
df['volume_change'] = df['volume'].pct_change()
|
||||
df['roc'] = df['close'].pct_change(periods=60)
|
||||
df['avg_volume'] = df['volume'].rolling(window=60).mean()
|
||||
df['drawdown'] = df['close'] / df['close'].rolling(window=252).max() - 1
|
||||
# Download historical stock data using yfinance
|
||||
df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
|
||||
df = df.rename(columns={'Adj Close': 'close', 'Date': 'date', 'Open': 'open', 'High': 'high', 'Low': 'low', 'Volume': 'volume'})
|
||||
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
|
||||
|
||||
|
||||
df['macd'] = macd(df['close'])
|
||||
df['macd_signal'] = macd_signal(df['close'])
|
||||
df['macd_hist'] = 2*macd_diff(df['close'])
|
||||
df['adx'] = adx(df['high'],df['low'],df['close'])
|
||||
df["adx_pos"] = adx_pos(df['high'],df['low'],df['close'])
|
||||
df["adx_neg"] = adx_neg(df['high'],df['low'],df['close'])
|
||||
df['cci'] = CCIIndicator(high=df['high'], low=df['low'], close=df['close']).cci()
|
||||
df['mfi'] = MFIIndicator(high=df['high'], low=df['low'], close=df['close'], volume=df['volume']).money_flow_index()
|
||||
# Get the list of columns in df
|
||||
df_columns = df.columns
|
||||
df_stats = generate_statistical_features(df)
|
||||
df_ta = generate_ta_features(df)
|
||||
|
||||
df['nvi'] = NegativeVolumeIndexIndicator(close=df['close'], volume=df['volume']).negative_volume_index()
|
||||
df['obv'] = OnBalanceVolumeIndicator(close=df['close'], volume=df['volume']).on_balance_volume()
|
||||
df['vpt'] = VolumePriceTrendIndicator(close=df['close'], volume=df['volume']).volume_price_trend()
|
||||
# Filter columns in df_stats and df_ta that are not in df
|
||||
df_stats_filtered = df_stats.drop(columns=df_columns.intersection(df_stats.columns), errors='ignore')
|
||||
df_ta_filtered = df_ta.drop(columns=df_columns.intersection(df_ta.columns), errors='ignore')
|
||||
ta_columns = df_ta_filtered.columns.tolist()
|
||||
stats_columns = df_stats_filtered.columns.tolist()
|
||||
|
||||
df['rsi'] = rsi(df["close"], window=60)
|
||||
df['rolling_rsi'] = df['rsi'].rolling(window=10).mean()
|
||||
df['stoch_rsi'] = stochrsi_k(df['close'], window=60, smooth1=3, smooth2=3)
|
||||
df['rolling_stoch_rsi'] = df['stoch_rsi'].rolling(window=10).mean()
|
||||
|
||||
df['adi'] = acc_dist_index(high=df['high'],low=df['low'],close=df['close'],volume=df['volume'])
|
||||
df['cmf'] = chaikin_money_flow(high=df['high'],low=df['low'],close=df['close'],volume=df['volume'], window=20)
|
||||
df['emv'] = ease_of_movement(high=df['high'],low=df['low'],volume=df['volume'], window=20)
|
||||
df['fi'] = force_index(close=df['close'], volume=df['volume'], window= 13)
|
||||
|
||||
df['williams'] = WilliamsRIndicator(high=df['high'], low=df['low'], close=df['close']).williams_r()
|
||||
df['kama'] = KAMAIndicator(close=df['close']).kama()
|
||||
|
||||
df['stoch'] = stoch(df['high'], df['low'], df['close'], window=30)
|
||||
df['rocr'] = df['close'] / df['close'].shift(30) - 1 # Rate of Change Ratio (ROCR)
|
||||
df['ppo'] = (df['ema_50'] - df['ema_200']) / df['ema_50'] * 100
|
||||
df['vwap'] = (df['volume'] * (df['high'] + df['low'] + df['close']) / 3).cumsum() / df['volume'].cumsum()
|
||||
df['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std()
|
||||
|
||||
df['fdi'] = calculate_fdi(df['high'], df['low'], df['close'])
|
||||
df['tii'] = trend_intensity(df['close'])
|
||||
# Concatenate df with the filtered df_stats and df_ta
|
||||
df = pd.concat([df, df_ta_filtered, df_stats_filtered], axis=1)
|
||||
|
||||
|
||||
ta_indicators = [
|
||||
'rsi', 'macd', 'macd_signal', 'macd_hist', 'adx', 'adx_pos', 'adx_neg',
|
||||
'cci', 'mfi', 'nvi', 'obv', 'vpt', 'stoch_rsi','bb_width',
|
||||
'adi', 'cmf', 'emv', 'fi', 'williams', 'stoch','sma_crossover',
|
||||
'volatility','daily_return','cumulative_return', 'roc','avg_volume',
|
||||
'rolling_rsi','rolling_stoch_rsi', 'ema_crossover','ichimoku_a','ichimoku_b',
|
||||
'atr','kama','rocr','ppo','volatility_ratio','vwap','tii','fdi','drawdown',
|
||||
'volume_change'
|
||||
]
|
||||
# Match each combined data entry with the closest available stock price in df
|
||||
for item in combined_data:
|
||||
target_date = item['date']
|
||||
counter = 0
|
||||
max_attempts = 10
|
||||
|
||||
# Match each combined data entry with the closest available stock price in df
|
||||
for item in combined_data:
|
||||
target_date = item['date']
|
||||
counter = 0
|
||||
max_attempts = 10
|
||||
# Look for the closest matching date in the stock data
|
||||
while target_date not in df['date'].values and counter < max_attempts:
|
||||
target_date = (pd.to_datetime(target_date) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
|
||||
counter += 1
|
||||
|
||||
# Look for the closest matching date in the stock data
|
||||
while target_date not in df['date'].values and counter < max_attempts:
|
||||
target_date = (pd.to_datetime(target_date) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
|
||||
counter += 1
|
||||
# If max attempts are reached and no matching date is found, skip the entry
|
||||
if counter == max_attempts:
|
||||
continue
|
||||
|
||||
# If max attempts are reached and no matching date is found, skip the entry
|
||||
if counter == max_attempts:
|
||||
continue
|
||||
# Find the close price for the matching date
|
||||
close_price = round(df[df['date'] == target_date]['close'].values[0], 2)
|
||||
item['price'] = close_price
|
||||
|
||||
# Find the close price for the matching date
|
||||
close_price = round(df[df['date'] == target_date]['close'].values[0], 2)
|
||||
item['price'] = close_price
|
||||
# Dynamically add all indicator values to the combined_data entry
|
||||
|
||||
# Dynamically add all indicator values to the combined_data entry
|
||||
|
||||
for indicator in ta_indicators:
|
||||
indicator_value = df[df['date'] == target_date][indicator].values[0]
|
||||
item[indicator] = indicator_value # Add the indicator value to the combined_data entry
|
||||
for column in ta_columns:
|
||||
column_value = df[df['date'] == target_date][column].values[0]
|
||||
item[column] = column_value # Add the column value to the combined_data entry
|
||||
for column in stats_columns:
|
||||
column_value = df[df['date'] == target_date][column].values[0]
|
||||
item[column] = column_value # Add the column value to the combined_data entry
|
||||
|
||||
|
||||
# Sort the combined data by date
|
||||
combined_data = sorted(combined_data, key=lambda x: x['date'])
|
||||
# Convert combined data into a DataFrame
|
||||
df_combined = pd.DataFrame(combined_data).dropna()
|
||||
# Sort the combined data by date
|
||||
combined_data = sorted(combined_data, key=lambda x: x['date'])
|
||||
# Convert combined data into a DataFrame
|
||||
df_combined = pd.DataFrame(combined_data).dropna()
|
||||
'''
|
||||
fundamental_columns = [
|
||||
'revenue',
|
||||
'costOfRevenue',
|
||||
'grossProfit',
|
||||
'netIncome',
|
||||
'operatingIncome',
|
||||
'operatingExpenses',
|
||||
'researchAndDevelopmentExpenses',
|
||||
'ebitda',
|
||||
'freeCashFlow',
|
||||
'incomeBeforeTax',
|
||||
'incomeTaxExpense',
|
||||
'debtRepayment',
|
||||
'dividendsPaid',
|
||||
'depreciationAndAmortization',
|
||||
'netCashUsedProvidedByFinancingActivities',
|
||||
'changeInWorkingCapital',
|
||||
'stockBasedCompensation',
|
||||
'deferredIncomeTax',
|
||||
'commonStockRepurchased',
|
||||
'operatingCashFlow',
|
||||
'capitalExpenditure',
|
||||
'accountsReceivables',
|
||||
'purchasesOfInvestments',
|
||||
'cashAndCashEquivalents',
|
||||
'shortTermInvestments',
|
||||
'cashAndShortTermInvestments',
|
||||
'longTermInvestments',
|
||||
'otherCurrentLiabilities',
|
||||
'totalCurrentLiabilities',
|
||||
'longTermDebt',
|
||||
'totalDebt',
|
||||
'netDebt',
|
||||
'commonStock',
|
||||
'totalEquity',
|
||||
'totalLiabilitiesAndStockholdersEquity',
|
||||
'totalStockholdersEquity',
|
||||
'totalInvestments',
|
||||
'taxAssets',
|
||||
'totalAssets',
|
||||
'inventory',
|
||||
'propertyPlantEquipmentNet',
|
||||
'ownersEarnings',
|
||||
]
|
||||
|
||||
# Compute ratios for all combinations of key elements
|
||||
new_columns = {}
|
||||
# Loop over combinations of column pairs
|
||||
for columns in [fundamental_columns]:
|
||||
for num, denom in combinations(columns, 2):
|
||||
# Compute ratio and reverse ratio
|
||||
ratio = df_combined[num] / df_combined[denom]
|
||||
reverse_ratio = round(df_combined[denom] / df_combined[num],2)
|
||||
|
||||
# Define column names for both ratios
|
||||
column_name = f'{num}_to_{denom}'
|
||||
reverse_column_name = f'{denom}_to_{num}'
|
||||
|
||||
# Store the new columns in the dictionary, replacing invalid values with 0
|
||||
new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
|
||||
new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)
|
||||
|
||||
# Add all new columns to the original DataFrame at once
|
||||
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
|
||||
'''
|
||||
# To defragment the DataFrame, make a copy
|
||||
df_combined = df_combined.copy()
|
||||
df_combined = df_combined.dropna()
|
||||
df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0)
|
||||
|
||||
|
||||
key_elements = [
|
||||
'revenue',
|
||||
'costOfRevenue',
|
||||
'grossProfit',
|
||||
'netIncome',
|
||||
'operatingIncome',
|
||||
'operatingExpenses',
|
||||
'researchAndDevelopmentExpenses',
|
||||
'ebitda',
|
||||
'freeCashFlow',
|
||||
'incomeBeforeTax',
|
||||
'incomeTaxExpense',
|
||||
'debtRepayment',
|
||||
'dividendsPaid',
|
||||
'depreciationAndAmortization',
|
||||
'netCashUsedProvidedByFinancingActivities',
|
||||
'changeInWorkingCapital',
|
||||
'stockBasedCompensation',
|
||||
'deferredIncomeTax',
|
||||
'commonStockRepurchased',
|
||||
'operatingCashFlow',
|
||||
'capitalExpenditure',
|
||||
'accountsReceivables',
|
||||
'purchasesOfInvestments',
|
||||
'cashAndCashEquivalents',
|
||||
'shortTermInvestments',
|
||||
'cashAndShortTermInvestments',
|
||||
'longTermInvestments',
|
||||
'otherCurrentLiabilities',
|
||||
'totalCurrentLiabilities',
|
||||
'longTermDebt',
|
||||
'totalDebt',
|
||||
'netDebt',
|
||||
'commonStock',
|
||||
'totalEquity',
|
||||
'totalLiabilitiesAndStockholdersEquity',
|
||||
'totalStockholdersEquity',
|
||||
'totalInvestments',
|
||||
'taxAssets',
|
||||
'totalAssets',
|
||||
'inventory',
|
||||
'propertyPlantEquipmentNet',
|
||||
'ownersEarnings',
|
||||
]
|
||||
# Compute ratios for all combinations of key elements
|
||||
df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int)
|
||||
|
||||
new_columns = {}
|
||||
df_copy = df_combined.copy()
|
||||
df_copy = df_copy.map(lambda x: round(x, 2) if isinstance(x, float) else x)
|
||||
|
||||
# Loop over combinations of column pairs
|
||||
for num, denom in combinations(key_elements, 2):
|
||||
# Compute ratio and reverse ratio
|
||||
ratio = df_combined[num] / df_combined[denom]
|
||||
reverse_ratio = df_combined[denom] / df_combined[num]
|
||||
if df_copy.shape[0] > 0:
|
||||
with open(file_path, 'wb') as file:
|
||||
file.write(orjson.dumps(df_copy.to_dict(orient='records')))
|
||||
|
||||
# Define column names for both ratios
|
||||
column_name = f'{num}_to_{denom}'
|
||||
reverse_column_name = f'{denom}_to_{num}'
|
||||
return df_copy
|
||||
|
||||
# Store the new columns in the dictionary, replacing invalid values with 0
|
||||
new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
|
||||
new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)
|
||||
|
||||
# Add all new columns to the original DataFrame at once
|
||||
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
|
||||
|
||||
|
||||
# To defragment the DataFrame, make a copy
|
||||
df_combined = df_combined.copy()
|
||||
|
||||
|
||||
# Create 'Target' column based on price change
|
||||
df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int)
|
||||
|
||||
# Return a copy of the combined DataFrame
|
||||
df_combined = df_combined.dropna()
|
||||
df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0)
|
||||
df_copy = df_combined.copy()
|
||||
|
||||
return df_copy
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pass
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pass
|
||||
|
||||
|
||||
async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
|
||||
@@ -327,8 +298,6 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
|
||||
|
||||
return results
|
||||
|
||||
|
||||
|
||||
async def warm_start_training(tickers, con):
|
||||
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
|
||||
end_date = datetime.today().strftime("%Y-%m-%d")
|
||||
@@ -336,7 +305,7 @@ async def warm_start_training(tickers, con):
|
||||
df_test = pd.DataFrame()
|
||||
test_size = 0.2
|
||||
|
||||
dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=10)
|
||||
dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=1)
|
||||
|
||||
train_list = []
|
||||
test_list = []
|
||||
@@ -356,13 +325,14 @@ async def warm_start_training(tickers, con):
|
||||
# Concatenate all at once outside the loop
|
||||
df_train = pd.concat(train_list, ignore_index=True)
|
||||
df_test = pd.concat(test_list, ignore_index=True)
|
||||
df_train = df_train.sample(frac=1).reset_index(drop=True)
|
||||
df_test = df_test.sample(frac=1).reset_index(drop=True)
|
||||
|
||||
print('======Warm Start Train Set Datapoints======')
|
||||
df_train = df_train.sample(frac=1).reset_index(drop=True) #df_train.reset_index(drop=True)
|
||||
print(len(df_train))
|
||||
|
||||
predictor = ScorePredictor()
|
||||
selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']]
|
||||
selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200)
|
||||
predictor.warm_start_training(df_train[selected_features], df_train['Target'])
|
||||
predictor.evaluate_model(df_test[selected_features], df_test['Target'])
|
||||
|
||||
@@ -370,7 +340,7 @@ async def warm_start_training(tickers, con):
|
||||
|
||||
async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
|
||||
try:
|
||||
df = await download_data(ticker, con, start_date, end_date)
|
||||
df = await download_data(ticker,con, start_date, end_date)
|
||||
if df is None or len(df) == 0:
|
||||
print(f"No data available for {ticker}")
|
||||
return
|
||||
@@ -380,7 +350,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
|
||||
train_data = df.iloc[:split_size]
|
||||
test_data = df.iloc[split_size:]
|
||||
|
||||
selected_features = [col for col in df.columns if col not in ['date','price','Target']]
|
||||
selected_features = top_uncorrelated_features(train_data,top_n=50) #[col for col in train_data if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(train_data,top_n=20)
|
||||
# Fine-tune the model
|
||||
predictor = ScorePredictor()
|
||||
predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
|
||||
@@ -402,25 +372,27 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
del predictor  # Explicitly delete the predictor to aid garbage collection

async def run():
train_mode = False # Set this to False for fine-tuning and evaluation
train_mode = True # Set this to False for fine-tuning and evaluation
con = sqlite3.connect('stocks.db')
cursor = con.cursor()
cursor.execute("PRAGMA journal_mode = wal")

if train_mode:
# Warm start training
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
warm_start_symbols = [row[0] for row in cursor.fetchall()]
print('Warm Start Training for:', warm_start_symbols)
predictor = await warm_start_training(warm_start_symbols, con)
else:
# Fine-tuning and evaluation for all stocks
cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
stock_symbols = [row[0] for row in cursor.fetchall()]
stock_symbols = ['AWR'] #[row[0] for row in cursor.fetchall()]

print(f"Total tickers for fine-tuning: {len(stock_symbols)}")
start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
end_date = datetime.today().strftime("%Y-%m-%d")


tasks = []
for ticker in tqdm(stock_symbols):
await fine_tune_and_evaluate(ticker, con, start_date, end_date)

@@ -1,4 +1,3 @@
|
||||
from benzinga import financial_data
|
||||
import requests
|
||||
from datetime import datetime
|
||||
import numpy as np
|
||||
@@ -11,12 +10,12 @@ from dotenv import load_dotenv
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
import aiohttp
|
||||
import asyncio
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv('BENZINGA_API_KEY')
|
||||
|
||||
fin = financial_data.Benzinga(api_key)
|
||||
|
||||
headers = {"accept": "application/json"}
|
||||
|
||||
|
||||
@@ -58,7 +57,7 @@ def calculate_rating(data):
|
||||
last_rating_date = datetime.strptime(last_rating, "%Y-%m-%d")
|
||||
difference = (datetime.now() - last_rating_date).days
|
||||
except:
|
||||
difference = 1000 # In case of None
|
||||
difference = 1000 # In case of None or invalid date
|
||||
|
||||
if total_ratings == 0 or difference >= 600:
|
||||
return 0
|
||||
@@ -80,6 +79,7 @@ def calculate_rating(data):
|
||||
max_rating = 5
|
||||
normalized_rating = min(max(weighted_sum / (weight_return + weight_success_rate + weight_total_ratings + weight_difference), min_rating), max_rating)
|
||||
|
||||
# Apply additional conditions based on total ratings and average return
|
||||
if normalized_rating >= 4:
|
||||
if total_ratings < 10:
|
||||
normalized_rating -= 2.4
|
||||
@@ -89,78 +89,18 @@ def calculate_rating(data):
normalized_rating -= 0.75
elif total_ratings < 30:
normalized_rating -= 1
elif overall_average_return <=10:
normalized_rating -=1.1
'''
if overall_average_return <= 0 and overall_average_return >= -5:
normalized_rating = min(normalized_rating - 2, 0)
elif overall_average_return < -5 and overall_average_return >= -10:
normalized_rating = min(normalized_rating - 3, 0)
else:
normalized_rating = min(normalized_rating - 4, 0)
'''
if overall_average_return <= 0:
normalized_rating = min(normalized_rating - 2, 0)
elif overall_average_return <= 10:
normalized_rating -= 1.1

normalized_rating = max(normalized_rating, 0)
if overall_average_return <= 0:
normalized_rating = max(normalized_rating - 2, 0)

# Cap the rating if the last rating is older than 30 days
if difference > 30:
normalized_rating = min(normalized_rating, 4.5)

return round(normalized_rating, 2)

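As a quick orientation for the reworked scoring above: elsewhere in this commit the helper is fed a small per-analyst stats dict (see process_analyst further down). A minimal, hedged sketch of that call shape; the numbers are invented for illustration:

# Hypothetical analyst stats in the shape process_analyst() builds later in this file.
stats_dict = {
    'avgReturn': 14.2,           # overall average return
    'successRate': 0.63,         # overall success rate
    'totalRatings': 42,          # true total ratings taken from the ratings history
    'lastRating': '2024-05-10',  # date of the most recent rating
}

# Returns the rounded analyst score that is stored as item['analystScore'].
print(calculate_rating(stats_dict))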
def get_analyst_ratings(analyst_id):
|
||||
|
||||
url = "https://api.benzinga.com/api/v2.1/calendar/ratings"
|
||||
res_list = []
|
||||
|
||||
for page in range(0,5):
|
||||
try:
|
||||
querystring = {"token":api_key,"parameters[analyst_id]": analyst_id, "page": str(page), "pagesize":"1000"}
|
||||
response = requests.request("GET", url, headers=headers, params=querystring)
|
||||
data = ujson.loads(response.text)['ratings']
|
||||
res_list +=data
|
||||
time.sleep(2)
|
||||
except:
|
||||
break
|
||||
|
||||
return res_list
|
||||
|
||||
def get_all_analyst_stats():
|
||||
url = "https://api.benzinga.com/api/v2.1/calendar/ratings/analysts"
|
||||
res_list = []
|
||||
for _ in range(0,20): #Run the api N times because not all analyst are counted Bug from benzinga
|
||||
for page in range(0,100):
|
||||
try:
|
||||
querystring = {"token":api_key,"page": f"{page}", 'pagesize': "1000"}
|
||||
response = requests.request("GET", url, headers=headers, params=querystring)
|
||||
|
||||
data = ujson.loads(response.text)['analyst_ratings_analyst']
|
||||
res_list+=data
|
||||
except:
|
||||
break
|
||||
time.sleep(5)
|
||||
|
||||
res_list = remove_duplicates(res_list, 'id') # remove duplicates of analyst
|
||||
res_list = [item for item in res_list if item.get('ratings_accuracy', {}).get('total_ratings', 0) != 0]
|
||||
|
||||
final_list = []
|
||||
for item in res_list:
|
||||
analyst_dict = {
|
||||
'analystName': item['name_full'],
|
||||
'companyName': item['firm_name'],
|
||||
'analystId': item['id'],
|
||||
'firmId': item['firm_id']
|
||||
}
|
||||
|
||||
stats_dict = {
|
||||
'avgReturn': item['ratings_accuracy'].get('overall_average_return', 0),
|
||||
'successRate': item['ratings_accuracy'].get('overall_success_rate', 0),
|
||||
'totalRatings': item['ratings_accuracy'].get('total_ratings', 0),
|
||||
}
|
||||
|
||||
final_list.append({**analyst_dict,**stats_dict})
|
||||
|
||||
|
||||
return final_list
|
||||
|
||||
def get_top_stocks():
|
||||
with open(f"json/analyst/all-analyst-data.json", 'r') as file:
|
||||
analyst_stats_list = ujson.load(file)
|
||||
@@ -217,24 +157,97 @@ def get_top_stocks():
|
||||
ujson.dump(result_sorted, file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
async def get_analyst_ratings(analyst_id, session):
|
||||
url = "https://api.benzinga.com/api/v2.1/calendar/ratings"
|
||||
res_list = []
|
||||
|
||||
for page in range(5):
|
||||
try:
|
||||
querystring = {
|
||||
"token": api_key,
|
||||
"parameters[analyst_id]": analyst_id,
|
||||
"page": str(page),
|
||||
"pagesize": "1000"
|
||||
}
|
||||
async with session.get(url, headers=headers, params=querystring) as response:
|
||||
data = await response.json()
|
||||
ratings = data.get('ratings', [])
|
||||
if not ratings:
|
||||
break # Stop fetching if no more ratings
|
||||
res_list += ratings
|
||||
except Exception as e:
|
||||
#print(f"Error fetching page {page} for analyst {analyst_id}: {e}")
|
||||
break
|
||||
|
||||
return res_list
|
||||
|
||||
async def get_all_analyst_stats():
|
||||
url = "https://api.benzinga.com/api/v2.1/calendar/ratings/analysts"
|
||||
res_list = []
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tasks = [
|
||||
session.get(url, headers=headers, params={"token": api_key, "page": str(page), 'pagesize': "1000"})
|
||||
for page in range(100)
|
||||
]
|
||||
|
||||
# Gather responses concurrently
|
||||
responses = await asyncio.gather(*tasks)
|
||||
|
||||
# Process each response
|
||||
for response in responses:
|
||||
if response.status == 200: # Check for successful response
|
||||
try:
|
||||
data = ujson.loads(await response.text())['analyst_ratings_analyst']
|
||||
res_list += data
|
||||
except Exception as e:
|
||||
pass
|
||||
print(len(res_list))
|
||||
# Remove duplicates of analysts and filter based on ratings accuracy
|
||||
res_list = remove_duplicates(res_list, 'id')
|
||||
res_list = [item for item in res_list if item.get('ratings_accuracy', {}).get('total_ratings', 0) != 0]
|
||||
|
||||
# Construct the final result list
|
||||
final_list = [{
|
||||
'analystName': item['name_full'],
|
||||
'companyName': item['firm_name'],
|
||||
'analystId': item['id'],
|
||||
'firmId': item['firm_id'],
|
||||
'avgReturn': item['ratings_accuracy'].get('overall_average_return', 0),
|
||||
'successRate': item['ratings_accuracy'].get('overall_success_rate', 0),
|
||||
'totalRatings': item['ratings_accuracy'].get('total_ratings', 0),
|
||||
} for item in res_list]
|
||||
|
||||
return final_list
|
||||
|
||||
async def process_analyst(item, session):
|
||||
data = await get_analyst_ratings(item['analystId'], session)
|
||||
item['ratingsList'] = data
|
||||
item['totalRatings'] = len(data) # True total ratings
|
||||
item['lastRating'] = data[0]['date'] if data else None
|
||||
item['numOfStocks'] = len({d['ticker'] for d in data})
|
||||
|
||||
# Stats dictionary for calculating score
|
||||
stats_dict = {
|
||||
'avgReturn': item.get('avgReturn', 0),
|
||||
'successRate': item.get('successRate', 0),
|
||||
'totalRatings': item['totalRatings'],
|
||||
'lastRating': item['lastRating'],
|
||||
}
|
||||
item['analystScore'] = calculate_rating(stats_dict)
|
||||
|
||||
async def get_single_analyst_data(analyst_list):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tasks = [process_analyst(item, session) for item in analyst_list]
|
||||
for task in tqdm(asyncio.as_completed(tasks), total=len(analyst_list)):
|
||||
await task
|
||||
|
||||
async def run():
|
||||
# Step 1: get all analyst IDs and their stats
|
||||
analyst_list = get_all_analyst_stats()
|
||||
analyst_list = await get_all_analyst_stats()
|
||||
print('Number of analyst:', len(analyst_list))
|
||||
# Step 2: get the rating history for each individual analyst and score them
|
||||
for item in tqdm(analyst_list):
|
||||
data = get_analyst_ratings(item['analystId'])
|
||||
item['ratingsList'] = data
|
||||
item['totalRatings'] = len(data) #true total ratings, which is important for the score
|
||||
item['lastRating'] = data[0]['date'] if len(data) > 0 else None
|
||||
item['numOfStocks'] = len({item['ticker'] for item in data})
|
||||
stats_dict = {
|
||||
'avgReturn': item.get('avgReturn', 0),
|
||||
'successRate': item.get('successRate', 0),
|
||||
'totalRatings': item.get('totalRatings', 0),
|
||||
'lastRating': item.get('lastRating', None),
|
||||
}
|
||||
item['analystScore'] = calculate_rating(stats_dict)
|
||||
await get_single_analyst_data(analyst_list)
|
||||
|
||||
try:
|
||||
con = sqlite3.connect('stocks.db')
|
||||
@@ -279,9 +292,8 @@ if __name__ == "__main__":
|
||||
'successRate': item['successRate'],
|
||||
'avgReturn': item['avgReturn'],
|
||||
'totalRatings': item['totalRatings'],
|
||||
'lastRating': item['lastRating'],
|
||||
'mainSectors': item['mainSectors']
|
||||
})
|
||||
'lastRating': item['lastRating']
|
||||
})
|
||||
|
||||
with open(f"json/analyst/top-analysts.json", 'w') as file:
|
||||
ujson.dump(top_analysts_list, file)
|
||||
@@ -292,3 +304,7 @@ if __name__ == "__main__":
|
||||
|
||||
#Save top stocks with strong buys from 5 star analysts
|
||||
get_top_stocks()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run())
|
||||
@@ -220,12 +220,12 @@ def run(chunk,analyst_list):
|
||||
|
||||
|
||||
try:
|
||||
stock_con = sqlite3.connect('stocks.db')
|
||||
stock_cursor = stock_con.cursor()
|
||||
stock_cursor.execute("SELECT DISTINCT symbol FROM stocks")
|
||||
con = sqlite3.connect('stocks.db')
|
||||
stock_cursor = con.cursor()
|
||||
stock_cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
|
||||
stock_symbols = [row[0] for row in stock_cursor.fetchall()]
|
||||
|
||||
stock_con.close()
|
||||
con.close()
|
||||
|
||||
#Save all analyst data in raw form for the next step
|
||||
with open(f"json/analyst/all-analyst-data.json", 'r') as file:
|
||||
|
||||
@@ -1,7 +1,6 @@
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
@@ -20,13 +19,9 @@ class ScorePredictor:
self.scaler = MinMaxScaler()
self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
self.model = XGBClassifier(
n_estimators=200, # Increased from 100 due to problem complexity
max_depth=6, # Reduced to prevent overfitting with many features
learning_rate=0.1, # Added to control the learning process
colsample_bytree=0.8, # Added to randomly sample columns for each tree
subsample=0.8, # Added to randomly sample training data
reg_alpha=1, # L1 regularization to handle many features
reg_lambda=1, # L2 regularization to handle many features
n_estimators=200,
max_depth=10,
learning_rate=0.1,
random_state=42,
n_jobs=10
)

BIN app/utils/__pycache__/feature_engineering.cpython-310.pyc (new file, binary file not shown)
204 app/utils/feature_engineering.py (new file)
@@ -0,0 +1,204 @@
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import RobustScaler
from ta.momentum import *
from ta.trend import *
from ta.volatility import *
from ta.volume import *



def trend_intensity(close, window=20):
ma = close.rolling(window=window).mean()
std = close.rolling(window=window).std()
return ((close - ma) / std).abs().rolling(window=window).mean()


def calculate_fdi(high, low, close, window=30):
n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) -
np.log(close.rolling(window=window).max() - close.rolling(window=window).min())) / np.log(2)
return (2 - n1) * 100


def generate_ta_features(df):
|
||||
|
||||
df_features = df.copy()
|
||||
|
||||
df_features['sma_50'] = df['close'].rolling(window=50).mean()
|
||||
df_features['sma_200'] = df['close'].rolling(window=200).mean()
|
||||
df_features['sma_crossover'] = ((df_features['sma_50'] > df_features['sma_200']) & (df_features['sma_50'].shift(1) <= df_features['sma_200'].shift(1))).astype(int)
|
||||
|
||||
df_features['ema_50'] = EMAIndicator(close=df['close'], window=50).ema_indicator()
|
||||
df_features['ema_200'] = EMAIndicator(close=df['close'], window=200).ema_indicator()
|
||||
df_features['ema_crossover'] = ((df_features['ema_50'] > df_features['ema_200']) & (df_features['ema_50'].shift(1) <= df_features['ema_200'].shift(1))).astype(int)
|
||||
|
||||
df_features['wma'] = WMAIndicator(df['close'], window = 30).wma()
|
||||
|
||||
ichimoku = IchimokuIndicator(high=df['high'], low=df['low'])
|
||||
df_features['ichimoku_a'] = ichimoku.ichimoku_a()
|
||||
df_features['ichimoku_b'] = ichimoku.ichimoku_b()
|
||||
df_features['atr'] = AverageTrueRange(high=df['high'], low=df['low'], close=df['close']).average_true_range()
|
||||
bb = BollingerBands(close=df['close'])
|
||||
df_features['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / df['close']
|
||||
|
||||
|
||||
df_features['macd'] = macd(df['close'])
|
||||
df_features['macd_signal'] = macd_signal(df['close'])
|
||||
df_features['macd_hist'] = 2*macd_diff(df['close'])
|
||||
df_features['adx'] = adx(df['high'],df['low'],df['close'])
|
||||
df_features["adx_pos"] = adx_pos(df['high'],df['low'],df['close'])
|
||||
df_features["adx_neg"] = adx_neg(df['high'],df['low'],df['close'])
|
||||
df_features['cci'] = CCIIndicator(high=df['high'], low=df['low'], close=df['close']).cci()
|
||||
df_features['mfi'] = MFIIndicator(high=df['high'], low=df['low'], close=df['close'], volume=df['volume']).money_flow_index()
|
||||
|
||||
df_features['nvi'] = NegativeVolumeIndexIndicator(close=df['close'], volume=df['volume']).negative_volume_index()
|
||||
df_features['obv'] = OnBalanceVolumeIndicator(close=df['close'], volume=df['volume']).on_balance_volume()
|
||||
df_features['vpt'] = VolumePriceTrendIndicator(close=df['close'], volume=df['volume']).volume_price_trend()
|
||||
|
||||
df_features['rsi'] = rsi(df["close"], window=60)
|
||||
df_features['rolling_rsi'] = df_features['rsi'].rolling(window=10).mean()
|
||||
df_features['stoch_rsi'] = stochrsi_k(df['close'], window=60, smooth1=3, smooth2=3)
|
||||
df_features['rolling_stoch_rsi'] = df_features['stoch_rsi'].rolling(window=10).mean()
|
||||
|
||||
df_features['adi'] = acc_dist_index(high=df['high'],low=df['low'],close=df['close'],volume=df['volume'])
|
||||
df_features['cmf'] = chaikin_money_flow(high=df['high'],low=df['low'],close=df['close'],volume=df['volume'], window=20)
|
||||
df_features['emv'] = ease_of_movement(high=df['high'],low=df['low'],volume=df['volume'], window=20)
|
||||
df_features['fi'] = force_index(close=df['close'], volume=df['volume'], window= 13)
|
||||
|
||||
df_features['williams'] = WilliamsRIndicator(high=df['high'], low=df['low'], close=df['close']).williams_r()
|
||||
df_features['kama'] = KAMAIndicator(close=df['close']).kama()
|
||||
|
||||
stoch = StochasticOscillator(high=df['high'], low=df['low'], close=df['close'], window=60, smooth_window=3)
|
||||
df_features['stoch_k'] = stoch.stoch()
|
||||
df_features['stoch_d'] = stoch.stoch_signal()
|
||||
|
||||
df_features['rocr'] = df['close'] / df['close'].shift(30) - 1 # Rate of Change Ratio (ROCR)
|
||||
df_features['ppo'] = (df_features['ema_50'] - df_features['ema_200']) / df_features['ema_50'] * 100
|
||||
df_features['vwap'] = (df['volume'] * (df['high'] + df['low'] + df['close']) / 3).cumsum() / df['volume'].cumsum()
|
||||
df_features['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std()
|
||||
|
||||
df_features['fdi'] = calculate_fdi(df['high'], df['low'], df['close'])
|
||||
df_features['tii'] = trend_intensity(df['close'])
|
||||
|
||||
df_features['fft'] = np.abs(np.fft.fft(df['close']))
|
||||
don_channel = DonchianChannel(high=df['high'], low=df['low'],close=df['close'], window=60)
|
||||
df_features['don_hband'] = don_channel.donchian_channel_hband()
|
||||
df_features['don_lband'] = don_channel.donchian_channel_lband()
|
||||
df_features['don_mband'] = don_channel.donchian_channel_mband()
|
||||
df_features['don_pband'] = don_channel.donchian_channel_pband()
|
||||
df_features['don_wband'] = don_channel.donchian_channel_wband()
|
||||
|
||||
aroon = AroonIndicator(high=df['high'], low=df['low'], window=60)
|
||||
df_features['aroon_down'] = aroon.aroon_down()
|
||||
df_features['aroon_indicator'] = aroon.aroon_indicator()
|
||||
df_features['aroon_up'] = aroon.aroon_up()
|
||||
|
||||
df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator()
|
||||
df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14)
|
||||
df_features['ulcer'] = UlcerIndex(df['close'],window=60).ulcer_index()
|
||||
df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
|
||||
df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
|
||||
|
||||
df_features = df_features.dropna()
|
||||
return df_features
|
||||
|
||||
def generate_statistical_features(df, windows=[20, 50], price_col='close',
|
||||
high_col='high', low_col='low', volume_col='volume'):
|
||||
"""
|
||||
Generate comprehensive statistical features for financial time series data.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
df : pandas.DataFrame
|
||||
DataFrame containing the price and volume data
|
||||
windows : list
|
||||
List of rolling window sizes to use for feature generation
|
||||
price_col : str
|
||||
Name of the closing price column
|
||||
high_col : str
|
||||
Name of the high price column
|
||||
low_col : str
|
||||
Name of the low price column
|
||||
volume_col : str
|
||||
Name of the volume column
|
||||
|
||||
Returns:
|
||||
--------
|
||||
pandas.DataFrame
|
||||
DataFrame with additional statistical features
|
||||
"""
|
||||
|
||||
# Create a copy of the dataframe to avoid modifying the original
|
||||
df_features = df.copy()
|
||||
|
||||
|
||||
# Calculate features for each window size
|
||||
for window in windows:
|
||||
# Returns
|
||||
df_features[f'returns_{window}'] = df[price_col].pct_change(periods=window)
|
||||
|
||||
# Log returns and statistics
|
||||
log_returns = np.log(df[price_col]/df[price_col].shift(1))
|
||||
df_features[f'log_returns_{window}'] = log_returns.rolling(window=window).mean()
|
||||
df_features[f'log_returns_std_{window}'] = log_returns.rolling(window=window).std()
|
||||
|
||||
# Statistical moments
|
||||
df_features[f'std_{window}'] = df[price_col].rolling(window=window).std()
|
||||
df_features[f'var_{window}'] = df[price_col].rolling(window=window).var()
|
||||
df_features[f'skew_{window}'] = df[price_col].rolling(window=window).skew()
|
||||
df_features[f'kurt_{window}'] = df[price_col].rolling(window=window).kurt()
|
||||
|
||||
# Volatility measures
|
||||
df_features[f'realized_vol_{window}'] = (
|
||||
df_features[f'returns_{window}'].rolling(window=window).std() * np.sqrt(252))
|
||||
df_features[f'range_vol_{window}'] = (
|
||||
(df[high_col].rolling(window=window).max() -
|
||||
df[low_col].rolling(window=window).min()) / df[price_col])
|
||||
|
||||
# Z-scores and normalized prices
|
||||
df_features[f'zscore_{window}'] = (
|
||||
(df[price_col] - df[price_col].rolling(window=window).mean()) /
|
||||
df[price_col].rolling(window=window).std())
|
||||
df_features[f'norm_price_{window}'] = (
|
||||
df[price_col] / df[price_col].rolling(window=window).mean() - 1)
|
||||
|
||||
|
||||
# Correlation features
|
||||
if volume_col in df.columns:
|
||||
df_features[f'volume_price_corr_{window}'] = (
|
||||
df[price_col].rolling(window=window).corr(df[volume_col]))
|
||||
df_features[f'high_low_corr_{window}'] = (
|
||||
df[high_col].rolling(window=window).corr(df[low_col]))
|
||||
|
||||
|
||||
|
||||
# Quantile features
|
||||
for q in [0.25, 0.75]:
|
||||
df_features[f'price_q{int(q*100)}_{window}'] = (
|
||||
df[price_col].rolling(window=window).quantile(q))
|
||||
|
||||
# Price dynamics
|
||||
df_features['price_acceleration'] = df[price_col].diff().diff()
|
||||
df_features['momentum_change'] = df[price_col].pct_change().diff()
|
||||
|
||||
# Advanced volatility
|
||||
df_features['parkinson_vol'] = np.sqrt(
|
||||
1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))
|
||||
|
||||
# Efficiency ratio
|
||||
df_features['price_efficiency'] = (
|
||||
abs(df[price_col] - df[price_col].shift(20)) /
|
||||
(df[high_col].rolling(20).max() - df[low_col].rolling(20).min())
|
||||
)
|
||||
|
||||
# Deviation metrics
|
||||
df_features['deviation_from_vwap'] = (
|
||||
(df[price_col] - df[price_col].rolling(window=20).mean()) /
|
||||
df[price_col].rolling(window=20).mean()
|
||||
)
|
||||
|
||||
df_features['stock_return'] = df['close'].pct_change()
|
||||
|
||||
df_features = df_features.dropna()
|
||||
return df_features
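
To see how these helpers plug into the cron job above: the training script downloads daily OHLCV data with yfinance, renames the columns to lowercase, and then calls generate_ta_features and generate_statistical_features on that frame. A minimal, hedged sketch of that flow; the ticker and date range are placeholders, and in the actual job the symbols come from stocks.db:

import yfinance as yf

# Placeholder ticker/date range; the cron job loops over symbols from stocks.db.
df = yf.download("AAPL", start="2015-01-01", end="2024-01-01", interval="1d").reset_index()
df = df.rename(columns={'Adj Close': 'close', 'Date': 'date', 'Open': 'open',
                        'High': 'high', 'Low': 'low', 'Volume': 'volume'})
df['date'] = df['date'].dt.strftime('%Y-%m-%d')

# Both helpers return a copy with the new columns appended and NaN warm-up rows dropped.
df_ta = generate_ta_features(df)
df_stats = generate_statistical_features(df, windows=[20, 50])

print(df_ta.shape, df_stats.shape)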