update analyst cron job

MuslemRahimi 2024-10-04 20:01:23 +02:00
parent 5e802c117a
commit 144a79f00a
7 changed files with 570 additions and 383 deletions

View File

@ -12,12 +12,9 @@ from tqdm import tqdm
import concurrent.futures
import re
from itertools import combinations
import os
from ta.momentum import *
from ta.trend import *
from ta.volatility import *
from ta.volume import *
import gc
from utils.feature_engineering import *

#Enable automatic garbage collection
gc.enable()
@ -25,288 +22,262 @@ async def save_json(symbol, data):
    with open(f"json/ai-score/companies/{symbol}.json", 'wb') as file:
        file.write(orjson.dumps(data))

def top_uncorrelated_features(df, target_col='Target', top_n=10, threshold=0.75):
    # Drop the columns to exclude from the DataFrame
    df_filtered = df.drop(columns=['date', 'price'])

    # Compute the correlation matrix
    correlation_matrix = df_filtered.corr()

    # Get the correlations with the target column, sorted by absolute value
    correlations_with_target = correlation_matrix[target_col].drop(target_col).abs().sort_values(ascending=False)

    # Initialize the list of selected features
    selected_features = []

    # Iteratively select the most correlated features while minimizing correlation with each other
    for feature in correlations_with_target.index:
        # If we already have enough features, break
        if len(selected_features) >= top_n:
            break

        # Check correlation of this feature with already selected features
        is_uncorrelated = True
        for selected in selected_features:
            if abs(correlation_matrix.loc[feature, selected]) > threshold:
                is_uncorrelated = False
                break

        # If it's uncorrelated with the selected features, add it to the list
        if is_uncorrelated:
            selected_features.append(feature)

    return selected_features

async def download_data(ticker, con, start_date, end_date):
    file_path = f"ml_models/training_data/ai-score/{ticker}.json"

    if os.path.exists(file_path):
        with open(file_path, 'rb') as file:
            return pd.DataFrame(orjson.loads(file.read()))
    else:
        try:
            # Define paths to the statement files
            statements = [
                f"json/financial-statements/ratios/quarter/{ticker}.json",
                f"json/financial-statements/key-metrics/quarter/{ticker}.json",
                f"json/financial-statements/cash-flow-statement/quarter/{ticker}.json",
                f"json/financial-statements/income-statement/quarter/{ticker}.json",
                f"json/financial-statements/balance-sheet-statement/quarter/{ticker}.json",
                f"json/financial-statements/income-statement-growth/quarter/{ticker}.json",
                f"json/financial-statements/balance-sheet-statement-growth/quarter/{ticker}.json",
                f"json/financial-statements/cash-flow-statement-growth/quarter/{ticker}.json",
                f"json/financial-statements/owner-earnings/quarter/{ticker}.json",
            ]

            # Helper function to load JSON data asynchronously
            async def load_json_from_file(path):
                async with aiofiles.open(path, 'r') as f:
                    content = await f.read()
                return orjson.loads(content)

            # Helper function to filter data based on keys and year
            async def filter_data(data, ignore_keys, year_threshold=2000):
                return [{k: v for k, v in item.items() if k not in ignore_keys} for item in data if int(item["date"][:4]) >= year_threshold]

            # Define keys to ignore
            ignore_keys = ["symbol", "reportedCurrency", "calendarYear", "fillingDate", "acceptedDate", "period", "cik", "link", "finalLink", "pbRatio", "ptbRatio"]

            # Load and filter data for each statement type
            ratios = await load_json_from_file(statements[0])
            ratios = await filter_data(ratios, ignore_keys)

            #Threshold of enough datapoints needed!
            if len(ratios) < 50:
                return

            key_metrics = await load_json_from_file(statements[1])
            key_metrics = await filter_data(key_metrics, ignore_keys)

            cashflow = await load_json_from_file(statements[2])
            cashflow = await filter_data(cashflow, ignore_keys)

            income = await load_json_from_file(statements[3])
            income = await filter_data(income, ignore_keys)

            balance = await load_json_from_file(statements[4])
            balance = await filter_data(balance, ignore_keys)

            income_growth = await load_json_from_file(statements[5])
            income_growth = await filter_data(income_growth, ignore_keys)

            balance_growth = await load_json_from_file(statements[6])
            balance_growth = await filter_data(balance_growth, ignore_keys)

            cashflow_growth = await load_json_from_file(statements[7])
            cashflow_growth = await filter_data(cashflow_growth, ignore_keys)

            owner_earnings = await load_json_from_file(statements[8])
            owner_earnings = await filter_data(owner_earnings, ignore_keys)

            # Combine all the data
            combined_data = defaultdict(dict)

            # Merge the data based on 'date'
            for entries in zip(ratios, key_metrics, income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
                for entry in entries:
                    date = entry['date']
                    for key, value in entry.items():
                        if key not in combined_data[date]:
                            combined_data[date][key] = value

            combined_data = list(combined_data.values())

            # Download historical stock data using yfinance
            df = yf.download(ticker, start=start_date, end=end_date, interval="1d").reset_index()
            df = df.rename(columns={'Adj Close': 'close', 'Date': 'date', 'Open': 'open', 'High': 'high', 'Low': 'low', 'Volume': 'volume'})
            df['date'] = df['date'].dt.strftime('%Y-%m-%d')

            # Get the list of columns in df
            df_columns = df.columns
            df_stats = generate_statistical_features(df)
            df_ta = generate_ta_features(df)

            # Filter columns in df_stats and df_ta that are not in df
            df_stats_filtered = df_stats.drop(columns=df_columns.intersection(df_stats.columns), errors='ignore')
            df_ta_filtered = df_ta.drop(columns=df_columns.intersection(df_ta.columns), errors='ignore')

            ta_columns = df_ta_filtered.columns.tolist()
            stats_columns = df_stats_filtered.columns.tolist()

            # Concatenate df with the filtered df_stats and df_ta
            df = pd.concat([df, df_ta_filtered, df_stats_filtered], axis=1)

            # Match each combined data entry with the closest available stock price in df
            for item in combined_data:
                target_date = item['date']
                counter = 0
                max_attempts = 10

                # Look for the closest matching date in the stock data
                while target_date not in df['date'].values and counter < max_attempts:
                    target_date = (pd.to_datetime(target_date) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
                    counter += 1

                # If max attempts are reached and no matching date is found, skip the entry
                if counter == max_attempts:
                    continue

                # Find the close price for the matching date
                close_price = round(df[df['date'] == target_date]['close'].values[0], 2)
                item['price'] = close_price

                # Dynamically add all indicator values to the combined_data entry
                for column in ta_columns:
                    column_value = df[df['date'] == target_date][column].values[0]
                    item[column] = column_value  # Add the column value to the combined_data entry
                for column in stats_columns:
                    column_value = df[df['date'] == target_date][column].values[0]
                    item[column] = column_value  # Add the column value to the combined_data entry

            # Sort the combined data by date
            combined_data = sorted(combined_data, key=lambda x: x['date'])

            # Convert combined data into a DataFrame
            df_combined = pd.DataFrame(combined_data).dropna()

            '''
            fundamental_columns = [
                'revenue',
                'costOfRevenue',
                'grossProfit',
                'netIncome',
                'operatingIncome',
                'operatingExpenses',
                'researchAndDevelopmentExpenses',
                'ebitda',
                'freeCashFlow',
                'incomeBeforeTax',
                'incomeTaxExpense',
                'debtRepayment',
                'dividendsPaid',
                'depreciationAndAmortization',
                'netCashUsedProvidedByFinancingActivities',
                'changeInWorkingCapital',
                'stockBasedCompensation',
                'deferredIncomeTax',
                'commonStockRepurchased',
                'operatingCashFlow',
                'capitalExpenditure',
                'accountsReceivables',
                'purchasesOfInvestments',
                'cashAndCashEquivalents',
                'shortTermInvestments',
                'cashAndShortTermInvestments',
                'longTermInvestments',
                'otherCurrentLiabilities',
                'totalCurrentLiabilities',
                'longTermDebt',
                'totalDebt',
                'netDebt',
                'commonStock',
                'totalEquity',
                'totalLiabilitiesAndStockholdersEquity',
                'totalStockholdersEquity',
                'totalInvestments',
                'taxAssets',
                'totalAssets',
                'inventory',
                'propertyPlantEquipmentNet',
                'ownersEarnings',
            ]

            # Compute ratios for all combinations of key elements
            new_columns = {}

            # Loop over combinations of column pairs
            for columns in [fundamental_columns]:
                for num, denom in combinations(columns, 2):
                    # Compute ratio and reverse ratio
                    ratio = df_combined[num] / df_combined[denom]
                    reverse_ratio = round(df_combined[denom] / df_combined[num], 2)

                    # Define column names for both ratios
                    column_name = f'{num}_to_{denom}'
                    reverse_column_name = f'{denom}_to_{num}'

                    # Store the new columns in the dictionary, replacing invalid values with 0
                    new_columns[column_name] = np.nan_to_num(ratio, nan=0, posinf=0, neginf=0)
                    new_columns[reverse_column_name] = np.nan_to_num(reverse_ratio, nan=0, posinf=0, neginf=0)

            # Add all new columns to the original DataFrame at once
            df_combined = pd.concat([df_combined, pd.DataFrame(new_columns)], axis=1)
            '''

            # To defragment the DataFrame, make a copy
            df_combined = df_combined.copy()
            df_combined = df_combined.dropna()
            df_combined = df_combined.where(~df_combined.isin([np.inf, -np.inf]), 0)

            # Create 'Target' column based on price change
            df_combined['Target'] = ((df_combined['price'].shift(-1) - df_combined['price']) / df_combined['price'] > 0).astype(int)

            df_copy = df_combined.copy()
            df_copy = df_copy.map(lambda x: round(x, 2) if isinstance(x, float) else x)

            if df_copy.shape[0] > 0:
                with open(file_path, 'wb') as file:
                    file.write(orjson.dumps(df_copy.to_dict(orient='records')))

            return df_copy

        except Exception as e:
            print(e)
            pass
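As a quick sanity check, top_uncorrelated_features can be exercised on a small frame before it is wired into training. A minimal sketch, where the indicator column names are hypothetical stand-ins for the engineered features:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'date': pd.date_range('2020-01-01', periods=120).strftime('%Y-%m-%d'),
    'price': 100 + rng.normal(0, 1, 120).cumsum(),
    'rsi': rng.uniform(0, 100, 120),
    'macd': rng.normal(0, 1, 120),
    'cmf': rng.normal(0, 0.2, 120),
    'Target': rng.integers(0, 2, 120),
})
# Returns up to top_n feature names, skipping features highly correlated with ones already picked.
print(top_uncorrelated_features(toy, top_n=2, threshold=0.75))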
async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
@ -327,8 +298,6 @@ async def chunked_gather(tickers, con, start_date, end_date, chunk_size=10):
    return results

async def warm_start_training(tickers, con):
    start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
    end_date = datetime.today().strftime("%Y-%m-%d")
@ -336,7 +305,7 @@ async def warm_start_training(tickers, con):
    df_test = pd.DataFrame()
    test_size = 0.2

    dfs = await chunked_gather(tickers, con, start_date, end_date, chunk_size=1)

    train_list = []
    test_list = []
@ -356,13 +325,14 @@ async def warm_start_training(tickers, con):
    # Concatenate all at once outside the loop
    df_train = pd.concat(train_list, ignore_index=True)
    df_test = pd.concat(test_list, ignore_index=True)

    df_train = df_train.sample(frac=1).reset_index(drop=True)
    df_test = df_test.sample(frac=1).reset_index(drop=True)

    print('======Warm Start Train Set Datapoints======')
    print(len(df_train))

    predictor = ScorePredictor()
    selected_features = [col for col in df_train if col not in ['price', 'date', 'Target']] #top_uncorrelated_features(df_train, top_n=200)
    predictor.warm_start_training(df_train[selected_features], df_train['Target'])
    predictor.evaluate_model(df_test[selected_features], df_test['Target'])
@ -370,7 +340,7 @@ async def warm_start_training(tickers, con):
async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
    try:
        df = await download_data(ticker, con, start_date, end_date)
        if df is None or len(df) == 0:
            print(f"No data available for {ticker}")
            return
@ -380,7 +350,7 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
        train_data = df.iloc[:split_size]
        test_data = df.iloc[split_size:]

        selected_features = top_uncorrelated_features(train_data, top_n=50) #[col for col in train_data if col not in ['price', 'date', 'Target']]

        # Fine-tune the model
        predictor = ScorePredictor()
        predictor.fine_tune_model(train_data[selected_features], train_data['Target'])
@ -402,25 +372,27 @@ async def fine_tune_and_evaluate(ticker, con, start_date, end_date):
    del predictor  # Explicitly delete the predictor to aid garbage collection

async def run():
    train_mode = True  # Set this to False for fine-tuning and evaluation
    con = sqlite3.connect('stocks.db')
    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")

    if train_mode:
        # Warm start training
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
        warm_start_symbols = [row[0] for row in cursor.fetchall()]
        print('Warm Start Training for:', warm_start_symbols)
        predictor = await warm_start_training(warm_start_symbols, con)
    else:
        # Fine-tuning and evaluation for all stocks
        cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
        stock_symbols = ['AWR'] #[row[0] for row in cursor.fetchall()]
        print(f"Total tickers for fine-tuning: {len(stock_symbols)}")

        start_date = datetime(1995, 1, 1).strftime("%Y-%m-%d")
        end_date = datetime.today().strftime("%Y-%m-%d")

        tasks = []
        for ticker in tqdm(stock_symbols):
            await fine_tune_and_evaluate(ticker, con, start_date, end_date)

View File

@ -1,4 +1,3 @@
from benzinga import financial_data
import requests
from datetime import datetime
import numpy as np
@ -11,12 +10,12 @@ from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
from collections import Counter
import aiohttp
import asyncio
load_dotenv()
api_key = os.getenv('BENZINGA_API_KEY')
fin = financial_data.Benzinga(api_key)
headers = {"accept": "application/json"}
@ -58,7 +57,7 @@ def calculate_rating(data):
        last_rating_date = datetime.strptime(last_rating, "%Y-%m-%d")
        difference = (datetime.now() - last_rating_date).days
    except:
        difference = 1000  # In case of None or invalid date

    if total_ratings == 0 or difference >= 600:
        return 0
@ -80,6 +79,7 @@ def calculate_rating(data):
    max_rating = 5
    normalized_rating = min(max(weighted_sum / (weight_return + weight_success_rate + weight_total_ratings + weight_difference), min_rating), max_rating)

    # Apply additional conditions based on total ratings and average return
    if normalized_rating >= 4:
        if total_ratings < 10:
            normalized_rating -= 2.4
@ -89,78 +89,18 @@
        normalized_rating -= 0.75
    elif total_ratings < 30:
        normalized_rating -= 1
    elif overall_average_return <= 10:
        normalized_rating -= 1.1
'''
if overall_average_return <= 0 and overall_average_return >= -5:
normalized_rating = min(normalized_rating - 2, 0)
elif overall_average_return < -5 and overall_average_return >= -10:
normalized_rating = min(normalized_rating - 3, 0)
else:
normalized_rating = min(normalized_rating - 4, 0)
'''
if overall_average_return <= 0:
normalized_rating = min(normalized_rating - 2, 0)
    if overall_average_return <= 0:
        normalized_rating = max(normalized_rating - 2, 0)

    # Cap the rating if the last rating is older than 30 days
    if difference > 30:
        normalized_rating = min(normalized_rating, 4.5)

    return round(normalized_rating, 2)
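The tail of calculate_rating shown in this hunk reduces to a small clamp. A minimal sketch of just those visible steps as a standalone helper; the name finalize_rating and the sample inputs are hypothetical, and the earlier weighting and total_ratings penalties sit outside this hunk and are not reproduced:

def finalize_rating(normalized_rating, overall_average_return, difference):
    # Penalize analysts whose average return is non-positive, flooring the score at 0.
    if overall_average_return <= 0:
        normalized_rating = max(normalized_rating - 2, 0)
    # Cap stale analysts (no rating in the last 30 days) at 4.5 stars.
    if difference > 30:
        normalized_rating = min(normalized_rating, 4.5)
    return round(normalized_rating, 2)

print(finalize_rating(4.8, 12.5, 45))   # -> 4.5 (capped for staleness)
print(finalize_rating(3.2, -1.0, 10))   # -> 1.2 (penalized for negative average return)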
def get_analyst_ratings(analyst_id):
url = "https://api.benzinga.com/api/v2.1/calendar/ratings"
res_list = []
for page in range(0,5):
try:
querystring = {"token":api_key,"parameters[analyst_id]": analyst_id, "page": str(page), "pagesize":"1000"}
response = requests.request("GET", url, headers=headers, params=querystring)
data = ujson.loads(response.text)['ratings']
res_list +=data
time.sleep(2)
except:
break
return res_list
def get_all_analyst_stats():
url = "https://api.benzinga.com/api/v2.1/calendar/ratings/analysts"
res_list = []
for _ in range(0,20): #Run the api N times because not all analyst are counted Bug from benzinga
for page in range(0,100):
try:
querystring = {"token":api_key,"page": f"{page}", 'pagesize': "1000"}
response = requests.request("GET", url, headers=headers, params=querystring)
data = ujson.loads(response.text)['analyst_ratings_analyst']
res_list+=data
except:
break
time.sleep(5)
res_list = remove_duplicates(res_list, 'id') # remove duplicates of analyst
res_list = [item for item in res_list if item.get('ratings_accuracy', {}).get('total_ratings', 0) != 0]
final_list = []
for item in res_list:
analyst_dict = {
'analystName': item['name_full'],
'companyName': item['firm_name'],
'analystId': item['id'],
'firmId': item['firm_id']
}
stats_dict = {
'avgReturn': item['ratings_accuracy'].get('overall_average_return', 0),
'successRate': item['ratings_accuracy'].get('overall_success_rate', 0),
'totalRatings': item['ratings_accuracy'].get('total_ratings', 0),
}
final_list.append({**analyst_dict,**stats_dict})
return final_list
def get_top_stocks():
    with open(f"json/analyst/all-analyst-data.json", 'r') as file:
        analyst_stats_list = ujson.load(file)
@ -217,24 +157,97 @@ def get_top_stocks():
        ujson.dump(result_sorted, file)
async def get_analyst_ratings(analyst_id, session):
url = "https://api.benzinga.com/api/v2.1/calendar/ratings"
res_list = []
for page in range(5):
try:
querystring = {
"token": api_key,
"parameters[analyst_id]": analyst_id,
"page": str(page),
"pagesize": "1000"
}
async with session.get(url, headers=headers, params=querystring) as response:
data = await response.json()
ratings = data.get('ratings', [])
if not ratings:
break # Stop fetching if no more ratings
res_list += ratings
except Exception as e:
#print(f"Error fetching page {page} for analyst {analyst_id}: {e}")
break
return res_list
async def get_all_analyst_stats():
url = "https://api.benzinga.com/api/v2.1/calendar/ratings/analysts"
res_list = []
async with aiohttp.ClientSession() as session:
tasks = [
session.get(url, headers=headers, params={"token": api_key, "page": str(page), 'pagesize': "1000"})
for page in range(100)
]
# Gather responses concurrently
responses = await asyncio.gather(*tasks)
# Process each response
for response in responses:
if response.status == 200: # Check for successful response
try:
data = ujson.loads(await response.text())['analyst_ratings_analyst']
res_list += data
except Exception as e:
pass
print(len(res_list))
# Remove duplicates of analysts and filter based on ratings accuracy
res_list = remove_duplicates(res_list, 'id')
res_list = [item for item in res_list if item.get('ratings_accuracy', {}).get('total_ratings', 0) != 0]
# Construct the final result list
final_list = [{
'analystName': item['name_full'],
'companyName': item['firm_name'],
'analystId': item['id'],
'firmId': item['firm_id'],
'avgReturn': item['ratings_accuracy'].get('overall_average_return', 0),
'successRate': item['ratings_accuracy'].get('overall_success_rate', 0),
'totalRatings': item['ratings_accuracy'].get('total_ratings', 0),
} for item in res_list]
return final_list
async def process_analyst(item, session):
data = await get_analyst_ratings(item['analystId'], session)
item['ratingsList'] = data
item['totalRatings'] = len(data) # True total ratings
item['lastRating'] = data[0]['date'] if data else None
item['numOfStocks'] = len({d['ticker'] for d in data})
# Stats dictionary for calculating score
stats_dict = {
'avgReturn': item.get('avgReturn', 0),
'successRate': item.get('successRate', 0),
'totalRatings': item['totalRatings'],
'lastRating': item['lastRating'],
}
item['analystScore'] = calculate_rating(stats_dict)
async def get_single_analyst_data(analyst_list):
async with aiohttp.ClientSession() as session:
tasks = [process_analyst(item, session) for item in analyst_list]
for task in tqdm(asyncio.as_completed(tasks), total=len(analyst_list)):
await task
async def run():
    #Step1 get all analyst id's and stats
    analyst_list = await get_all_analyst_stats()
    print('Number of analyst:', len(analyst_list))

    #Step2 get rating history for each individual analyst and score the analyst
    await get_single_analyst_data(analyst_list)
    try:
        con = sqlite3.connect('stocks.db')
@ -279,9 +292,8 @@ if __name__ == "__main__":
                'successRate': item['successRate'],
                'avgReturn': item['avgReturn'],
                'totalRatings': item['totalRatings'],
                'lastRating': item['lastRating']
            })
        with open(f"json/analyst/top-analysts.json", 'w') as file:
            ujson.dump(top_analysts_list, file)
@ -291,4 +303,8 @@ if __name__ == "__main__":
            ujson.dump(analyst_list, file)

        #Save top stocks with strong buys from 5 star analysts
        get_top_stocks()
if __name__ == "__main__":
asyncio.run(run())
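Because the collection step is now fully async, the individual pieces can be exercised in isolation. A minimal sketch, assuming BENZINGA_API_KEY is set in .env and this module's functions are importable, that fetches only the analyst stats:

import asyncio

# Hypothetical standalone check of the async stats fetcher defined above.
stats = asyncio.run(get_all_analyst_stats())
print(len(stats), stats[:1])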

View File

@ -220,12 +220,12 @@ def run(chunk,analyst_list):
    try:
        con = sqlite3.connect('stocks.db')
        stock_cursor = con.cursor()
        stock_cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE symbol NOT LIKE '%.%'")
        stock_symbols = [row[0] for row in stock_cursor.fetchall()]
        con.close()

        #Save all analyst data in raw form for the next step
        with open(f"json/analyst/all-analyst-data.json", 'r') as file:

View File

@ -1,7 +1,6 @@
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
@ -20,13 +19,9 @@ class ScorePredictor:
        self.scaler = MinMaxScaler()
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = XGBClassifier(
            n_estimators=200,
            max_depth=10,
            learning_rate=0.1,
colsample_bytree=0.8, # Added to randomly sample columns for each tree
subsample=0.8, # Added to randomly sample training data
reg_alpha=1, # L1 regularization to handle many features
reg_lambda=1, # L2 regularization to handle many features
            random_state=42,
            n_jobs=10
        )
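For orientation, a minimal standalone sketch of training and scoring a classifier configured like the one above, using the metrics imported in this file; the synthetic data, split, and scaling here are illustrative and do not reproduce ScorePredictor's own warm-start handling:

import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 20))                          # synthetic feature matrix
y = (X[:, 0] + rng.normal(size=500) > 0).astype(int)    # synthetic binary target

X_train, X_test, y_train, y_test = X[:400], X[400:], y[:400], y[400:]

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1, random_state=42, n_jobs=10)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(accuracy_score(y_test, preds), precision_score(y_test, preds))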

View File

@ -0,0 +1,204 @@
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import RobustScaler
from ta.momentum import *
from ta.trend import *
from ta.volatility import *
from ta.volume import *
def trend_intensity(close, window=20):
ma = close.rolling(window=window).mean()
std = close.rolling(window=window).std()
return ((close - ma) / std).abs().rolling(window=window).mean()
def calculate_fdi(high, low, close, window=30):
n1 = (np.log(high.rolling(window=window).max() - low.rolling(window=window).min()) -
np.log(close.rolling(window=window).max() - close.rolling(window=window).min())) / np.log(2)
return (2 - n1) * 100
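Both helpers operate on plain pandas Series, so they can be sanity-checked in isolation. A minimal sketch on synthetic OHLC data (the series here are made up):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
close = pd.Series(100 + rng.normal(0, 1, 300).cumsum())
high = close + rng.uniform(0.1, 1.0, 300)
low = close - rng.uniform(0.1, 1.0, 300)

tii = trend_intensity(close)            # 20-day rolling mean of the absolute z-score of close vs its 20-day MA
fdi = calculate_fdi(high, low, close)   # fractal dimension index proxy over a 30-day window
print(tii.dropna().tail(3))
print(fdi.dropna().tail(3))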
def generate_ta_features(df):
df_features = df.copy()
df_features['sma_50'] = df['close'].rolling(window=50).mean()
df_features['sma_200'] = df['close'].rolling(window=200).mean()
df_features['sma_crossover'] = ((df_features['sma_50'] > df_features['sma_200']) & (df_features['sma_50'].shift(1) <= df_features['sma_200'].shift(1))).astype(int)
df_features['ema_50'] = EMAIndicator(close=df['close'], window=50).ema_indicator()
df_features['ema_200'] = EMAIndicator(close=df['close'], window=200).ema_indicator()
df_features['ema_crossover'] = ((df_features['ema_50'] > df_features['ema_200']) & (df_features['ema_50'].shift(1) <= df_features['ema_200'].shift(1))).astype(int)
df_features['wma'] = WMAIndicator(df['close'], window = 30).wma()
ichimoku = IchimokuIndicator(high=df['high'], low=df['low'])
df_features['ichimoku_a'] = ichimoku.ichimoku_a()
df_features['ichimoku_b'] = ichimoku.ichimoku_b()
df_features['atr'] = AverageTrueRange(high=df['high'], low=df['low'], close=df['close']).average_true_range()
bb = BollingerBands(close=df['close'])
df_features['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / df['close']
df_features['macd'] = macd(df['close'])
df_features['macd_signal'] = macd_signal(df['close'])
df_features['macd_hist'] = 2*macd_diff(df['close'])
df_features['adx'] = adx(df['high'],df['low'],df['close'])
df_features["adx_pos"] = adx_pos(df['high'],df['low'],df['close'])
df_features["adx_neg"] = adx_neg(df['high'],df['low'],df['close'])
df_features['cci'] = CCIIndicator(high=df['high'], low=df['low'], close=df['close']).cci()
df_features['mfi'] = MFIIndicator(high=df['high'], low=df['low'], close=df['close'], volume=df['volume']).money_flow_index()
df_features['nvi'] = NegativeVolumeIndexIndicator(close=df['close'], volume=df['volume']).negative_volume_index()
df_features['obv'] = OnBalanceVolumeIndicator(close=df['close'], volume=df['volume']).on_balance_volume()
df_features['vpt'] = VolumePriceTrendIndicator(close=df['close'], volume=df['volume']).volume_price_trend()
df_features['rsi'] = rsi(df["close"], window=60)
df_features['rolling_rsi'] = df_features['rsi'].rolling(window=10).mean()
df_features['stoch_rsi'] = stochrsi_k(df['close'], window=60, smooth1=3, smooth2=3)
df_features['rolling_stoch_rsi'] = df_features['stoch_rsi'].rolling(window=10).mean()
df_features['adi'] = acc_dist_index(high=df['high'],low=df['low'],close=df['close'],volume=df['volume'])
df_features['cmf'] = chaikin_money_flow(high=df['high'],low=df['low'],close=df['close'],volume=df['volume'], window=20)
df_features['emv'] = ease_of_movement(high=df['high'],low=df['low'],volume=df['volume'], window=20)
df_features['fi'] = force_index(close=df['close'], volume=df['volume'], window= 13)
df_features['williams'] = WilliamsRIndicator(high=df['high'], low=df['low'], close=df['close']).williams_r()
df_features['kama'] = KAMAIndicator(close=df['close']).kama()
stoch = StochasticOscillator(high=df['high'], low=df['low'], close=df['close'], window=60, smooth_window=3)
df_features['stoch_k'] = stoch.stoch()
df_features['stoch_d'] = stoch.stoch_signal()
df_features['rocr'] = df['close'] / df['close'].shift(30) - 1 # Rate of Change Ratio (ROCR)
df_features['ppo'] = (df_features['ema_50'] - df_features['ema_200']) / df_features['ema_50'] * 100
df_features['vwap'] = (df['volume'] * (df['high'] + df['low'] + df['close']) / 3).cumsum() / df['volume'].cumsum()
df_features['volatility_ratio'] = df['close'].rolling(window=30).std() / df['close'].rolling(window=60).std()
df_features['fdi'] = calculate_fdi(df['high'], df['low'], df['close'])
df_features['tii'] = trend_intensity(df['close'])
df_features['fft'] = np.abs(np.fft.fft(df['close']))
don_channel = DonchianChannel(high=df['high'], low=df['low'],close=df['close'], window=60)
df_features['don_hband'] = don_channel.donchian_channel_hband()
df_features['don_lband'] = don_channel.donchian_channel_lband()
df_features['don_mband'] = don_channel.donchian_channel_mband()
df_features['don_pband'] = don_channel.donchian_channel_pband()
df_features['don_wband'] = don_channel.donchian_channel_wband()
aroon = AroonIndicator(high=df['high'], low=df['low'], window=60)
df_features['aroon_down'] = aroon.aroon_down()
df_features['aroon_indicator'] = aroon.aroon_indicator()
df_features['aroon_up'] = aroon.aroon_up()
df_features['ultimate_oscillator'] = UltimateOscillator(high=df['high'], low=df['low'], close=df['close']).ultimate_oscillator()
df_features['choppiness'] = 100 * np.log10((df['high'].rolling(window=60).max() - df['low'].rolling(window=30).min()) / df_features['atr']) / np.log10(14)
df_features['ulcer'] = UlcerIndex(df['close'],window=60).ulcer_index()
df_features['keltner_hband'] = keltner_channel_hband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
df_features['keltner_lband'] = keltner_channel_lband_indicator(high=df['high'],low=df['low'],close=df['close'],window=60)
df_features = df_features.dropna()
return df_features
def generate_statistical_features(df, windows=[20, 50], price_col='close',
high_col='high', low_col='low', volume_col='volume'):
"""
Generate comprehensive statistical features for financial time series data.
Parameters:
-----------
df : pandas.DataFrame
DataFrame containing the price and volume data
windows : list
List of rolling window sizes to use for feature generation
price_col : str
Name of the closing price column
high_col : str
Name of the high price column
low_col : str
Name of the low price column
volume_col : str
Name of the volume column
Returns:
--------
pandas.DataFrame
DataFrame with additional statistical features
"""
# Create a copy of the dataframe to avoid modifying the original
df_features = df.copy()
# Calculate features for each window size
for window in windows:
# Returns
df_features[f'returns_{window}'] = df[price_col].pct_change(periods=window)
# Log returns and statistics
log_returns = np.log(df[price_col]/df[price_col].shift(1))
df_features[f'log_returns_{window}'] = log_returns.rolling(window=window).mean()
df_features[f'log_returns_std_{window}'] = log_returns.rolling(window=window).std()
# Statistical moments
df_features[f'std_{window}'] = df[price_col].rolling(window=window).std()
df_features[f'var_{window}'] = df[price_col].rolling(window=window).var()
df_features[f'skew_{window}'] = df[price_col].rolling(window=window).skew()
df_features[f'kurt_{window}'] = df[price_col].rolling(window=window).kurt()
# Volatility measures
df_features[f'realized_vol_{window}'] = (
df_features[f'returns_{window}'].rolling(window=window).std() * np.sqrt(252))
df_features[f'range_vol_{window}'] = (
(df[high_col].rolling(window=window).max() -
df[low_col].rolling(window=window).min()) / df[price_col])
# Z-scores and normalized prices
df_features[f'zscore_{window}'] = (
(df[price_col] - df[price_col].rolling(window=window).mean()) /
df[price_col].rolling(window=window).std())
df_features[f'norm_price_{window}'] = (
df[price_col] / df[price_col].rolling(window=window).mean() - 1)
# Correlation features
if volume_col in df.columns:
df_features[f'volume_price_corr_{window}'] = (
df[price_col].rolling(window=window).corr(df[volume_col]))
df_features[f'high_low_corr_{window}'] = (
df[high_col].rolling(window=window).corr(df[low_col]))
# Quantile features
for q in [0.25, 0.75]:
df_features[f'price_q{int(q*100)}_{window}'] = (
df[price_col].rolling(window=window).quantile(q))
# Price dynamics
df_features['price_acceleration'] = df[price_col].diff().diff()
df_features['momentum_change'] = df[price_col].pct_change().diff()
# Advanced volatility
df_features['parkinson_vol'] = np.sqrt(
1/(4*np.log(2)) * (np.log(df[high_col]/df[low_col])**2))
# Efficiency ratio
df_features['price_efficiency'] = (
abs(df[price_col] - df[price_col].shift(20)) /
(df[high_col].rolling(20).max() - df[low_col].rolling(20).min())
)
# Deviation metrics
df_features['deviation_from_vwap'] = (
(df[price_col] - df[price_col].rolling(window=20).mean()) /
df[price_col].rolling(window=20).mean()
)
df_features['stock_return'] = df['close'].pct_change()
df_features = df_features.dropna()
return df_features
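The two generators are designed to be applied to the same OHLCV frame, which is how the training script consumes them. A minimal usage sketch, assuming a yfinance version that returns flat single-ticker columns (ticker and dates are arbitrary examples):

import yfinance as yf
import pandas as pd

df = yf.download("AAPL", start="2015-01-01", end="2020-01-01", interval="1d").reset_index()
df = df.rename(columns={'Adj Close': 'close', 'Date': 'date', 'Open': 'open', 'High': 'high', 'Low': 'low', 'Volume': 'volume'})

df_ta = generate_ta_features(df)
df_stats = generate_statistical_features(df, windows=[20, 50])
print(df_ta.shape, df_stats.shape)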