reduce features
This commit is contained in:
parent 8521a4a404
commit 60cd644afa
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
         #Threshold of enough datapoints needed!
         if len(ratios) < 50:
+            print('Not enough data points')
             return
 
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Merge the data based on 'date'
         for entries in zip(ratios,key_metrics,income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
             for entry in entries:
-                date = entry['date']
-                for key, value in entry.items():
-                    if key not in combined_data[date]:
-                        combined_data[date][key] = value
+                try:
+                    date = entry['date']
+                    for key, value in entry.items():
+                        if key not in combined_data[date]:
+                            combined_data[date][key] = value
+                except:
+                    pass
 
         combined_data = list(combined_data.values())
 
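A self-contained sketch of the date-keyed merge this hunk wraps in try/except, assuming combined_data behaves like a defaultdict(dict); the sample lists below are invented stand-ins for the real ratios/key_metrics responses:

from collections import defaultdict

# Invented sample data standing in for the downloaded statement lists.
ratios = [{'date': '2024-03-31', 'currentRatio': 1.2}]
key_metrics = [{'date': '2024-03-31', 'marketCap': 3_000_000_000}]

combined_data = defaultdict(dict)
for entries in zip(ratios, key_metrics):
    for entry in entries:
        try:
            date = entry['date']
            for key, value in entry.items():
                if key not in combined_data[date]:
                    combined_data[date][key] = value
        except (KeyError, TypeError):
            pass  # skip malformed entries instead of aborting the ticker

combined_data = list(combined_data.values())
print(combined_data)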
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         fundamental_columns = [
             'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
             'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-            'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-            'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-            'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-            'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-            'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-            'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-            'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+            'operatingCashFlow','cashAndCashEquivalents', 'totalEquity','otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+            'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets',
         ]
 
         # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Compute combinations for each group of columns
         compute_column_ratios(fundamental_columns, df_combined, new_columns)
         compute_column_ratios(stats_columns, df_combined, new_columns)
-        compute_column_ratios(ta_columns, df_combined, new_columns)
+        #compute_column_ratios(ta_columns, df_combined, new_columns)
 
         # Concatenate the new ratio columns with the original DataFrame
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
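compute_column_ratios is not defined in this diff; the snippet below is a hypothetical sketch of a helper with the same calling convention (column list, source DataFrame, dict collecting the new columns), not the repository's implementation:

from itertools import combinations
import numpy as np
import pandas as pd

def compute_column_ratios(columns, df, new_columns):
    # Hypothetical: one ratio feature per column pair, guarding against division by zero.
    for col_a, col_b in combinations(columns, 2):
        if col_a in df.columns and col_b in df.columns:
            new_columns[f'{col_a}_to_{col_b}'] = df[col_a] / df[col_b].replace(0, np.nan)

df_combined = pd.DataFrame({'revenue': [10.0, 12.0], 'netIncome': [2.0, 3.0]})
new_columns = {}
compute_column_ratios(['revenue', 'netIncome'], df_combined, new_columns)
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
print(df_combined)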
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         if not df_copy.empty:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
-
+        print(df_copy)
         return df_copy
 
     except Exception as e:
@@ -324,15 +323,14 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
             data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
             # Check if the evaluation data meets the criteria
-            '''
             if (data['precision'] >= 50 and data['accuracy'] >= 50 and
                 data['accuracy'] < 100 and data['precision'] < 100 and
                 data['f1_score'] >= 50 and data['recall_score'] >= 50 and
                 data['roc_auc_score'] >= 50):
-            '''
-            # Save the evaluation data to a JSON file
-            await save_json(ticker, data)
-            print(f"Saved results for {ticker}")
+                # Save the evaluation data to a JSON file
+                await save_json(ticker, data)
+                print(f"Saved results for {ticker}")
 
         except Exception as e:
             print(e)
             pass
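With the triple-quote markers removed, the metric gate is active again; a toy illustration of the same check against an invented metrics dict:

# Invented values for illustration only.
data = {'precision': 62.0, 'accuracy': 58.0, 'f1_score': 55.0,
        'recall_score': 51.0, 'roc_auc_score': 60.0}

if (data['precision'] >= 50 and data['accuracy'] >= 50 and
        data['accuracy'] < 100 and data['precision'] < 100 and
        data['f1_score'] >= 50 and data['recall_score'] >= 50 and
        data['roc_auc_score'] >= 50):
    print('Metrics pass the gate; results would be saved.')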
@@ -340,7 +338,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
 
 async def warm_start_training(tickers, con, skip_downloading):
 
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
         WHERE marketCap >= 500E6
         AND symbol NOT LIKE '%.%'
         AND symbol NOT LIKE '%-%'
-        ORDER BY marketCap DESC;
     """)
-    warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
+    warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
 
     print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
     await warm_start_training(warm_start_symbols, con, skip_downloading)
Binary file not shown.
@@ -23,13 +23,13 @@ class ScorePredictor:
         self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
         self.model = lgb.LGBMClassifier(
-            n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
+            n_estimators=200, # Number of boosting iterations - good balance between performance and training time
             learning_rate=0.005, # Smaller learning rate for better generalization
-            max_depth=12, # Controlled depth to prevent overfitting
-            num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
+            max_depth=5, # Controlled depth to prevent overfitting
+            num_leaves=2**5-1, # 2^max_depth, prevents overfitting while maintaining model complexity
             colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
             subsample=0.8, # Use 80% of data per tree to reduce overfitting
-            min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
+            min_child_samples=5, # Minimum samples per leaf to ensure reliable splits
             random_state=42, # For reproducibility
             reg_alpha=0.1, # L1 regularization
             reg_lambda=0.1, # L2 regularization
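The reduced configuration can be sanity-checked in isolation; the synthetic data and fit call below are illustrative only, while the hyperparameters mirror the diff (num_leaves=2**5-1 keeps the leaf count at the usual 2^max_depth - 1 cap for max_depth=5):

import lightgbm as lgb
import numpy as np

# Synthetic data for a quick smoke test of the smaller configuration.
rng = np.random.default_rng(42)
X = rng.random((500, 10))
y = (X[:, 0] > 0.5).astype(int)

model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.005,
    max_depth=5,
    num_leaves=2**5 - 1,      # 31 leaves, consistent with max_depth=5
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_samples=5,
    random_state=42,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
model.fit(X, y)
print(model.score(X, y))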
Binary file not shown.
@@ -103,7 +103,7 @@ def generate_ta_features(df):
     df_features = df_features.dropna()
     return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50,200], price_col='close',
                                   high_col='high', low_col='low', volume_col='volume'):
     """
     Generate comprehensive statistical features for financial time series data.
||||||
@ -160,24 +160,8 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
|
|||||||
df_features[f'zscore_{window}'] = (
|
df_features[f'zscore_{window}'] = (
|
||||||
(df[price_col] - df[price_col].rolling(window=window).mean()) /
|
(df[price_col] - df[price_col].rolling(window=window).mean()) /
|
||||||
df[price_col].rolling(window=window).std())
|
df[price_col].rolling(window=window).std())
|
||||||
df_features[f'norm_price_{window}'] = (
|
|
||||||
df[price_col] / df[price_col].rolling(window=window).mean() - 1)
|
|
||||||
|
|
||||||
|
|
||||||
# Correlation features
|
|
||||||
if volume_col in df.columns:
|
|
||||||
df_features[f'volume_price_corr_{window}'] = (
|
|
||||||
df[price_col].rolling(window=window).corr(df[volume_col]))
|
|
||||||
df_features[f'high_low_corr_{window}'] = (
|
|
||||||
df[high_col].rolling(window=window).corr(df[low_col]))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Quantile features
|
|
||||||
for q in [0.25, 0.75]:
|
|
||||||
df_features[f'price_q{int(q*100)}_{window}'] = (
|
|
||||||
df[price_col].rolling(window=window).quantile(q))
|
|
||||||
|
|
||||||
# Price dynamics
|
# Price dynamics
|
||||||
df_features['price_acceleration'] = df[price_col].diff().diff()
|
df_features['price_acceleration'] = df[price_col].diff().diff()
|
||||||
df_features['momentum_change'] = df[price_col].pct_change().diff()
|
df_features['momentum_change'] = df[price_col].pct_change().diff()
|
||||||
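After the removals, only the rolling z-score survives inside the window loop (plus the price-dynamics features below it); a minimal reproduction with a synthetic price series, using the new default windows=[50,200]:

import numpy as np
import pandas as pd

# Synthetic close prices standing in for the real OHLCV frame.
df = pd.DataFrame({'close': np.cumsum(np.random.randn(300)) + 100})
df_features = pd.DataFrame(index=df.index)

for window in [50, 200]:  # new default windows=[50,200]
    rolling_mean = df['close'].rolling(window=window).mean()
    rolling_std = df['close'].rolling(window=window).std()
    df_features[f'zscore_{window}'] = (df['close'] - rolling_mean) / rolling_std

# Price dynamics computed outside the window loop, unchanged by this commit
df_features['price_acceleration'] = df['close'].diff().diff()
df_features['momentum_change'] = df['close'].pct_change().diff()
print(df_features.dropna().head())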