reduce features
This commit is contained in:
parent
8521a4a404
commit
60cd644afa
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
+        # Threshold of enough data points needed
         if len(ratios) < 50:
            print('Not enough data points')
            return
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
        # Merge the data based on 'date'
        for entries in zip(ratios, key_metrics, income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
            for entry in entries:
-               date = entry['date']
-               for key, value in entry.items():
-                   if key not in combined_data[date]:
-                       combined_data[date][key] = value
+               try:
+                   date = entry['date']
+                   for key, value in entry.items():
+                       if key not in combined_data[date]:
+                           combined_data[date][key] = value
+               except:
+                   pass
 
        combined_data = list(combined_data.values())
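For context, a minimal sketch of the merge pattern this hunk wraps in a try/except, assuming combined_data is a defaultdict(dict) keyed by date as the subscripting implies; the two sample record lists are invented stand-ins for ratios, key_metrics, and the other sources:

from collections import defaultdict

# Hypothetical sample records standing in for ratios, key_metrics, etc.
ratios = [{'date': '2024-01-01', 'peRatio': 25.0}]
key_metrics = [{'date': '2024-01-01', 'marketCap': 1_000_000}]

combined_data = defaultdict(dict)

for entries in zip(ratios, key_metrics):
    for entry in entries:
        try:
            date = entry['date']
            for key, value in entry.items():
                # First writer wins: keep the value from the earliest source list
                if key not in combined_data[date]:
                    combined_data[date][key] = value
        except (KeyError, TypeError):
            # Skip malformed entries (e.g., missing 'date'), as the bare except above does
            pass

combined_data = list(combined_data.values())
print(combined_data)  # [{'date': '2024-01-01', 'peRatio': 25.0, 'marketCap': 1000000}]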
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
        fundamental_columns = [
-           'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
-           'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-           'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-           'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-           'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-           'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-           'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-           'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-           'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+           'operatingCashFlow', 'cashAndCashEquivalents', 'totalEquity', 'otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+           'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments', 'totalAssets',
        ]
 
        # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
        # Compute combinations for each group of columns
        compute_column_ratios(fundamental_columns, df_combined, new_columns)
        compute_column_ratios(stats_columns, df_combined, new_columns)
-       compute_column_ratios(ta_columns, df_combined, new_columns)
+       #compute_column_ratios(ta_columns, df_combined, new_columns)
 
        # Concatenate the new ratio columns with the original DataFrame
        df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
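The diff never shows the body of compute_column_ratios, only its call sites, so the helper below is an assumption: a plausible pairwise-ratio implementation consistent with "compute combinations within a group" and with the new_columns dict being concatenated afterwards:

from itertools import combinations

import numpy as np
import pandas as pd

def compute_column_ratios(columns, df, new_columns):
    # Assumed behavior: for every pair of columns in the group, store their
    # ratio in new_columns so all ratios can be concatenated in one pass.
    for col_a, col_b in combinations(columns, 2):
        if col_a in df.columns and col_b in df.columns:
            # Replace divide-by-zero results with NaN instead of inf
            new_columns[f'{col_a}_to_{col_b}'] = (
                df[col_a] / df[col_b].replace(0, np.nan))

df_combined = pd.DataFrame({'totalDebt': [10.0, 20.0], 'totalAssets': [100.0, 250.0]})
new_columns = {}
compute_column_ratios(['totalDebt', 'totalAssets'], df_combined, new_columns)
df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
print(df_combined)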
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
        if not df_copy.empty:
            with open(file_path, 'wb') as file:
                file.write(orjson.dumps(df_copy.to_dict(orient='records')))
 
-           print(df_copy)
+           return df_copy
 
    except Exception as e:
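As a side note, the save path here relies on orjson.dumps returning bytes rather than str. A minimal self-contained sketch (the file name and sample frame are invented for illustration):

import orjson
import pandas as pd

df_copy = pd.DataFrame({'date': ['2024-01-01'], 'close': [182.5]})
file_path = 'example_records.json'  # hypothetical path

if not df_copy.empty:
    # orjson.dumps returns bytes, so the file must be opened in binary mode
    with open(file_path, 'wb') as file:
        file.write(orjson.dumps(df_copy.to_dict(orient='records')))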
@@ -270,7 +269,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
        tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
        # Await the results for the current chunk
        chunk_results = await asyncio.gather(*tasks)
 
-
+       train_list = []
 
        for ticker, df in zip(chunk, chunk_results):
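For reference, the batching pattern chunked_gather relies on, as a standalone sketch; the fetch coroutine is a stand-in for download_data, and the ticker list is invented:

import asyncio

async def fetch(ticker):
    # Stand-in for download_data(ticker, con, start_date, end_date, skip_downloading)
    await asyncio.sleep(0)
    return f'{ticker}-data'

async def chunked_gather(tickers, chunk_size):
    results = []
    # Process tickers in fixed-size batches to cap concurrent requests
    for i in range(0, len(tickers), chunk_size):
        chunk = tickers[i:i + chunk_size]
        tasks = [fetch(ticker) for ticker in chunk]
        results.extend(await asyncio.gather(*tasks))
    return results

print(asyncio.run(chunked_gather(['AAPL', 'MSFT', 'NVDA'], chunk_size=2)))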
@@ -324,23 +323,22 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
            data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
            # Check if the evaluation data meets the criteria
-           '''
-           if (data['precision'] >= 50 and data['accuracy'] >= 50 and
-               data['accuracy'] < 100 and data['precision'] < 100 and
-               data['f1_score'] >= 50 and data['recall_score'] >= 50 and
-               data['roc_auc_score'] >= 50):
-           '''
-               # Save the evaluation data to a JSON file
-               await save_json(ticker, data)
-               print(f"Saved results for {ticker}")
+           # Save the evaluation data to a JSON file
+           await save_json(ticker, data)
+           print(f"Saved results for {ticker}")
        except Exception as e:
            print(e)
            pass
 
 
 async def warm_start_training(tickers, con, skip_downloading):
 
-   dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+   dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
    WHERE marketCap >= 500E6
    AND symbol NOT LIKE '%.%'
    AND symbol NOT LIKE '%-%'
    ORDER BY marketCap DESC;
    """)
-   warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
-
+   warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
    print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
    await warm_start_training(warm_start_symbols, con, skip_downloading)
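The commented-out list comprehension suggests the symbols normally come from the screener query above. A minimal sketch of that path, assuming a sqlite3-style cursor; the in-memory database, table, and rows here are invented:

import sqlite3

con = sqlite3.connect(':memory:')  # hypothetical stand-in for the real database
con.execute('CREATE TABLE stocks (symbol TEXT, marketCap REAL)')
con.execute("INSERT INTO stocks VALUES ('AAPL', 3.0E12), ('BRK.B', 9.0E11), ('TINY', 1.0E6)")

cursor = con.cursor()
cursor.execute("""
    SELECT DISTINCT symbol FROM stocks
    WHERE marketCap >= 500E6
    AND symbol NOT LIKE '%.%'
    AND symbol NOT LIKE '%-%'
    ORDER BY marketCap DESC;
""")
warm_start_symbols = [row[0] for row in cursor.fetchall()]
print(warm_start_symbols)  # ['AAPL'] — dotted and sub-$500M symbols are screened out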
Binary file not shown.
@@ -23,13 +23,13 @@ class ScorePredictor:
        self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
        self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
        self.model = lgb.LGBMClassifier(
-           n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
+           n_estimators=200, # Number of boosting iterations - good balance between performance and training time
            learning_rate=0.005, # Smaller learning rate for better generalization
-           max_depth=12, # Controlled depth to prevent overfitting
-           num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
+           max_depth=5, # Controlled depth to prevent overfitting
+           num_leaves=2**5-1, # Just under 2^max_depth, limits complexity to prevent overfitting
            colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
            subsample=0.8, # Use 80% of data per tree to reduce overfitting
-           min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
+           min_child_samples=5, # Minimum samples per leaf to ensure reliable splits
            random_state=42, # For reproducibility
            reg_alpha=0.1, # L1 regularization
            reg_lambda=0.1, # L2 regularization
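A minimal runnable sketch of the reduced configuration above; only the hyperparameters come from the diff, while the toy dataset stands in for the real feature matrix:

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 10))                 # toy feature matrix
y = (X[:, 0] + rng.normal(size=500) > 0) * 1   # toy binary target

model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.005,
    max_depth=5,
    num_leaves=2**5 - 1,   # 31 leaves: just under 2^max_depth, so depth stays the binding limit
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_samples=5,
    random_state=42,
    reg_alpha=0.1,
    reg_lambda=0.1,
)
model.fit(X, y)
print(model.predict_proba(X[:3]))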
Binary file not shown.
@@ -103,7 +103,7 @@ def generate_ta_features(df):
    df_features = df_features.dropna()
    return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50,200], price_col='close',
                                  high_col='high', low_col='low', volume_col='volume'):
    """
    Generate comprehensive statistical features for financial time series data.
@@ -160,23 +160,7 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
        df_features[f'zscore_{window}'] = (
            (df[price_col] - df[price_col].rolling(window=window).mean()) /
            df[price_col].rolling(window=window).std())
-       df_features[f'norm_price_{window}'] = (
-           df[price_col] / df[price_col].rolling(window=window).mean() - 1)
-
-       # Correlation features
-       if volume_col in df.columns:
-           df_features[f'volume_price_corr_{window}'] = (
-               df[price_col].rolling(window=window).corr(df[volume_col]))
-       df_features[f'high_low_corr_{window}'] = (
-           df[high_col].rolling(window=window).corr(df[low_col]))
-
-
-       # Quantile features
-       for q in [0.25, 0.75]:
-           df_features[f'price_q{int(q*100)}_{window}'] = (
-               df[price_col].rolling(window=window).quantile(q))
 
    # Price dynamics
    df_features['price_acceleration'] = df[price_col].diff().diff()
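For context on the features that survive the cut, a small self-contained sketch of the rolling z-score and price-acceleration computations; the price series is synthetic, and the window matches the smaller of the new default windows [50, 200]:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'close': 100 + rng.normal(size=300).cumsum()})

window = 50
rolling_mean = df['close'].rolling(window=window).mean()
rolling_std = df['close'].rolling(window=window).std()

features = pd.DataFrame(index=df.index)
# How many rolling standard deviations the price sits from its rolling mean
features[f'zscore_{window}'] = (df['close'] - rolling_mean) / rolling_std
# Second difference of price: the change in the day-to-day change
features['price_acceleration'] = df['close'].diff().diff()

print(features.dropna().head())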