diff --git a/app/cron_ai_score.py b/app/cron_ai_score.py
index bbeaeae..563a355 100644
--- a/app/cron_ai_score.py
+++ b/app/cron_ai_score.py
@@ -119,6 +119,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
         #Threshold of enough datapoints needed!
         if len(ratios) < 50:
+            print('Not enough data points')
             return
 
 
@@ -128,10 +129,13 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
 
         # Merge the data based on 'date'
        for entries in zip(ratios,key_metrics,income, balance, cashflow, owner_earnings, income_growth, balance_growth, cashflow_growth):
            for entry in entries:
-                date = entry['date']
-                for key, value in entry.items():
-                    if key not in combined_data[date]:
-                        combined_data[date][key] = value
+                try:
+                    date = entry['date']
+                    for key, value in entry.items():
+                        if key not in combined_data[date]:
+                            combined_data[date][key] = value
+                except:
+                    pass
 
         combined_data = list(combined_data.values())
 
@@ -193,13 +197,8 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         fundamental_columns = [
             'revenue', 'costOfRevenue', 'grossProfit', 'netIncome', 'operatingIncome', 'operatingExpenses',
             'researchAndDevelopmentExpenses', 'ebitda', 'freeCashFlow', 'incomeBeforeTax', 'incomeTaxExpense',
-            'debtRepayment', 'dividendsPaid', 'depreciationAndAmortization', 'netCashUsedProvidedByFinancingActivities',
-            'changeInWorkingCapital', 'stockBasedCompensation', 'deferredIncomeTax', 'commonStockRepurchased',
-            'operatingCashFlow', 'capitalExpenditure', 'accountsReceivables', 'purchasesOfInvestments',
-            'cashAndCashEquivalents', 'shortTermInvestments', 'cashAndShortTermInvestments', 'longTermInvestments',
-            'otherCurrentLiabilities', 'totalCurrentLiabilities', 'longTermDebt', 'totalDebt', 'netDebt', 'commonStock',
-            'totalEquity', 'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments',
-            'taxAssets', 'totalAssets', 'inventory', 'propertyPlantEquipmentNet', 'ownersEarnings',
+            'operatingCashFlow','cashAndCashEquivalents', 'totalEquity','otherCurrentLiabilities', 'totalCurrentLiabilities', 'totalDebt',
+            'totalLiabilitiesAndStockholdersEquity', 'totalStockholdersEquity', 'totalInvestments','totalAssets',
         ]
 
         # Function to compute combinations within a group
@@ -226,7 +225,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         # Compute combinations for each group of columns
         compute_column_ratios(fundamental_columns, df_combined, new_columns)
         compute_column_ratios(stats_columns, df_combined, new_columns)
-        compute_column_ratios(ta_columns, df_combined, new_columns)
+        #compute_column_ratios(ta_columns, df_combined, new_columns)
 
         # Concatenate the new ratio columns with the original DataFrame
         df_combined = pd.concat([df_combined, pd.DataFrame(new_columns, index=df_combined.index)], axis=1)
@@ -244,7 +243,7 @@ async def download_data(ticker, con, start_date, end_date, skip_downloading):
         if not df_copy.empty:
             with open(file_path, 'wb') as file:
                 file.write(orjson.dumps(df_copy.to_dict(orient='records')))
-
+        print(df_copy)
         return df_copy
 
     except Exception as e:
@@ -270,7 +269,7 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
         tasks = [download_data(ticker, con, start_date, end_date, skip_downloading) for ticker in chunk]
         # Await the results for the current chunk
         chunk_results = await asyncio.gather(*tasks)
-
+
         train_list = []
 
         for ticker, df in zip(chunk, chunk_results):
@@ -324,23 +323,22 @@ async def chunked_gather(tickers, con, skip_downloading, chunk_size):
                data = predictor.evaluate_model(test_data[selected_features], test_data['Target'])
 
                # Check if the evaluation data meets the criteria
-                '''
+
                if (data['precision'] >= 50 and data['accuracy'] >= 50
                        and data['accuracy'] < 100 and data['precision'] < 100
                        and data['f1_score'] >= 50 and data['recall_score'] >= 50
                        and data['roc_auc_score'] >= 50):
-                '''
-                # Save the evaluation data to a JSON file
-                await save_json(ticker, data)
-                print(f"Saved results for {ticker}")
+                    # Save the evaluation data to a JSON file
+                    await save_json(ticker, data)
+                    print(f"Saved results for {ticker}")
 
            except Exception as e:
                print(e)
                pass
-
+
 async def warm_start_training(tickers, con, skip_downloading):
-    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=220)
+    dfs = await chunked_gather(tickers, con, skip_downloading, chunk_size=100)
 
 
 async def run():
@@ -358,9 +356,8 @@ async def run():
        WHERE marketCap >= 500E6
        AND symbol NOT LIKE '%.%'
        AND symbol NOT LIKE '%-%'
-       ORDER BY marketCap DESC;
     """)
-    warm_start_symbols = ['A'] #[row[0] for row in cursor.fetchall()]
+    warm_start_symbols = ['AAPL'] #[row[0] for row in cursor.fetchall()]
     print(f'Warm Start Training: Total Tickers {len(warm_start_symbols)}')
     await warm_start_training(warm_start_symbols, con, skip_downloading)
 
diff --git a/app/ml_models/__pycache__/score_model.cpython-310.pyc b/app/ml_models/__pycache__/score_model.cpython-310.pyc
index bfb68ad..9846f13 100644
Binary files a/app/ml_models/__pycache__/score_model.cpython-310.pyc and b/app/ml_models/__pycache__/score_model.cpython-310.pyc differ
diff --git a/app/ml_models/score_model.py b/app/ml_models/score_model.py
index e0bc768..dc45468 100644
--- a/app/ml_models/score_model.py
+++ b/app/ml_models/score_model.py
@@ -23,13 +23,13 @@ class ScorePredictor:
         self.pca = PCA(n_components=0.95) # Retain components explaining 95% variance
         self.warm_start_model_path = 'ml_models/weights/ai-score/warm_start_weights.pkl'
         self.model = lgb.LGBMClassifier(
-            n_estimators=20_000, # Number of boosting iterations - good balance between performance and training time
+            n_estimators=200, # Number of boosting iterations - good balance between performance and training time
             learning_rate=0.005, # Smaller learning rate for better generalization
-            max_depth=12, # Controlled depth to prevent overfitting
-            num_leaves=2**12, # 2^max_depth, prevents overfitting while maintaining model complexity
+            max_depth=5, # Controlled depth to prevent overfitting
+            num_leaves=2**5-1, # 2^max_depth, prevents overfitting while maintaining model complexity
             colsample_bytree=0.8, # Use 80% of features per tree to reduce overfitting
             subsample=0.8, # Use 80% of data per tree to reduce overfitting
-            min_child_samples=20, # Minimum samples per leaf to ensure reliable splits
+            min_child_samples=5, # Minimum samples per leaf to ensure reliable splits
             random_state=42, # For reproducibility
             reg_alpha=0.1, # L1 regularization
             reg_lambda=0.1, # L2 regularization
diff --git a/app/utils/__pycache__/feature_engineering.cpython-310.pyc b/app/utils/__pycache__/feature_engineering.cpython-310.pyc
index 842219b..1929f3d 100644
Binary files a/app/utils/__pycache__/feature_engineering.cpython-310.pyc and b/app/utils/__pycache__/feature_engineering.cpython-310.pyc differ
diff --git a/app/utils/feature_engineering.py b/app/utils/feature_engineering.py
index 46f72ca..aa38e2a 100644
--- a/app/utils/feature_engineering.py
+++ b/app/utils/feature_engineering.py
@@ -103,7 +103,7 @@ def generate_ta_features(df):
     df_features = df_features.dropna()
     return df_features
 
-def generate_statistical_features(df, windows=[20, 50], price_col='close',
+def generate_statistical_features(df, windows=[50,200], price_col='close',
                                   high_col='high', low_col='low', volume_col='volume'):
     """
     Generate comprehensive statistical features for financial time series data.
@@ -160,23 +160,7 @@ def generate_statistical_features(df, windows=[20, 50], price_col='close',
         df_features[f'zscore_{window}'] = (
             (df[price_col] - df[price_col].rolling(window=window).mean()) /
             df[price_col].rolling(window=window).std())
-        df_features[f'norm_price_{window}'] = (
-            df[price_col] / df[price_col].rolling(window=window).mean() - 1)
-
-        # Correlation features
-        if volume_col in df.columns:
-            df_features[f'volume_price_corr_{window}'] = (
-                df[price_col].rolling(window=window).corr(df[volume_col]))
-        df_features[f'high_low_corr_{window}'] = (
-            df[high_col].rolling(window=window).corr(df[low_col]))
-
-
-
-        # Quantile features
-        for q in [0.25, 0.75]:
-            df_features[f'price_q{int(q*100)}_{window}'] = (
-                df[price_col].rolling(window=window).quantile(q))
 
     # Price dynamics
     df_features['price_acceleration'] = df[price_col].diff().diff()
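
Note on the score_model.py hunk: the patch shrinks the LightGBM classifier from 20,000 boosting rounds at max_depth=12 (with num_leaves=2**12) to 200 rounds at max_depth=5 with num_leaves=2**5-1, keeping the leaf count just under 2**max_depth so the depth limit stays the binding constraint on tree size. Below is a minimal standalone sketch of that configuration; it is not part of the patch, and the random feature matrix and target are placeholders for the engineered fundamental/statistical features the cron job actually builds.

    import lightgbm as lgb
    import numpy as np

    # Placeholder data standing in for the engineered feature matrix and binary target.
    rng = np.random.default_rng(42)
    X = rng.random((500, 20))
    y = (rng.random(500) > 0.5).astype(int)

    # The reduced configuration from the patch: shallow trees, few leaves, small leaf size.
    model = lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.005,
        max_depth=5,
        num_leaves=2**5 - 1,   # 31 leaves, i.e. 2**max_depth - 1
        colsample_bytree=0.8,
        subsample=0.8,
        min_child_samples=5,
        random_state=42,
        reg_alpha=0.1,
        reg_lambda=0.1,
    )
    model.fit(X, y)
    probabilities = model.predict_proba(X)[:, 1]  # class-1 probability used as a score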