# backend/app/cron_reddit_statistics.py

import json
import os
import re
import sqlite3
from collections import defaultdict
from datetime import datetime, timedelta, timezone

import nltk
import praw
import requests
from dotenv import load_dotenv
from nltk.sentiment import SentimentIntensityAnalyzer
# Download required NLTK data
nltk.download('vader_lexicon', quiet=True)
# Initialize the NLTK sentiment analyzer
sia = SentimentIntensityAnalyzer()
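# VADER's compound score is normalized to [-1, 1]; per-ticker averages of it
# are computed later in compute_trending_tickers.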
# Load the symbol universe from the stock and ETF databases; the sets are
# used below for fast membership checks when filtering trending tickers.
con = sqlite3.connect('stocks.db')
cursor = con.cursor()
cursor.execute("PRAGMA journal_mode = wal")
cursor.execute("SELECT DISTINCT symbol FROM stocks")
stock_symbols = {row[0] for row in cursor.fetchall()}
con.close()

etf_con = sqlite3.connect('etf.db')
etf_cursor = etf_con.cursor()
etf_cursor.execute("PRAGMA journal_mode = wal")
etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
etf_symbols = {row[0] for row in etf_cursor.fetchall()}
etf_con.close()

total_symbols = stock_symbols | etf_symbols
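# Both connections set journal_mode=WAL above, so other processes (e.g. a
# writer job) can update the databases while they are being read here.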
load_dotenv()
client_key = os.getenv('REDDIT_API_KEY')
client_secret = os.getenv('REDDIT_API_SECRET')
user_agent = os.getenv('REDDIT_USER_AGENT')
# Initialize Reddit instance
reddit = praw.Reddit(
    client_id=client_key,
    client_secret=client_secret,
    user_agent=user_agent
)
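# NOTE: the Reddit client is not referenced below; data.json is assumed to be
# produced by a separate collector job that does use it.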
# Save data as pretty-printed UTF-8 JSON under the wallstreetbets tracker directory
def save_data(data, filename):
    with open(f'json/reddit-tracker/wallstreetbets/{filename}', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
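# Assumes json/reddit-tracker/wallstreetbets/ already exists; open(..., 'w')
# raises FileNotFoundError if the directory is missing.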
def compute_daily_statistics(file_path):
    """Aggregate per-day post counts, comment counts, ticker mentions and sentiment."""
    # Load the scraped posts from the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Per-day statistics, keyed by date
    daily_stats = defaultdict(lambda: {
        'post_count': 0,
        'total_comments': 0,
        'ticker_mentions': defaultdict(lambda: {'total': 0, 'PUT': 0, 'CALL': 0, 'sentiment': []}),
        'unique_tickers': set()
    })

    # Regex patterns: cashtags like "$TSLA" (captures "TSLA"), and PUT/CALL keywords
    ticker_pattern = re.compile(r'\$([A-Z]+)')
    put_pattern = re.compile(r'\b(PUT|PUTS)\b', re.IGNORECASE)
    call_pattern = re.compile(r'\b(CALL|CALLS)\b', re.IGNORECASE)

    for post in data:
        # Convert the UTC timestamp to a date (utcfromtimestamp is deprecated)
        post_date = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc).date()

        daily_stats[post_date]['post_count'] += 1
        daily_stats[post_date]['total_comments'] += post['num_comments']

        # Search title and selftext for cashtags and PUT/CALL mentions
        text_to_search = post['title'] + ' ' + post.get('selftext', '')
        tickers = ticker_pattern.findall(text_to_search)
        put_mentions = len(put_pattern.findall(text_to_search))
        call_mentions = len(call_pattern.findall(text_to_search))

        # Sentiment of the whole post text; every ticker in the post shares the score
        sentiment_scores = sia.polarity_scores(text_to_search)

        for ticker in tickers:
            daily_stats[post_date]['ticker_mentions'][ticker]['total'] += 1
            daily_stats[post_date]['unique_tickers'].add(ticker)
            # PUT/CALL counts are post-level, so they are credited to every ticker mentioned
            daily_stats[post_date]['ticker_mentions'][ticker]['PUT'] += put_mentions
            daily_stats[post_date]['ticker_mentions'][ticker]['CALL'] += call_mentions
            daily_stats[post_date]['ticker_mentions'][ticker]['sentiment'].append(sentiment_scores['compound'])

    # Format the results, newest day first
    formatted_stats = []
    for date, stats in sorted(daily_stats.items(), reverse=True):
        formatted_stats.append({
            'date': date.isoformat(),
            'totalPosts': stats['post_count'],
            'totalComments': stats['total_comments'],
            'totalMentions': sum(mentions['total'] for mentions in stats['ticker_mentions'].values()),
            'companySpread': len(stats['unique_tickers']),
            'tickerMentions': [
                {
                    'symbol': ticker,
                    'count': mentions['total'],
                    'put': mentions['PUT'],
                    'call': mentions['CALL']
                }
                for ticker, mentions in stats['ticker_mentions'].items()
            ]
        })
    return formatted_stats, daily_stats
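# Illustrative shape of one stats.json record (symbol and numbers are made up):
# {
#   "date": "2024-11-22",
#   "totalPosts": 140,
#   "totalComments": 5200,
#   "totalMentions": 310,
#   "companySpread": 45,
#   "tickerMentions": [{"symbol": "TSLA", "count": 32, "put": 5, "call": 11}, ...]
# }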
def compute_trending_tickers(daily_stats):
    """Aggregate mentions over trailing windows and attach quote data per symbol."""
    today = datetime.now().date()
    # Window lengths in days, mapped to the keys used in trending.json;
    # the 'oneDay' bucket spans the last 2 calendar days
    period_keys = {2: 'oneDay', 7: 'oneWeek', 30: 'oneMonth', 90: 'threeMonths'}
    res_dict = {}

    for time_period, period_key in period_keys.items():
        n_days_ago = today - timedelta(days=time_period)
        trending = defaultdict(lambda: {'total': 0, 'PUT': 0, 'CALL': 0, 'sentiment': []})

        # Sum the per-day counts that fall inside the window
        for date, stats in daily_stats.items():
            if n_days_ago <= date <= today:
                for ticker, counts in stats['ticker_mentions'].items():
                    trending[ticker]['total'] += counts['total']
                    trending[ticker]['PUT'] += counts['PUT']
                    trending[ticker]['CALL'] += counts['CALL']
                    trending[ticker]['sentiment'].extend(counts['sentiment'])

        # Keep only symbols known from the stock/ETF databases
        res_list = [
            {
                'symbol': symbol,
                'count': counts['total'],
                'put': counts['PUT'],
                'call': counts['CALL'],
                'avgSentiment': round(sum(counts['sentiment']) / len(counts['sentiment']), 2) if counts['sentiment'] else 0
            }
            for symbol, counts in trending.items() if symbol in total_symbols
        ]
        res_list.sort(key=lambda x: x['count'], reverse=True)

        # Enrich each entry with name, price and daily change from the quote files
        for item in res_list:
            symbol = item['symbol']
            try:
                with open(f'json/quote/{symbol}.json') as f:
                    data = json.load(f)
                name = data['name']
                price = round(data['price'], 2)
                changes_percentage = round(data['changesPercentage'], 2)
            except Exception as e:
                print(e)
                name = None
                price = None
                changes_percentage = None

            if symbol in stock_symbols:
                item['assetType'] = 'stocks'
            elif symbol in etf_symbols:
                item['assetType'] = 'etf'
            else:
                item['assetType'] = ''
            if item['assetType']:
                item['name'] = name
                item['price'] = price
                item['changesPercentage'] = changes_percentage

        res_dict[period_key] = res_list

    return res_dict
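# Illustrative shape of one trending.json entry (values are made up):
# "oneWeek": [{"symbol": "NVDA", "count": 87, "put": 12, "call": 40,
#              "avgSentiment": 0.31, "assetType": "stocks",
#              "name": "...", "price": 0.0, "changesPercentage": 0.0}, ...]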
# Compute and save daily statistics
file_path = 'json/reddit-tracker/wallstreetbets/data.json'
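# Each record in data.json is expected to provide at least the fields used in
# compute_daily_statistics; a minimal illustrative record:
# {"created_utc": 1732000000, "num_comments": 12, "title": "...", "selftext": "..."}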
daily_statistics, daily_stats_dict = compute_daily_statistics(file_path)
save_data(daily_statistics, 'stats.json')
# Compute and save trending tickers
trending_tickers = compute_trending_tickers(daily_stats_dict)
save_data(trending_tickers, 'trending.json')