update reddit cron job

MuslemRahimi 2024-07-27 16:18:40 +02:00
parent 343797b70c
commit ba912469b3
3 changed files with 81 additions and 85 deletions

File 1 of 3: the daily statistics script (writes json/reddit-tracker/wallstreetbets/stats.json)

@@ -1,26 +1,37 @@
 import json
 import re
 import requests
+import praw
 from datetime import datetime
 from collections import defaultdict
-
-def get_subscriber_count():
-    url = "https://www.reddit.com/r/wallstreetbets/new.json"
-    headers = {'User-agent': 'Mozilla/5.0'}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        data = response.json()
-        return data['data']['children'][0]['data']['subreddit_subscribers']
-    return None
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
+
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
+
+# Function to save data
+def save_data(data):
+    with open('json/reddit-tracker/wallstreetbets/stats.json', 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
 
 def compute_daily_statistics(file_path):
     # Load the data from the JSON file
     with open(file_path, 'r', encoding='utf-8') as f:
         data = json.load(f)
-    # Get current subscriber count
-    subscriber_count = get_subscriber_count()
     # Initialize a defaultdict to store daily statistics
     daily_stats = defaultdict(lambda: {
         'post_count': 0,
@@ -56,7 +67,6 @@ def compute_daily_statistics(file_path):
         'date': date.isoformat(),
         'totalPosts': stats['post_count'],
         'totalComments': stats['total_comments'],
-        'subscribersCount': subscriber_count,
         'totalMentions': sum(stats['ticker_mentions'].values()),
         'companySpread': len(stats['unique_tickers']),
         'tickerMentions': dict(stats['ticker_mentions'])  # Optional: include detailed ticker mentions
@@ -67,4 +77,4 @@ def compute_daily_statistics(file_path):
 # Usage
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 daily_statistics = compute_daily_statistics(file_path)
-print(json.dumps(daily_statistics, indent=2))
+save_data(daily_statistics)
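One behavioral change worth flagging: the old scrape-based get_subscriber_count() is removed outright, so the stats output loses its subscribersCount field rather than gaining a PRAW equivalent. If that datapoint is wanted back, PRAW exposes it as a lazy attribute on the subreddit object. A minimal sketch, reusing the same .env credentials as the script above:

import os
import praw
from dotenv import load_dotenv

load_dotenv()

# Read-only instance, same credentials as the updated script
reddit = praw.Reddit(
    client_id=os.getenv('REDDIT_API_KEY'),
    client_secret=os.getenv('REDDIT_API_SECRET'),
    user_agent=os.getenv('REDDIT_USER_AGENT'),
)

# Accessing .subscribers triggers a single API fetch of the subreddit's metadata
print(reddit.subreddit("wallstreetbets").subscribers)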

File 2 of 3: the post-fetching cron script (writes json/reddit-tracker/wallstreetbets/data.json)

@@ -1,18 +1,18 @@
-import requests
+import praw
 import json
 from datetime import datetime
 import os
+from dotenv import load_dotenv
+import time
 
-# URL of the Reddit API endpoint
-url = "https://www.reddit.com/r/wallstreetbets/new.json"
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
 
 # File path for the JSON data
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 
-headers = {
-    'User-Agent': 'python:myapp:v1.0 (by /u/realstocknear)'
-}
-
 # Ensure the directory exists
 os.makedirs(os.path.dirname(file_path), exist_ok=True)
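Both scripts now read their Reddit credentials via load_dotenv(), so the cron host needs them in a .env file or exported in the environment. The variable names below come straight from the os.getenv calls; the values are placeholders. Script-type credentials without a username/password give a read-only praw.Reddit instance, which is all these fetch-only jobs need:

# .env (placeholder values)
REDDIT_API_KEY=your_client_id
REDDIT_API_SECRET=your_client_secret
REDDIT_USER_AGENT=python:myapp:v1.0 (by /u/your_username)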
@@ -28,14 +28,12 @@ def save_data(data):
     with open(file_path, 'w', encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
 
-# Function to get updated post data
-def get_updated_post_data(permalink):
-    post_url = f"https://www.reddit.com{permalink}.json"
-    response = requests.get(post_url, headers=headers)
-    if response.status_code == 200:
-        post_data = response.json()[0]['data']['children'][0]['data']
-        return post_data
-    return None
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
 
 # Load existing data
 existing_data = load_existing_data()
@@ -43,53 +41,42 @@ existing_data = load_existing_data()
 # Create a dictionary of existing posts for faster lookup and update
 existing_posts = {post['id']: post for post in existing_data}
 
-# Send a GET request to the API
-response = requests.get(url, headers=headers)
-counter = 0
-# Check if the request was successful
-if response.status_code == 200:
-    # Parse the JSON data
-    data = response.json()
-    # Flag to check if any data was added or updated
-    data_changed = False
-    # Iterate through each post in the 'children' list
-    for post in data['data']['children']:
-        post_data = post['data']
-        post_id = post_data.get('id', '')
+# Flag to check if any data was added or updated
+data_changed = False
+
+# Get the subreddit
+subreddit = reddit.subreddit("wallstreetbets")
+
+# Iterate through new submissions
+for submission in subreddit.new(limit=1000):
+    post_id = submission.id
 
     # Check if this post is already in our data
     if post_id in existing_posts:
         # Update existing post
-        if counter < 25:  # Only update the latest 25 posts to not overload the reddit server
-            updated_data = get_updated_post_data(post_data['permalink'])
-            if updated_data:
-                existing_posts[post_id]['upvote_ratio'] = updated_data.get('upvote_ratio', existing_posts[post_id]['upvote_ratio'])
-                existing_posts[post_id]['num_comments'] = updated_data.get('num_comments', existing_posts[post_id]['num_comments'])
-                data_changed = True
-            counter += 1
-            print(counter)
+        existing_posts[post_id]['upvote_ratio'] = submission.upvote_ratio
+        existing_posts[post_id]['num_comments'] = submission.num_comments
+        data_changed = True
     else:
         # Extract the required fields for new post
         extracted_post = {
             "id": post_id,
-            "permalink": post_data.get('permalink', ''),
-            "title": post_data.get('title', ''),
-            "selftext": post_data.get('selftext', ''),
-            "created_utc": post_data.get('created_utc', ''),
-            "upvote_ratio": post_data.get('upvote_ratio', ''),
-            "num_comments": post_data.get('num_comments', ''),
-            "link_flair_text": post_data.get('link_flair_text', ''),
-            "author": post_data.get('author', ''),
+            "permalink": submission.permalink,
+            "title": submission.title,
+            "selftext": submission.selftext,
+            "created_utc": int(submission.created_utc),
+            "upvote_ratio": submission.upvote_ratio,
+            "num_comments": submission.num_comments,
+            "link_flair_text": submission.link_flair_text,
+            "author": str(submission.author),
         }
         # Add the new post to the existing data
         existing_posts[post_id] = extracted_post
         data_changed = True
 
-    if data_changed:
+    time.sleep(1)  # Add a 1-second delay between processing submissions
+
+if data_changed:
     # Convert the dictionary back to a list and sort by created_utc
     updated_data = list(existing_posts.values())
     updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
@@ -97,7 +84,5 @@ if response.status_code == 200:
-        # Save the updated data
-        save_data(updated_data)
-        print(f"Data updated and saved to {file_path}")
-    else:
-        print("No new data to add or update.")
-else:
-    print(f"Failed to retrieve data. Status code: {response.status_code}")
+    # Save the updated data
+    save_data(updated_data)
+    print(f"Data updated and saved to {file_path}")
+else:
+    print("No new data to add or update.")

File 3 of 3: the Python requirements list

@@ -37,3 +37,4 @@ finnhub-python
 intrinio_sdk
 openai
 slowapi
+praw
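praw is the only new entry here and installs in the usual way; note that the dotenv import in both scripts relies on python-dotenv, which is presumably already listed in the part of this requirements file the diff does not show.

pip install praw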