update reddit cron job

MuslemRahimi 2024-07-27 16:18:40 +02:00
parent 343797b70c
commit ba912469b3
3 changed files with 81 additions and 85 deletions

File 1 of 3: the daily statistics script (writes json/reddit-tracker/wallstreetbets/stats.json)

@@ -1,26 +1,37 @@
 import json
 import re
 import requests
+import praw
 from datetime import datetime
 from collections import defaultdict
-
-def get_subscriber_count():
-    url = "https://www.reddit.com/r/wallstreetbets/new.json"
-    headers = {'User-agent': 'Mozilla/5.0'}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        data = response.json()
-        return data['data']['children'][0]['data']['subreddit_subscribers']
-    return None
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
+
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
+
+# Function to save data
+def save_data(data):
+    with open('json/reddit-tracker/wallstreetbets/stats.json', 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
 
 def compute_daily_statistics(file_path):
     # Load the data from the JSON file
     with open(file_path, 'r', encoding='utf-8') as f:
         data = json.load(f)
-    # Get current subscriber count
-    subscriber_count = get_subscriber_count()
     # Initialize a defaultdict to store daily statistics
     daily_stats = defaultdict(lambda: {
         'post_count': 0,
@@ -56,7 +67,6 @@ def compute_daily_statistics(file_path):
         'date': date.isoformat(),
         'totalPosts': stats['post_count'],
         'totalComments': stats['total_comments'],
-        'subscribersCount': subscriber_count,
         'totalMentions': sum(stats['ticker_mentions'].values()),
         'companySpread': len(stats['unique_tickers']),
         'tickerMentions': dict(stats['ticker_mentions'])  # Optional: include detailed ticker mentions
@@ -67,4 +77,4 @@ def compute_daily_statistics(file_path):
 # Usage
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 daily_statistics = compute_daily_statistics(file_path)
-print(json.dumps(daily_statistics, indent=2))
+save_data(daily_statistics)
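One behavioral change worth flagging: the old scrape-based get_subscriber_count() is removed outright, so the stats output loses its subscribersCount field rather than gaining a PRAW equivalent. If that datapoint is wanted back, PRAW exposes it as a lazy attribute on the subreddit object. A minimal sketch, reusing the same .env credentials as the script above:

import os
import praw
from dotenv import load_dotenv

load_dotenv()

# Read-only instance, same credentials as the updated script
reddit = praw.Reddit(
    client_id=os.getenv('REDDIT_API_KEY'),
    client_secret=os.getenv('REDDIT_API_SECRET'),
    user_agent=os.getenv('REDDIT_USER_AGENT'),
)

# Accessing .subscribers triggers a single API fetch of the subreddit's metadata
print(reddit.subreddit("wallstreetbets").subscribers)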

File 2 of 3: the post-fetching cron script (writes json/reddit-tracker/wallstreetbets/data.json)

@@ -1,18 +1,18 @@
-import requests
+import praw
 import json
 from datetime import datetime
 import os
+from dotenv import load_dotenv
+import time
 
-# URL of the Reddit API endpoint
-url = "https://www.reddit.com/r/wallstreetbets/new.json"
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
 
 # File path for the JSON data
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 
-headers = {
-    'User-Agent': 'python:myapp:v1.0 (by /u/realstocknear)'
-}
-
 # Ensure the directory exists
 os.makedirs(os.path.dirname(file_path), exist_ok=True)
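Both scripts now read their Reddit credentials via load_dotenv(), so the cron host needs them in a .env file or exported in the environment. The variable names below come straight from the os.getenv calls; the values are placeholders. Script-type credentials without a username/password give a read-only praw.Reddit instance, which is all these fetch-only jobs need:

# .env (placeholder values)
REDDIT_API_KEY=your_client_id
REDDIT_API_SECRET=your_client_secret
REDDIT_USER_AGENT=python:myapp:v1.0 (by /u/your_username)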
@@ -28,14 +28,12 @@ def save_data(data):
     with open(file_path, 'w', encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
 
-# Function to get updated post data
-def get_updated_post_data(permalink):
-    post_url = f"https://www.reddit.com{permalink}.json"
-    response = requests.get(post_url, headers=headers)
-    if response.status_code == 200:
-        post_data = response.json()[0]['data']['children'][0]['data']
-        return post_data
-    return None
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
 
 # Load existing data
 existing_data = load_existing_data()
@@ -43,53 +41,42 @@ existing_data = load_existing_data()
 # Create a dictionary of existing posts for faster lookup and update
 existing_posts = {post['id']: post for post in existing_data}
 
-# Send a GET request to the API
-response = requests.get(url, headers=headers)
-counter = 0
-# Check if the request was successful
-if response.status_code == 200:
-    # Parse the JSON data
-    data = response.json()
-    # Flag to check if any data was added or updated
-    data_changed = False
-    # Iterate through each post in the 'children' list
-    for post in data['data']['children']:
-        post_data = post['data']
-        post_id = post_data.get('id', '')
+# Flag to check if any data was added or updated
+data_changed = False
+
+# Get the subreddit
+subreddit = reddit.subreddit("wallstreetbets")
+
+# Iterate through new submissions
+for submission in subreddit.new(limit=1000):
+    post_id = submission.id
 
     # Check if this post is already in our data
     if post_id in existing_posts:
         # Update existing post
-        if counter < 25:  # Only update the latest 25 posts to not overload the reddit server
-            updated_data = get_updated_post_data(post_data['permalink'])
-            if updated_data:
-                existing_posts[post_id]['upvote_ratio'] = updated_data.get('upvote_ratio', existing_posts[post_id]['upvote_ratio'])
-                existing_posts[post_id]['num_comments'] = updated_data.get('num_comments', existing_posts[post_id]['num_comments'])
-                data_changed = True
-            counter += 1
-            print(counter)
+        existing_posts[post_id]['upvote_ratio'] = submission.upvote_ratio
+        existing_posts[post_id]['num_comments'] = submission.num_comments
+        data_changed = True
     else:
         # Extract the required fields for new post
         extracted_post = {
             "id": post_id,
-            "permalink": post_data.get('permalink', ''),
-            "title": post_data.get('title', ''),
-            "selftext": post_data.get('selftext', ''),
-            "created_utc": post_data.get('created_utc', ''),
-            "upvote_ratio": post_data.get('upvote_ratio', ''),
-            "num_comments": post_data.get('num_comments', ''),
-            "link_flair_text": post_data.get('link_flair_text', ''),
-            "author": post_data.get('author', ''),
+            "permalink": submission.permalink,
+            "title": submission.title,
+            "selftext": submission.selftext,
+            "created_utc": int(submission.created_utc),
+            "upvote_ratio": submission.upvote_ratio,
+            "num_comments": submission.num_comments,
+            "link_flair_text": submission.link_flair_text,
+            "author": str(submission.author),
         }
         # Add the new post to the existing data
         existing_posts[post_id] = extracted_post
         data_changed = True
 
-    if data_changed:
+    time.sleep(1)  # Add a 1-second delay between processing submissions
+
+if data_changed:
     # Convert the dictionary back to a list and sort by created_utc
     updated_data = list(existing_posts.values())
     updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
@@ -97,7 +84,5 @@ if response.status_code == 200:
-        # Save the updated data
-        save_data(updated_data)
-        print(f"Data updated and saved to {file_path}")
-    else:
-        print("No new data to add or update.")
-else:
-    print(f"Failed to retrieve data. Status code: {response.status_code}")
+    # Save the updated data
+    save_data(updated_data)
+    print(f"Data updated and saved to {file_path}")
+else:
+    print("No new data to add or update.")

File 3 of 3: the Python requirements list

@@ -37,3 +37,4 @@ finnhub-python
 intrinio_sdk
 openai
 slowapi
+praw
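praw is the only new entry here and installs in the usual way; note that the dotenv import in both scripts relies on python-dotenv, which is presumably already listed in the part of this requirements file the diff does not show.

pip install praw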