From 7f56ca4daaa2d816402f15ca7808eb7c336dda90 Mon Sep 17 00:00:00 2001
From: MuslemRahimi
Date: Sat, 23 Nov 2024 14:02:51 +0100
Subject: [PATCH] bugfixing

---
 app/cron_reddit_tracker.py | 174 +++++++++++++++++++++++--------------
 1 file changed, 108 insertions(+), 66 deletions(-)

diff --git a/app/cron_reddit_tracker.py b/app/cron_reddit_tracker.py
index 439817e..0225c6f 100644
--- a/app/cron_reddit_tracker.py
+++ b/app/cron_reddit_tracker.py
@@ -1,6 +1,6 @@
 import praw
 import orjson
-from datetime import datetime
+from datetime import datetime, timedelta
 import os
 from dotenv import load_dotenv
 import time
@@ -10,23 +10,33 @@
 client_key = os.getenv('REDDIT_API_KEY')
 client_secret = os.getenv('REDDIT_API_SECRET')
 user_agent = os.getenv('REDDIT_USER_AGENT')
+
+run_all = False
+
 # File path for the JSON data
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 
 # Ensure the directory exists
 os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
-# Function to load existing data
 def load_existing_data():
     if os.path.exists(file_path):
-        with open(file_path, 'r', encoding='utf-8') as file:
-            return orjson.loads(file.read())
+        try:
+            with open(file_path, 'rb') as file:
+                return orjson.loads(file.read())
+        except Exception:
+            return []
     return []
 
-# Function to save data
 def save_data(data):
-    with open(file_path, 'w', encoding='utf-8') as f:
-        file.write(orjson.dumps(data, f, ensure_ascii=False, indent=4).decode("utf-8"))
+    with open(file_path, 'wb') as file:
+        file.write(orjson.dumps(data))
+
+def is_within_last_N_days(timestamp):
+    current_time = datetime.now()
+    post_time = datetime.fromtimestamp(timestamp)
+    cutoff = current_time - timedelta(days=180)  # N = 180 days
+    return post_time >= cutoff
 
 # Initialize Reddit instance
 reddit = praw.Reddit(
@@ -38,13 +48,8 @@ reddit = praw.Reddit(
 # Load existing data
 existing_data = load_existing_data()
 existing_data = [
-    {**item, 'upvote_ratio': round(item['upvote_ratio'] * 100,2) if item['upvote_ratio'] < 1 else item['upvote_ratio']}
-    for item in existing_data
-    if item['num_comments'] >= 50
-]
-
-
-
+    {**item, 'upvote_ratio': round(item['upvote_ratio'] * 100, 2) if item['upvote_ratio'] < 1 else item['upvote_ratio']}
+    for item in existing_data if item['num_comments'] >= 50]
 
 # Create a dictionary of existing posts for faster lookup and update
 existing_posts = {post['id']: post for post in existing_data}
@@ -55,59 +60,95 @@
 data_changed = False
 
 # Get the subreddit
 subreddit = reddit.subreddit("wallstreetbets")
 
-# Iterate through new submissions
-for submission in subreddit.hot(limit=1000):
-    post_id = submission.id
-
-    # Check if the post was deleted by moderators
-    if submission.removed_by_category == "mod":
-        # Remove post from existing data if it was deleted by moderators
-        if post_id in existing_posts:
-            del existing_posts[post_id]
-            data_changed = True
-            print('deleted')
-        continue  # Skip this post
+# Different methods to get posts
 
-    if submission.num_comments < 50:
-        # Remove post from existing data if it was deleted by moderators
-        if post_id in existing_posts:
-            del existing_posts[post_id]
-            data_changed = True
-            print('deleted')
-        continue  # Skip this post
+# Run once for a full backfill
+if run_all:
+    methods = [
+        subreddit.hot(limit=5000),
+        subreddit.new(limit=5000),
+        subreddit.top(time_filter='month', limit=5000),
+        subreddit.top(time_filter='week', limit=5000),
+        subreddit.top(time_filter='year', limit=5000),
+    ]
+else:
+    methods = [
+        subreddit.hot(limit=1000),
+        subreddit.new(limit=1000),
+    ]
 
-    # Check if this post is already in our data
-    if post_id in existing_posts:
-        # Update existing post
-        existing_posts[post_id]['upvote_ratio'] = round(submission.upvote_ratio * 100, 2)
-        existing_posts[post_id]['num_comments'] = submission.num_comments
-        data_changed = True
-    else:
-        if submission.num_comments < 50:
-            continue  # Skip this post
-
-        # Try to get a high-quality thumbnail URL
-        thumbnail = None
-        if hasattr(submission, 'preview'):
-            thumbnail = submission.preview['images'][0]['source']['url']
-
-        # Extract the required fields for new post
-        extracted_post = {
-            "id": post_id,
-            "permalink": submission.permalink,
-            "title": submission.title,
-            "thumbnail": thumbnail,
-            "selftext": submission.selftext,
-            "created_utc": int(submission.created_utc),
-            "upvote_ratio": round(submission.upvote_ratio * 100, 2),
-            "num_comments": submission.num_comments,
-            "link_flair_text": submission.link_flair_text,
-            "author": str(submission.author),
-        }
-
-        # Add the new post to the existing data
-        existing_posts[post_id] = extracted_post
-        data_changed = True
+processed_ids = set()
+
+for submission_stream in methods:
+    try:
+        for submission in submission_stream:
+            post_id = submission.id
+
+            # Skip if we've already processed this post
+            if post_id in processed_ids:
+                continue
+
+            processed_ids.add(post_id)
+
+            # Skip posts older than the 180-day window
+            if not is_within_last_N_days(submission.created_utc):
+                continue
+
+            # Check if the post was deleted by moderators
+            if submission.removed_by_category == "mod":
+                if post_id in existing_posts:
+                    del existing_posts[post_id]
+                    data_changed = True
+                    print(f'Deleted post: {post_id}')
+                continue
+
+            if submission.num_comments < 50:
+                if post_id in existing_posts:
+                    del existing_posts[post_id]
+                    data_changed = True
+                    print(f'Removed low-comment post: {post_id}')
+                continue
+
+            # Check if this post is already in our data
+            if post_id in existing_posts:
+                # Update existing post
+                existing_posts[post_id]['upvote_ratio'] = round(submission.upvote_ratio * 100, 2)
+                existing_posts[post_id]['num_comments'] = submission.num_comments
+                data_changed = True
+            else:
+                # Try to get a high-quality thumbnail URL
+                thumbnail = None
+                if hasattr(submission, 'preview'):
+                    try:
+                        thumbnail = submission.preview['images'][0]['source']['url']
+                    except (KeyError, IndexError):
+                        pass
+
+                # Extract the required fields for new post
+                extracted_post = {
+                    "id": post_id,
+                    "permalink": submission.permalink,
+                    "title": submission.title,
+                    "thumbnail": thumbnail,
+                    "selftext": submission.selftext,
+                    "created_utc": int(submission.created_utc),
+                    "upvote_ratio": round(submission.upvote_ratio * 100, 2),
+                    "num_comments": submission.num_comments,
+                    "link_flair_text": submission.link_flair_text,
+                    "author": str(submission.author),
+                }
+
+                # Add the new post to the existing data
+                existing_posts[post_id] = extracted_post
+                data_changed = True
+                print(f'Added new post: {post_id}')
+
+        # Sleep briefly to avoid hitting rate limits
+        time.sleep(0.1)
+
+    except Exception as e:
+        print(f"Error processing submission stream: {e}")
+        continue
@@ -117,5 +158,6 @@
 if data_changed:
     # Convert the dictionary back to a list and sort by created_utc
     updated_data = sorted(existing_posts.values(), key=lambda x: x['created_utc'], reverse=True)
 
     # Save the updated data
     save_data(updated_data)
     print(f"Data updated and saved to {file_path}")
+    print(f"Total posts in database: {len(updated_data)}")
 else:
-    print("No new data to add or update.")
+    print("No new data to add or update.")
\ No newline at end of file
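
Reviewer note: the central fix is save_data(). The removed version wrote through an undefined name (the handle is opened as f but written via file) and passed json.dump-style arguments (a file handle, ensure_ascii, indent) that orjson.dumps does not accept, while also opening the file in text mode even though orjson produces bytes, so every save crashed. A minimal round-trip sketch of the orjson idiom the patch adopts (the file name and sample record here are illustrative, not part of the patch):

    import orjson

    posts = [{"id": "abc123", "num_comments": 87, "upvote_ratio": 94.0}]

    # orjson.dumps returns bytes, so the file must be opened in binary mode.
    with open("data.json", "wb") as f:
        f.write(orjson.dumps(posts))

    with open("data.json", "rb") as f:
        assert orjson.loads(f.read()) == posts

If human-readable output were wanted, orjson.dumps(posts, option=orjson.OPT_INDENT_2) would restore the indentation the old json-style call was aiming for, at the cost of a larger file.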