bugfixing

This commit is contained in:
MuslemRahimi 2024-11-23 14:02:51 +01:00
parent 629e127ea0
commit 7f56ca4daa

View File

@ -1,6 +1,6 @@
import praw import praw
import orjson import orjson
from datetime import datetime from datetime import datetime, timedelta
import os import os
from dotenv import load_dotenv from dotenv import load_dotenv
import time import time
@ -10,23 +10,33 @@ client_key = os.getenv('REDDIT_API_KEY')
client_secret = os.getenv('REDDIT_API_SECRET') client_secret = os.getenv('REDDIT_API_SECRET')
user_agent = os.getenv('REDDIT_USER_AGENT') user_agent = os.getenv('REDDIT_USER_AGENT')
run_all = False
# File path for the JSON data # File path for the JSON data
file_path = 'json/reddit-tracker/wallstreetbets/data.json' file_path = 'json/reddit-tracker/wallstreetbets/data.json'
# Ensure the directory exists # Ensure the directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True) os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Function to load existing data
def load_existing_data(): def load_existing_data():
if os.path.exists(file_path): if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as file: try:
with open(file_path, 'rb') as file:
return orjson.loads(file.read()) return orjson.loads(file.read())
except:
return []
return [] return []
# Function to save data
def save_data(data): def save_data(data):
with open(file_path, 'w', encoding='utf-8') as f: with open(file_path, 'wb') as file:
file.write(orjson.dumps(data, f, ensure_ascii=False, indent=4).decode("utf-8")) file.write(orjson.dumps(data))
def is_within_last_N_days(timestamp, days=180):
    """Return True when ``timestamp`` is no older than ``days`` days.

    Args:
        timestamp: POSIX timestamp in seconds (the caller passes
            ``submission.created_utc``).
        days: Size of the look-back window in days. Defaults to 180, the
            value that was previously hard-coded.

    Returns:
        bool: True if the post time is at or after ``now - days``.
    """
    # Both sides are naive local-time datetimes, so they compare consistently.
    post_time = datetime.fromtimestamp(timestamp)
    cutoff = datetime.now() - timedelta(days=days)
    return post_time >= cutoff
# Initialize Reddit instance # Initialize Reddit instance
reddit = praw.Reddit( reddit = praw.Reddit(
@ -38,13 +48,8 @@ reddit = praw.Reddit(
# Load existing data # Load existing data
existing_data = load_existing_data() existing_data = load_existing_data()
existing_data = [ existing_data = [
{**item, 'upvote_ratio': round(item['upvote_ratio'] * 100,2) if item['upvote_ratio'] < 1 else item['upvote_ratio']} {**item, 'upvote_ratio': round(item['upvote_ratio'] * 100, 2) if item['upvote_ratio'] < 1 else item['upvote_ratio']}
for item in existing_data for item in existing_data if item['num_comments'] >= 50]
if item['num_comments'] >= 50
]
# Create a dictionary of existing posts for faster lookup and update # Create a dictionary of existing posts for faster lookup and update
existing_posts = {post['id']: post for post in existing_data} existing_posts = {post['id']: post for post in existing_data}
@ -55,26 +60,54 @@ data_changed = False
# Get the subreddit # Get the subreddit
subreddit = reddit.subreddit("wallstreetbets") subreddit = reddit.subreddit("wallstreetbets")
# Iterate through new submissions # Different methods to get posts
for submission in subreddit.hot(limit=1000):
# Choose which subreddit listings to scan. With run_all set, every listing
# below is pulled (intended as a one-off backfill); otherwise only the hot and
# new listings are refreshed.
# NOTE(review): Reddit listings are generally capped at roughly 1000 items, so
# limit=5000 may not yield more than limit=1000 - confirm against the PRAW docs.
if run_all:
    methods = [
        subreddit.hot(limit=5000),
        subreddit.new(limit=5000),
        subreddit.top(time_filter='month', limit=5000),
        subreddit.top(time_filter='week', limit=5000),
        subreddit.top(time_filter='year', limit=5000),
    ]
else:
    methods = [
        subreddit.hot(limit=1000),
        subreddit.new(limit=1000),
    ]
processed_ids = set()
for submission_stream in methods:
try:
for submission in submission_stream:
post_id = submission.id post_id = submission.id
# Skip if we've already processed this post
if post_id in processed_ids:
continue
processed_ids.add(post_id)
# Check if the post is within the last 180 days (about six months)
if not is_within_last_N_days(submission.created_utc):
continue
# Check if the post was deleted by moderators # Check if the post was deleted by moderators
if submission.removed_by_category == "mod": if submission.removed_by_category == "mod":
# Remove post from existing data if it was deleted by moderators
if post_id in existing_posts: if post_id in existing_posts:
del existing_posts[post_id] del existing_posts[post_id]
data_changed = True data_changed = True
print('deleted') print(f'Deleted post: {post_id}')
continue # Skip this post continue
if submission.num_comments < 50: if submission.num_comments < 50:
# Remove post from existing data if it was deleted by moderators
if post_id in existing_posts: if post_id in existing_posts:
del existing_posts[post_id] del existing_posts[post_id]
data_changed = True data_changed = True
print('deleted') print(f'Removed low-comment post: {post_id}')
continue # Skip this post continue
# Check if this post is already in our data # Check if this post is already in our data
if post_id in existing_posts: if post_id in existing_posts:
@ -83,13 +116,13 @@ for submission in subreddit.hot(limit=1000):
existing_posts[post_id]['num_comments'] = submission.num_comments existing_posts[post_id]['num_comments'] = submission.num_comments
data_changed = True data_changed = True
else: else:
if submission.num_comments < 50:
continue # Skip this post
# Try to get a high-quality thumbnail URL # Try to get a high-quality thumbnail URL
thumbnail = None thumbnail = None
if hasattr(submission, 'preview'): if hasattr(submission, 'preview'):
try:
thumbnail = submission.preview['images'][0]['source']['url'] thumbnail = submission.preview['images'][0]['source']['url']
except (KeyError, IndexError):
pass
# Extract the required fields for new post # Extract the required fields for new post
extracted_post = { extracted_post = {
@ -108,6 +141,14 @@ for submission in subreddit.hot(limit=1000):
# Add the new post to the existing data # Add the new post to the existing data
existing_posts[post_id] = extracted_post existing_posts[post_id] = extracted_post
data_changed = True data_changed = True
print(f'Added new post: {post_id}')
# Sleep briefly to avoid hitting rate limits
time.sleep(0.1)
except Exception as e:
print(f"Error processing submission stream: {e}")
continue
if data_changed: if data_changed:
# Convert the dictionary back to a list and sort by created_utc # Convert the dictionary back to a list and sort by created_utc
@ -117,5 +158,6 @@ if data_changed:
# Save the updated data # Save the updated data
save_data(updated_data) save_data(updated_data)
print(f"Data updated and saved to {file_path}") print(f"Data updated and saved to {file_path}")
print(f"Total posts in database: {len(updated_data)}")
else: else:
print("No new data to add or update.") print("No new data to add or update.")