bugfixing
This commit is contained in:
parent 629e127ea0
commit 7f56ca4daa
@@ -1,6 +1,6 @@
 import praw
 import orjson
-from datetime import datetime
+from datetime import datetime, timedelta
 import os
 from dotenv import load_dotenv
 import time
@@ -10,23 +10,33 @@ client_key = os.getenv('REDDIT_API_KEY')
 client_secret = os.getenv('REDDIT_API_SECRET')
 user_agent = os.getenv('REDDIT_USER_AGENT')
 
+
+run_all = False
+
 # File path for the JSON data
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 
 # Ensure the directory exists
 os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
 # Function to load existing data
 def load_existing_data():
-    if os.path.exists(file_path):
-        with open(file_path, 'r', encoding='utf-8') as file:
-            return orjson.loads(file.read())
-    return []
+    try:
+        with open(file_path, 'rb') as file:
+            return orjson.loads(file.read())
+    except:
+        return []
 
 # Function to save data
 def save_data(data):
-    with open(file_path, 'w', encoding='utf-8') as f:
-        file.write(orjson.dumps(data, f, ensure_ascii=False, indent=4).decode("utf-8"))
+    with open(file_path, 'wb') as file:
+        file.write(orjson.dumps(data))
 
+def is_within_last_N_days(timestamp):
+    current_time = datetime.now()
+    post_time = datetime.fromtimestamp(timestamp)
+    three_months_ago = current_time - timedelta(days=180)
+    return post_time >= three_months_ago
+
 # Initialize Reddit instance
 reddit = praw.Reddit(
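The crash fixed in save_data came from calling orjson with json-style arguments: orjson.dumps() accepts only the object, an optional default=, and option= flags, and it returns bytes, so the file must be opened in binary mode. A minimal sketch of the corrected round trip under the commit's file layout; the narrowed except clause and the OPT_INDENT_2 flag are readability suggestions, not part of this commit:

import orjson

file_path = 'json/reddit-tracker/wallstreetbets/data.json'

def save_data(data):
    # orjson.dumps returns bytes, hence the 'wb' mode
    with open(file_path, 'wb') as file:
        file.write(orjson.dumps(data, option=orjson.OPT_INDENT_2))

def load_existing_data():
    try:
        with open(file_path, 'rb') as file:
            return orjson.loads(file.read())
    except (FileNotFoundError, orjson.JSONDecodeError):
        # Missing or unreadable file: start from an empty dataset
        return []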
@@ -38,13 +48,8 @@ reddit = praw.Reddit(
 # Load existing data
 existing_data = load_existing_data()
 existing_data = [
-    {**item, 'upvote_ratio': round(item['upvote_ratio'] * 100,2) if item['upvote_ratio'] < 1 else item['upvote_ratio']}
-    for item in existing_data
-    if item['num_comments'] >= 50
-]
-
-
-
+    {**item, 'upvote_ratio': round(item['upvote_ratio'] * 100, 2) if item['upvote_ratio'] < 1 else item['upvote_ratio']}
+    for item in existing_data if item['num_comments'] >= 50]
 
 # Create a dictionary of existing posts for faster lookup and update
 existing_posts = {post['id']: post for post in existing_data}
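The rewritten comprehension doubles as a one-time migration: upvote ratios still stored in Reddit's native 0-1 form are rescaled to percentages, values already at or above 1 are assumed to be migrated and pass through unchanged, and posts under 50 comments are dropped. A small illustration of the intended behavior, with made-up records:

sample = [
    {'id': 'a1', 'upvote_ratio': 0.97, 'num_comments': 120},  # legacy 0-1 ratio
    {'id': 'b2', 'upvote_ratio': 97.0, 'num_comments': 80},   # already a percentage
    {'id': 'c3', 'upvote_ratio': 0.88, 'num_comments': 10},   # dropped: too few comments
]
migrated = [
    {**item, 'upvote_ratio': round(item['upvote_ratio'] * 100, 2) if item['upvote_ratio'] < 1 else item['upvote_ratio']}
    for item in sample if item['num_comments'] >= 50
]
# migrated keeps 'a1' (rescaled to 97.0) and 'b2' (unchanged); 'c3' is filtered out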
@@ -55,59 +60,95 @@ data_changed = False
 # Get the subreddit
 subreddit = reddit.subreddit("wallstreetbets")
 
-# Iterate through new submissions
-for submission in subreddit.hot(limit=1000):
-    post_id = submission.id
-
-    # Check if the post was deleted by moderators
-    if submission.removed_by_category == "mod":
-        # Remove post from existing data if it was deleted by moderators
-        if post_id in existing_posts:
-            del existing_posts[post_id]
-            data_changed = True
-            print('deleted')
-        continue  # Skip this post
-
-    if submission.num_comments < 50:
-        # Remove post from existing data if it was deleted by moderators
-        if post_id in existing_posts:
-            del existing_posts[post_id]
-            data_changed = True
-            print('deleted')
-        continue  # Skip this post
-
-    # Check if this post is already in our data
-    if post_id in existing_posts:
-        # Update existing post
-        existing_posts[post_id]['upvote_ratio'] = round(submission.upvote_ratio * 100, 2)
-        existing_posts[post_id]['num_comments'] = submission.num_comments
-        data_changed = True
-    else:
-        if submission.num_comments < 50:
-            continue  # Skip this post
-
-        # Try to get a high-quality thumbnail URL
-        thumbnail = None
-        if hasattr(submission, 'preview'):
-            thumbnail = submission.preview['images'][0]['source']['url']
-
-        # Extract the required fields for new post
-        extracted_post = {
-            "id": post_id,
-            "permalink": submission.permalink,
-            "title": submission.title,
-            "thumbnail": thumbnail,
-            "selftext": submission.selftext,
-            "created_utc": int(submission.created_utc),
-            "upvote_ratio": round(submission.upvote_ratio * 100, 2),
-            "num_comments": submission.num_comments,
-            "link_flair_text": submission.link_flair_text,
-            "author": str(submission.author),
-        }
-
-        # Add the new post to the existing data
-        existing_posts[post_id] = extracted_post
-        data_changed = True
+# Different methods to get posts
+#Run once
+if run_all == True:
+    methods = [
+        subreddit.hot(limit=5000),
+        subreddit.new(limit=5000),
+        subreddit.top(time_filter='month', limit=5000),
+        subreddit.top(time_filter='week', limit=5000),
+        subreddit.top(time_filter='year', limit=5000),
+    ]
+else:
+    methods = [
+        subreddit.hot(limit=1000),
+        subreddit.new(limit=1000),
+    ]
+
+processed_ids = set()
+
+for submission_stream in methods:
+    try:
+        for submission in submission_stream:
+            post_id = submission.id
+
+            # Skip if we've already processed this post
+            if post_id in processed_ids:
+                continue
+
+            processed_ids.add(post_id)
+
+            # Check if the post is within the last 3 months
+            if not is_within_last_N_days(submission.created_utc):
+                continue
+
+            # Check if the post was deleted by moderators
+            if submission.removed_by_category == "mod":
+                if post_id in existing_posts:
+                    del existing_posts[post_id]
+                    data_changed = True
+                    print(f'Deleted post: {post_id}')
+                continue
+
+            if submission.num_comments < 50:
+                if post_id in existing_posts:
+                    del existing_posts[post_id]
+                    data_changed = True
+                    print(f'Removed low-comment post: {post_id}')
+                continue
+
+            # Check if this post is already in our data
+            if post_id in existing_posts:
+                # Update existing post
+                existing_posts[post_id]['upvote_ratio'] = round(submission.upvote_ratio * 100, 2)
+                existing_posts[post_id]['num_comments'] = submission.num_comments
+                data_changed = True
+            else:
+                # Try to get a high-quality thumbnail URL
+                thumbnail = None
+                if hasattr(submission, 'preview'):
+                    try:
+                        thumbnail = submission.preview['images'][0]['source']['url']
+                    except (KeyError, IndexError):
+                        pass
+
+                # Extract the required fields for new post
+                extracted_post = {
+                    "id": post_id,
+                    "permalink": submission.permalink,
+                    "title": submission.title,
+                    "thumbnail": thumbnail,
+                    "selftext": submission.selftext,
+                    "created_utc": int(submission.created_utc),
+                    "upvote_ratio": round(submission.upvote_ratio * 100, 2),
+                    "num_comments": submission.num_comments,
+                    "link_flair_text": submission.link_flair_text,
+                    "author": str(submission.author),
+                }
+
+                # Add the new post to the existing data
+                existing_posts[post_id] = extracted_post
+                data_changed = True
+                print(f'Added new post: {post_id}')
+
+            # Sleep briefly to avoid hitting rate limits
+            time.sleep(0.1)
+
+    except Exception as e:
+        print(f"Error processing submission stream: {e}")
+        continue
 
 if data_changed:
     # Convert the dictionary back to a list and sort by created_utc
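One caveat in the new helper this loop relies on: submission.created_utc is a UTC epoch timestamp, while datetime.now() is local time, and despite the three_months_ago name the window is 180 days. A timezone-consistent variant of the same check, offered as a sketch rather than what the commit ships:

from datetime import datetime, timedelta, timezone

def is_within_last_n_days(timestamp, days=180):
    # Compare in UTC, since created_utc is a UTC epoch timestamp
    post_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    return post_time >= cutoff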
@@ -117,5 +158,6 @@ if data_changed:
     # Save the updated data
     save_data(updated_data)
     print(f"Data updated and saved to {file_path}")
+    print(f"Total posts in database: {len(updated_data)}")
 else:
-    print("No new data to add or update.")
+    print("No new data to add or update.")
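The final hunk shows only the comment above the sort, so the exact statement is elided by the diff. Consistent with the surrounding variables, the dictionary-to-sorted-list step would look roughly like this sketch (the newest-first ordering via reverse=True is an assumption):

if data_changed:
    # Convert the dictionary back to a list, newest posts first
    updated_data = sorted(existing_posts.values(),
                          key=lambda post: post['created_utc'], reverse=True)

    # Save the updated data
    save_data(updated_data)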