# backend/app/cron_sentiment_analysis.py
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from tqdm import tqdm
from datetime import datetime, timedelta
import asyncio
import aiohttp
import sqlite3
import ujson
import random
import os
from dotenv import load_dotenv

# One-time setup: download the VADER lexicon before the first run.
# import nltk
# nltk.download('vader_lexicon')

load_dotenv()
api_key = os.getenv('FMP_API_KEY')

# VADER analyzer; kept for the commented-out scoring path in compute_sentiment_score
sid = SentimentIntensityAnalyzer()

def convert_symbols(symbol_list):
    """
    Converts the symbols in the given list from 'BTCUSD' and 'USDTUSD' format
    to 'BTC-USD' and 'USDT-USD' format.

    Args:
        symbol_list (list): A list of strings representing the symbols to be converted.

    Returns:
        list: A new list with the symbols converted to the desired format.
    """
    converted_symbols = []
    for symbol in symbol_list:
        # Split into the base currency and the trailing three-letter quote currency
        base_currency = symbol[:-3]
        quote_currency = symbol[-3:]
        # Construct the new symbol in the desired format
        new_symbol = f"{base_currency}-{quote_currency}"
        converted_symbols.append(new_symbol)
    return converted_symbols
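
# Illustrative usage (not part of the cron flow):
# convert_symbols(['BTCUSD', 'ETHUSD']) -> ['BTC-USD', 'ETH-USD']
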
async def get_news_of_stocks(ticker_list, page):
    ticker_str = ','.join(ticker_list)
    async with aiohttp.ClientSession() as session:
        url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={ticker_str}&page={page}&limit=2000&apikey={api_key}"
        async with session.get(url) as response:
            if response.status == 200:
                return await response.json()
            else:
                return []

async def get_news_of_cryptos(ticker_list, page):
    ticker_str = ','.join(ticker_list)
    async with aiohttp.ClientSession() as session:
        url = f"https://financialmodelingprep.com/api/v4/crypto_news?tickers={ticker_str}&page={page}&limit=2000&apikey={api_key}"
        async with session.get(url) as response:
            if response.status == 200:
                return await response.json()
            else:
                return []
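
# Illustrative usage (hypothetical tickers; requires a valid FMP_API_KEY and a running event loop):
# news = asyncio.run(get_news_of_stocks(['AAPL', 'MSFT'], page=0))
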
def remove_duplicates(data, key):
    seen = set()
    new_data = []
    for item in data:
        if item[key] not in seen:
            seen.add(item[key])
            new_data.append(item)
    return new_data
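
# Illustrative usage: keep only the first article seen per publishedDate.
# remove_duplicates([{'publishedDate': 'd1', 'title': 'a'},
#                    {'publishedDate': 'd1', 'title': 'b'}], 'publishedDate')
# returns [{'publishedDate': 'd1', 'title': 'a'}]
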
def adjust_scaled_score(scaled_score):
    # Add a small random jitter of -2..+2 to the scaled score
    adjustment = random.choice([-2, -1, 0, 1, 2])
    scaled_score += adjustment
    # Clamp the scaled score to the range 0 to 10
    scaled_score = max(0, min(10, scaled_score))
    return scaled_score
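
# For example, a score of 9 jittered by +2 is clamped back to 10,
# and a score of 1 jittered by -2 is clamped to 0.
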
def compute_sentiment_score(sentence):
    # Compute sentiment polarity using TextBlob (VADER alternative kept for reference)
    # sentiment_score = sid.polarity_scores(sentence)['compound']
    sentiment_score = TextBlob(sentence).sentiment.polarity
    # Scale the sentiment score to range from 0 to 10
    scaled_score = (sentiment_score + 1) * 5  # Map from [-1, 1] to [0, 10]
    return scaled_score
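
# Worked example of the scaling: a TextBlob polarity of 0.2 maps to
# (0.2 + 1) * 5 = 6.0; neutral text (0.0) maps to 5.0; fully negative (-1.0) maps to 0.0.
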
def get_sentiment(symbol, res_list, is_crypto=False):
    # FMP returns different timestamp formats for crypto and stock news
    if is_crypto:
        time_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    else:
        time_format = '%Y-%m-%d %H:%M:%S'
    end_date = datetime.now().date()
    sentiment_scores_by_period = {}
    # Look-back windows in days (note: 'oneWeek' spans 10 days here)
    for time_period, days in {'oneWeek': 10, 'oneMonth': 30, 'threeMonth': 90, 'sixMonth': 180, 'oneYear': 365}.items():
        start_date = end_date - timedelta(days=days)
        # Parse each article's date once and keep only items inside the window
        period_items = [item for item in res_list if start_date <= datetime.strptime(item['publishedDate'], time_format).date() <= end_date]
        title_data = [item['title'] for item in period_items]
        text_data = [item['text'] for item in period_items]
        sentiment_scores_title = [compute_sentiment_score(sentence) for sentence in title_data]
        if sentiment_scores_title:  # Handle case when sentiment_scores is empty
            average_sentiment_title_score = round(sum(sentiment_scores_title) / len(sentiment_scores_title))
        else:
            average_sentiment_title_score = 0
        sentiment_scores_text = [compute_sentiment_score(sentence) for sentence in text_data]
        if sentiment_scores_text:  # Handle case when sentiment_scores is empty
            average_sentiment_text_score = round(sum(sentiment_scores_text) / len(sentiment_scores_text))
        else:
            average_sentiment_text_score = 0
        sentiment_scores_by_period[time_period] = adjust_scaled_score(round((average_sentiment_title_score + average_sentiment_text_score) / 2))
    label_mapping = {'oneWeek': '1W', 'oneMonth': '1M', 'threeMonth': '3M', 'sixMonth': '6M', 'oneYear': '1Y'}
    result = [{'label': label_mapping[key], 'value': value} for key, value in sentiment_scores_by_period.items()]
    # Only persist symbols that produced at least one non-zero score
    if any(item['value'] != 0 for item in result):
        if is_crypto:
            symbol = symbol.replace('-', '')  # Convert back from BTC-USD to BTCUSD
        with open(f"json/sentiment-analysis/{symbol}.json", 'w') as file:
            ujson.dump(result, file)
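
# Illustrative output (hypothetical values), e.g. json/sentiment-analysis/BTCUSD.json:
# [{"label": "1W", "value": 6}, {"label": "1M", "value": 5}, {"label": "3M", "value": 4},
#  {"label": "6M", "value": 5}, {"label": "1Y", "value": 6}]
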
async def run():
    con = sqlite3.connect('stocks.db')
    etf_con = sqlite3.connect('etf.db')
    crypto_con = sqlite3.connect('crypto.db')

    cursor = con.cursor()
    cursor.execute("PRAGMA journal_mode = wal")
    cursor.execute("SELECT DISTINCT symbol FROM stocks")
    stocks_symbols = [row[0] for row in cursor.fetchall()]

    etf_cursor = etf_con.cursor()
    etf_cursor.execute("PRAGMA journal_mode = wal")
    etf_cursor.execute("SELECT DISTINCT symbol FROM etfs")
    etf_symbols = [row[0] for row in etf_cursor.fetchall()]

    crypto_cursor = crypto_con.cursor()
    crypto_cursor.execute("PRAGMA journal_mode = wal")
    crypto_cursor.execute("SELECT DISTINCT symbol FROM cryptos")
    crypto_symbols = [row[0] for row in crypto_cursor.fetchall()]

    con.close()
    etf_con.close()
    crypto_con.close()

    # Crypto symbols are few enough to fetch in one batch (chunking not necessary at the moment)
    res_list = []
    for page in tqdm(range(0, 100)):
        data = await get_news_of_cryptos(crypto_symbols, page)
        if len(data) == 0:
            break
        else:
            res_list += data

    crypto_symbols = convert_symbols(crypto_symbols)  # News articles use the BTC-USD symbol format
    for symbol in crypto_symbols:
        filtered_ticker = [item for item in res_list if item['symbol'] == symbol]
        filtered_ticker = remove_duplicates(filtered_ticker, 'publishedDate')
        get_sentiment(symbol, filtered_ticker, is_crypto=True)

    total_symbols = stocks_symbols + etf_symbols
    # Divide the list into roughly 70 chunks; guard against a zero chunk size for short lists
    chunk_size = max(1, len(total_symbols) // 70)
    chunks = [total_symbols[i:i + chunk_size] for i in range(0, len(total_symbols), chunk_size)]
    for chunk in tqdm(chunks):
        res_list = []
        for page in tqdm(range(0, 100)):
            data = await get_news_of_stocks(chunk, page)
            if len(data) == 0:
                break
            else:
                res_list += data
        for symbol in chunk:
            filtered_ticker = [item for item in res_list if item['symbol'] == symbol]
            filtered_ticker = remove_duplicates(filtered_ticker, 'publishedDate')
            get_sentiment(symbol, filtered_ticker, is_crypto=False)

try:
    asyncio.run(run())
except Exception as e:
    print(e)