# 1. Install and Import Baseline Dependencies
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, pipeline
from bs4 import BeautifulSoup
import requests
import re
import csv
import streamlit as st

# Basic Streamlit page scaffold with a placeholder slider demo.
st.title('Stocks Analysis Machine')
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)
# 2. Setup Model
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
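# Quick smoke test (illustrative sketch, not part of the original pipeline;
# the sample headline below is invented) to confirm the checkpoint loaded:
sample = "Ethereum climbed 5% on Tuesday as institutional investors added to their holdings."
sample_ids = tokenizer.encode(sample, return_tensors="pt", truncation=True)
sample_output = model.generate(sample_ids, max_length=55, num_beams=5, early_stopping=True)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))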
# 3. Setup Pipeline
# Tickers to monitor; add more symbols here to track additional assets.
monitored_tickers = ['ETH']
# 4.1. Search for Stock News using Google and Yahoo Finance
print('Searching for stock news for', monitored_tickers)

def search_for_stock_news_links(ticker):
    search_url = 'https://www.google.com/search?q=yahoo+finance+{}&tbm=nws'.format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    # .get() avoids a KeyError on anchor tags that have no href attribute.
    hrefs = [link.get('href') for link in atags if link.get('href')]
    return hrefs

raw_urls = {ticker: search_for_stock_news_links(ticker) for ticker in monitored_tickers}
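# Note (my assumption, not in the original script): Google sometimes blocks or
# serves a consent page to requests without a browser-like User-Agent, which
# leaves raw_urls empty. A minimal sketch of a more robust request:
#   headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
#   r = requests.get(search_url, headers=headers, timeout=10)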
# 4.2. Strip out unwanted URLs
print('Cleaning URLs.')
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exc in url for exc in exclude_list):
            # Pull the first absolute URL out of Google's redirect wrapper
            # and drop the tracking parameters after the first '&'.
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

cleaned_urls = {ticker: strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
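# Illustrative usage (hypothetical input): Google wraps results in redirect
# URLs, and the cleaner reduces them to the bare article link.
example = ['/url?q=https://finance.yahoo.com/news/example-article&sa=U&ved=abc',
           'https://www.google.com/preferences']
print(strip_unwanted_urls(example, exclude_list))
# -> ['https://finance.yahoo.com/news/example-article']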
# 4.3. Search and Scrape Cleaned URLs
print('Scraping news links.')

def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        results = soup.find_all('p')
        text = [res.text for res in results]
        # Keep the first 350 words so each article fits comfortably within
        # the summarization model's input limit.
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
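# Sketch (my addition, not in the original): when scraping more than a handful
# of links, pausing between requests is kinder to the target sites and less
# likely to trigger rate limiting:
#   import time
#   time.sleep(1)  # inside the loop, after each requests.get(url)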
# 4.4. Summarize all Articles
print('Summarizing articles.')

def summarize(articles):
    summaries = []
    for article in articles:
        # truncation=True guards against inputs longer than the model's
        # maximum sequence length.
        input_ids = tokenizer.encode(article, return_tensors="pt", truncation=True)
        # Beam search with a 55-token cap keeps each summary to roughly a sentence.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

summaries = {ticker: summarize(articles[ticker]) for ticker in monitored_tickers}
# 5. Adding Sentiment Analysis
print('Calculating sentiment.')
# No model is specified, so the pipeline falls back to the library's default
# English sentiment checkpoint (a DistilBERT fine-tuned on SST-2 at the time of writing).
sentiment = pipeline("sentiment-analysis")
scores = {ticker: sentiment(summaries[ticker]) for ticker in monitored_tickers}
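# The pipeline returns one dict per input string, which is what step 6 indexes
# by 'label' and 'score'. Illustrative shape (values made up):
#   scores['ETH'] -> [{'label': 'NEGATIVE', 'score': 0.9837}, ...]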
# 6. Exporting Results
print('Exporting results.')

def create_output_array(summaries, scores, urls):
    # Flatten the per-ticker dicts into one row per summarized article.
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

final_output = create_output_array(summaries, scores, cleaned_urls)
final_output.insert(0, ['Ticker', 'Summary', 'Sentiment', 'Sentiment Score', 'URL'])
with open('ethsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)
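# Sketch (my addition, not in the original): since this file runs as a
# Streamlit app, the same rows can be rendered in the page. Assumes pandas is
# available (it ships as a Streamlit dependency).
import pandas as pd
df = pd.DataFrame(final_output[1:], columns=final_output[0])
st.dataframe(df)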