Spaces:
Runtime error
Runtime error
| import re | |
| import nltk | |
| from typing import List | |
| from transformers import pipeline | |
| from tqdm import tqdm | |
| import numpy as np | |
| import numpy as np | |
| import scipy | |
| from transformers import AutoModelForSequenceClassification | |
| from transformers import TFAutoModelForSequenceClassification | |
| from transformers import AutoTokenizer | |
| from scipy.special import softmax | |
| import os | |
def tweet_cleaner(tweet: str) -> str:
    """
    Strip URLs from a tweet and normalise its whitespace.

    Only ``http:`` / ``https:`` links are removed; @ mentions and hashtags are
    left untouched.

    Args:
        tweet (str): A single tweet. Any other value is converted with
            ``str()`` first.
    Returns:
        str: The tweet without URLs, with every run of whitespace (newlines
        included) collapsed to a single space and no leading or trailing
        whitespace. If the value cannot be converted to a string, the error
        is printed and the value is returned unchanged.
    """
    if not isinstance(tweet, str):
        try:
            tweet = str(tweet)
        except Exception as e:
            # Best effort: report the problem and hand the value back as-is.
            print(f"Error converting tweet to string: {e}")
            return tweet
    # Substitute a space (not the empty string) for each URL so the words on
    # either side of a link are never glued together; the split/join below
    # collapses whatever extra whitespace that leaves behind.
    tweet = re.sub(r"https?:\S+", " ", tweet)
    return " ".join(tweet.split())
def is_boring_tweet(tweet):
    """
    Tell whether a tweet has too little real content to be interesting.

    A whitespace-separated word counts as content unless its lower-cased form
    contains "http", "@" or "#". The tweet is boring when it has fewer than
    three such words.
    """
    markers = ["http", "@", "#"]
    content_words = 0
    for token in tweet.split():
        lowered = token.lower()
        if all(marker not in lowered for marker in markers):
            content_words += 1
    return content_words < 3
def fix_text(text):
    """
    Decode the HTML entities ``&amp;``, ``&lt;`` and ``&gt;`` in a text.

    Args:
        text (str): Text that may contain the three escaped entities.
    Returns:
        str: The text with those entities replaced by ``&``, ``<`` and ``>``.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    # "&amp;" is decoded last so that a doubly-escaped sequence such as
    # "&amp;lt;" becomes the literal text "&lt;" instead of being decoded twice.
    return text.replace("&amp;", "&")
# Model used by both the hosted Inference API path and the local path.
_SENTIMENT_MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

# Readable names for the raw class ids returned by the hosted model.
_API_LABEL_NAMES = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
}

# (tokenizer, model) pairs already loaded in this process, keyed by model name,
# so repeated calls do not reload the weights.
_LOADED_SENTIMENT_MODELS = {}


def _load_sentiment_model(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it once."""
    if model_name not in _LOADED_SENTIMENT_MODELS:
        _LOADED_SENTIMENT_MODELS[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSequenceClassification.from_pretrained(model_name),
        )
    return _LOADED_SENTIMENT_MODELS[model_name]


def _score_tweets_locally(tweets, return_argmax):
    """Score every tweet with the locally loaded model, after URL cleaning."""
    labels = ["negative", "neutral", "positive"]
    tokenizer, model = _load_sentiment_model(_SENTIMENT_MODEL_NAME)
    results = []
    for tweet in tqdm(tweets):
        encoded_input = tokenizer(tweet_cleaner(tweet), return_tensors="pt")
        logits = model(**encoded_input)[0][0].detach().numpy()
        scores = softmax(logits)
        # Insert labels from the most to the least confident one.
        sentiment = {
            labels[idx]: np.round(float(scores[idx]), 4)
            for idx in np.argsort(scores)[::-1]
        }
        if return_argmax:
            sentiment["argmax"] = max(sentiment, key=sentiment.get)
        results.append(sentiment)
    return results


def _score_tweets_with_api(tweets, return_argmax):
    """Score the tweets through the hosted Inference API (tweets sent as-is)."""
    import requests

    api_url = f"https://api-inference.huggingface.co/models/{_SENTIMENT_MODEL_NAME}"
    # NOTE(review): a token embedded in source is a leaked credential; it is
    # kept only as a fallback so existing deployments keep working. Prefer the
    # HF_TOKEN environment variable and rotate the embedded token.
    token = os.environ.get("HF_TOKEN", "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")
    response = requests.post(
        api_url,
        headers={"Authorization": f"Bearer {token}"},
        json={"inputs": tweets},
    )
    output = response.json()
    # A successful call yields one list of label/score dicts per tweet. Anything
    # else (e.g. an error object) would otherwise be iterated into meaningless
    # empty results, so fail loudly instead.
    if not isinstance(output, list):
        raise RuntimeError(f"Unexpected response from the sentiment API: {output!r}")

    result = []
    for tweet_scores in output:
        sentiment = {}
        for entry in tweet_scores:
            if isinstance(entry, dict) and entry.get("label") in _API_LABEL_NAMES:
                sentiment[_API_LABEL_NAMES[entry["label"]]] = entry["score"]
        if return_argmax and sentiment:
            sentiment["argmax"] = max(sentiment, key=sentiment.get)
        result.append(sentiment)
    return result


def twitter_sentiment_api_score(
    tweet_list: list = None, return_argmax: bool = True, use_api=False
):
    """
    Compute sentiment scores for a list of tweets with the
    cardiffnlp/twitter-roberta-base-sentiment model.

    Args:
        tweet_list (list): A list of strings, where each string represents a tweet.
            A single string is treated as one tweet; None or an empty list
            yields an empty result without loading the model or calling the API.
        return_argmax (bool): Whether to also return the predicted sentiment label
            with the highest confidence score for each tweet.
        use_api (bool): If True, send the tweets to the Hugging Face Inference API;
            otherwise run the model locally. Only the local path cleans each
            tweet with `tweet_cleaner` first.
    Returns:
        A list of dictionaries, one per tweet, with the keys "positive", "neutral"
        and "negative" mapped to confidence scores. If `return_argmax` is True,
        each dictionary also contains the key "argmax" holding the label with the
        highest score.
    Raises:
        RuntimeError: If `use_api` is True and the API response is not a list of
            per-tweet results.
    """
    if tweet_list is None:
        return []
    if isinstance(tweet_list, str):
        # Iterating a bare string would score it character by character.
        tweet_list = [tweet_list]
    # Materialise once: supports any iterable and gives the API a JSON list.
    tweets = list(tweet_list)
    if not tweets:
        return []
    if use_api:
        return _score_tweets_with_api(tweets, return_argmax)
    return _score_tweets_locally(tweets, return_argmax)