# This script defines functions that search the corpus for blocks of text that are similar to the query.
# Loading of the query embeddings had to be changed for deployment in production because
# the original CSVs took too much space for the free tier of HuggingFace Spaces.
import polars as pl
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import hf_hub_download
import numpy as np
from numpy.typing import NDArray
from joblib import load
import scipy.special
import fasttext
from collections.abc import Callable
def query_worker(
    query: str,
    rownames: list[str],
    fasttext_model: fasttext.FastText._FastText,
    idf: NDArray[np.float64],
    dtm_svd: TruncatedSVD,
    dtm_svd_mat: NDArray[np.float64],
    vocab_norm: NDArray[np.float64],
    concentration: float = 10,
) -> pl.DataFrame:
    """
    Calculate the cosine similarity of the query to each block of text from the corpus.

    Parameters:
        query (str): Search query.
        rownames (list[str]): Names of the corpus files, one per row of `dtm_svd_mat`.
        fasttext_model (fasttext.FastText._FastText): Loaded fastText model used to embed the query terms.
        idf (numpy.ndarray): Inverse document frequencies of the vocabulary terms, shaped (1, n_terms).
        dtm_svd (sklearn.decomposition.TruncatedSVD): Fitted SVD that projects TF-IDF vectors onto the pseudo-topics.
        dtm_svd_mat (numpy.ndarray): SVD projection of the corpus document-term matrix, one row per document.
        vocab_norm (numpy.ndarray): Row-normalized embeddings of the vocabulary terms.
        concentration (float): Softmax temperature; larger values concentrate each query term's weight on its nearest vocabulary terms.

    Returns:
        polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
    """
    # Embed each term of the query:
    query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
    # Normalize rows so that dot products are cosine similarities:
    query_norm = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    # Cosine similarity of each query term to each vocabulary term:
    query_similarities = np.dot(query_norm, vocab_norm.T)
    # Soft-assign each query term to the vocabulary and weight by IDF,
    # giving one pseudo TF-IDF row per query term:
    query_tfidf = idf * scipy.special.softmax(query_similarities * concentration, axis=1)
    # Average the query terms' weights in pseudo-topic space:
    query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
    # Cosine similarity of the query to every document, in pseudo-topic space:
    mean_query_score = cosine_similarity(np.reshape(query_weights, (1, -1)), dtm_svd_mat).ravel()
    sorted_df = (
        pl.DataFrame(
            {
                "score-tfidf": mean_query_score,
                "file": rownames,
            }
        )
        .sort("score-tfidf", descending=True)
        .with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
    )
    # Return the sorted results:
    return sorted_df
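# A worked example of the soft term matching above (illustrative numbers, not real
# corpus values): if one query term has cosine similarities [0.9, 0.2, 0.1] to a
# three-term vocabulary, then with concentration = 10 we get
# softmax([9.0, 2.0, 1.0]) ~= [0.999, 0.001, 0.000], so nearly all of that term's
# weight lands on its nearest vocabulary entry; with concentration = 1 the same
# similarities give ~[0.51, 0.26, 0.23], a much softer assignment.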
def query_factory(
    rownames: list[str],
    fasttext_model: fasttext.FastText._FastText,
    idf: NDArray[np.float64],
    dtm_svd: TruncatedSVD,
    dtm_svd_mat: NDArray[np.float64],
    vocab_norm: NDArray[np.float64],
    concentration: float = 10,
) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that will compare query text to the documents in the corpus.

    Parameters are the same as for `query_worker`, minus `query`; they are captured
    in a closure so that callers only have to supply the query string.
    """
    def do_query(query: str) -> pl.DataFrame:
        """
        Call the worker that compares the query term distribution to the documents in the corpus.

        Parameters:
            query (str): Text to compare to the documents.

        Returns:
            polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
        """
        return query_worker(query, rownames, fasttext_model, idf, dtm_svd, dtm_svd_mat, vocab_norm, concentration)

    return do_query
def create_tfidf_search_function(
    dtm_df_path: str,
    vectorizer_path: str,
    model_name: str = "facebook/fasttext-en-vectors",
) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares the word distribution in a query to each document in the corpus.

    Parameters:
        dtm_df_path (str): Path to a TF-IDF document-term matrix (DTM) for the corpus in parquet format.
        vectorizer_path (str): Path to the saved vectorizer that generated the DTM saved at `dtm_df_path`. We expect that the vectorizer was dumped to disk by `joblib`.
        model_name (str): Name of a model on HuggingFace that generates word embeddings (default is 'facebook/fasttext-en-vectors').

    Returns:
        callable: Function that compares the query string to the corpus.
    """
    # Load the fastText model:
    fasttext_model = fasttext.load_model(hf_hub_download(model_name, "model.bin"))
    # Load the TF-IDF DTM and the vectorizer that produced it:
    my_df = pl.read_parquet(dtm_df_path)
    my_vectorizer = load(vectorizer_path)
    # Vocabulary embeddings:
    my_vocabulary = my_vectorizer.get_feature_names_out()
    vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])
    # Drop terms that have no embeddings in the fastText model:
    keep_terms = np.any(vocab_embeddings != 0, axis=1)
    vocab_embeddings = vocab_embeddings[keep_terms, :]
    my_vocabulary = my_vocabulary[keep_terms]
    # Keep just the IDF weights of the surviving vocabulary, shaped (1, n_terms):
    my_idf = np.reshape(my_vectorizer.idf_[keep_terms], (1, -1))
    # Normalize each embedding vector to unit length:
    vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
    # Get the document-term matrix and project it onto 300 pseudo-topics:
    filenames = my_df["file"].to_list()
    doc_term_mat = my_df.select(pl.exclude(["file"])).to_numpy()[:, keep_terms]
    dtm_svd = TruncatedSVD(n_components=300)
    X_svd = dtm_svd.fit_transform(doc_term_mat)
    return query_factory(
        rownames=filenames,
        fasttext_model=fasttext_model,
        idf=my_idf,
        dtm_svd=dtm_svd,
        dtm_svd_mat=X_svd,
        vocab_norm=vocab_norm,
        concentration=30,
    )
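# A minimal usage sketch. The file names below ("dtm.parquet", "vectorizer.joblib")
# are hypothetical placeholders, not artifacts shipped with this script; substitute
# the outputs of your own indexing step.
if __name__ == "__main__":
    search = create_tfidf_search_function(
        dtm_df_path="dtm.parquet",
        vectorizer_path="vectorizer.joblib",
    )
    # `search` captures the model and matrices in a closure, so repeated queries are cheap:
    results = search("machine learning for text retrieval")
    print(results.head(10))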