Spaces:
Running
Running
| # This script defines functions that search the corpus for blocks that are similar to the query. | |
| # Loading embeddings of the query had to be changed for deployment in production because | |
| # my CSVs took too much space for the free tier of HuggingFace spaces. | |
| # import packages | |
| import numpy as np | |
| import polars as pl | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sentence_transformers import SentenceTransformer | |
| import glob | |
| from collections.abc import Callable | |
| import os | |
def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Rank the files of the corpus by the cosine similarity between the query and their best-matching block.

    Parameters:
        query (str): Text of the query to search for in the documents.
        corpus_embeddings_df (polars.DataFrame): One row per block of text. Must contain a `file` column;
            a `doc_block_indx` column is ignored if present. Every other column is treated as an embedding feature.
        model (sentence_transformers.SentenceTransformer): The model used to encode the query. It must be the
            same model that produced the corpus embeddings for the scores to be meaningful.

    Returns:
        polars.DataFrame: One row per file with the columns `file`, `score` (the maximum cosine similarity
            over the blocks of that file) and `rank-sbert` (1 is the best match), sorted by descending score.
    """
    # Embed the query as a single-row matrix. `ndarray.reshape` is used rather than
    # `np.reshape(..., shape=...)` because the `shape` keyword is only accepted by recent NumPy releases.
    query_embedding = np.asarray(model.encode(query)).reshape(1, -1)

    # Keep only the feature columns and hand scikit-learn a plain NumPy matrix.
    feature_matrix = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])).to_numpy()

    # The result has shape (1, n_blocks); flatten it to one score per block.
    block_scores = cosine_similarity(query_embedding, feature_matrix).reshape(-1)

    # Collapse the block-level scores to one score per file (its best block).
    # `maintain_order=True` keeps the output reproducible when scores are tied.
    file_scores = pl.DataFrame(
        {
            'file': corpus_embeddings_df['file'],
            'score': block_scores,
        }
    ).group_by("file", maintain_order=True).agg(pl.col("score").max())

    # Sort the results and attach a 1-based rank.
    ranked = file_scores.sort("score", descending=True, maintain_order=True)
    ranks = pl.Series("rank-sbert", list(range(1, ranked.height + 1)), dtype=pl.Int64)
    return ranked.with_columns(ranks)
def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Build a single-argument search function bound to a fixed corpus and model.

    Parameters:
        corpus_embeddings_df (polars.DataFrame): Embeddings of the corpus, passed unchanged to `sbert_query`.
        model (sentence_transformers.SentenceTransformer): The model passed unchanged to `sbert_query`.

    Returns:
        Callable[[str], pl.DataFrame]: Function that takes only the query string and returns whatever
            `sbert_query` returns for that query, the bound corpus and the bound model.
    """

    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Search the bound corpus for the query.

        Parameters:
            query (str): The query with which to search the corpus.

        Returns:
            polars.DataFrame: The result of `sbert_query` for this query.
        """
        # The corpus and the model are captured from the enclosing scope.
        return sbert_query(
            query=query,
            corpus_embeddings_df=corpus_embeddings_df,
            model=model,
        )

    return do_sbert_query
def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

    Parameters:
        embeddings_dir (str): Directory to search. Every path in it whose name starts with
            `block-embeddings` is read as a CSV file.

    Returns:
        polars.DataFrame: The loaded files stacked vertically, in sorted path order.
            NOTE(review): the files are presumably (paragraphs, features) embeddings plus the metadata
            columns `file` and `doc_block_indx`; this function does not check the schema.

    Raises:
        ValueError: If no file in `embeddings_dir` matches the `block-embeddings*` pattern.
    """
    pattern = os.path.join(embeddings_dir, "block-embeddings") + "*"

    # glob returns matches in arbitrary order; sort so the row order is reproducible.
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail with the searched pattern instead of the generic error from concatenating nothing.
        raise ValueError(f"No embeddings files found matching {pattern!r}")

    # Import the block embeddings one file at a time.
    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))
    return pl.concat(block_embeddings_list, how='vertical')
def create_embeddings_search_function(model_name: str, embeddings_dir: str, device: str = "cpu") -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The corpus embeddings are loaded from the CSV files in a directory.

    Parameters:
        model_name (str): Name of the model used to calculate embeddings.
        embeddings_dir (str): Directory containing the block-embeddings CSV files, as expected by `load_embeddings_dfs`.
        device (str): Device on which to do the calculations.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device.
    # NOTE(review): passing `device` to the constructor avoids loading the model on the library's
    # auto-selected device first and moving it afterwards with `.to()`.
    sentence_model = SentenceTransformer(model_name, device=device)

    # Import the embeddings CSVs.
    block_embeddings_df = load_embeddings_dfs(embeddings_dir)

    # Call the factory to make the search function and return it.
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)
def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device: str = "cpu") -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The corpus embeddings are loaded from a single Parquet file.

    Parameters:
        model_name (str): Name of the model used to calculate embeddings.
        embeddings_df_path (str): Path of the Parquet file holding the corpus embeddings data frame.
        device (str): Device on which to do the calculations.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device.
    # NOTE(review): passing `device` to the constructor avoids loading the model on the library's
    # auto-selected device first and moving it afterwards with `.to()`.
    sentence_model = SentenceTransformer(model_name, device=device)

    # Import the embeddings data frame from the Parquet file.
    block_embeddings_df = pl.read_parquet(embeddings_df_path)

    # Call the factory to make the search function and return it.
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)