# This script defines functions that search the corpus for blocks that are similar to the query.
# Loading embeddings of the query had to be changed for deployment in production because
# my CSVs took too much space for the free tier of HuggingFace spaces.

# import packages
import numpy as np
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import glob
from collections.abc import Callable
import os


def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Calculate the cosine similarity of the query to each block of text from the corpus.

    Each file is scored by its best-matching block, and files are ranked by that score.

    Parameters:
        query (str): Text of the query to search for in the documents.
        corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each block in the
            corpus, one row per block, plus the metadata columns `file` and `doc_block_indx`.
        model (sentence_transformers.SentenceTransformer): The model used to encode the sentences.

    Returns:
        polars.DataFrame: One row per file with columns `file`, `score` (maximum cosine similarity over
            the file's blocks) and `rank-sbert` (1-based rank, best match first).
    """
    # Encode the query as a single row. The positional form of reshape is used because the
    # `shape=` keyword of np.reshape only exists in recent NumPy releases.
    query_embedding = np.asarray(model.encode(query)).reshape(1, -1)

    # Everything except the two metadata columns is an embedding feature. Convert explicitly to
    # NumPy so that scikit-learn does not have to know how to ingest a polars DataFrame.
    corpus_matrix = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])).to_numpy()

    # cosine_similarity returns shape (1, n_blocks); flatten it to one score per block.
    block_scores = cosine_similarity(query_embedding, corpus_matrix).reshape(-1)

    # Keep the best block score per file, then order the files from best to worst.
    ranked_df = (
        pl.DataFrame({'score': block_scores, 'file': corpus_embeddings_df['file']})
        .group_by("file")
        .agg(pl.col("score").max())
        .sort("score", descending=True)
    )

    # Attach the 1-based rank that corresponds to the sorted order.
    return ranked_df.with_columns(pl.Series("rank-sbert", list(range(1, ranked_df.height + 1))))


def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    Parameters:
        corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each block in the
            corpus, plus the metadata columns `file` and `doc_block_indx`.
        model (sentence_transformers.SentenceTransformer): The model used to estimate embeddings.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return
            results sorted by the cosine similarity.
    """

    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Compare the query to the corpus.

        Parameters:
            query (str): The query with which to search the corpus.

        Returns:
            polars.DataFrame: Corpus documents ranked by their match to the query.
        """
        # The corpus embeddings and the model are captured from the enclosing factory call.
        return sbert_query(query, corpus_embeddings_df, model)

    return do_sbert_query


def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

    Parameters:
        embeddings_dir (str): Directory to search. Every file in it whose name starts with
            `block-embeddings` is read as a CSV.

    Returns:
        polars.DataFrame: Data frame of the vector space embeddings for all documents in the corpus.
            Size is (paragraphs, features) plus two columns of metadata (`file` and `doc_block_indx`
            [aka within-document paragraph index].)

    Raises:
        ValueError: If no file in `embeddings_dir` matches the `block-embeddings*` pattern.
    """
    # import the block embeddings
    pattern = os.path.join(embeddings_dir, "block-embeddings") + "*"

    # glob returns matches in arbitrary order; sort so the row order is the same on every run.
    files = sorted(glob.glob(pattern))
    if not files:
        # pl.concat would also raise ValueError on an empty list, but without saying where we looked.
        raise ValueError(f"No embeddings files found matching: {pattern}")

    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))

    return pl.concat(block_embeddings_list, how='vertical')


def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The corpus embeddings are loaded from the CSV files in a directory.

    Parameters:
        model_name (str): Name of model used to calculate embeddings.
        embeddings_dir (str): Directory holding the `block-embeddings*` CSV files.
        device (str): Device on which to do the calculations.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return
            results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device:
    sentence_model = SentenceTransformer(model_name, device=device)

    # import the embeddings CSVs
    block_embeddings_df = load_embeddings_dfs(embeddings_dir)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)


def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The corpus embeddings are loaded from a single Parquet file.

    Parameters:
        model_name (str): Name of model used to calculate embeddings.
        embeddings_df_path (str): Path of the Parquet file that holds the embeddings data frame.
        device (str): Device on which to do the calculations.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return
            results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device:
    sentence_model = SentenceTransformer(model_name, device=device)

    # import the embeddings data frame from Parquet
    block_embeddings_df = pl.read_parquet(embeddings_df_path)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)