CoUL-document-search / src /embeddings_search.py
wbrooks's picture
removed debugging messages now that search is working
b6127b6
# This script defines functions that search the corpus for blocks that are similar to the query.
# Loading the corpus embeddings had to be changed for deployment in production because
# the CSVs took too much space for the free tier of HuggingFace Spaces.
# import packages
import numpy as np
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import glob
from collections.abc import Callable
import os
def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Calculate the cosine similarity of the query to each block of text from the corpus
    and rank the files by their best-matching block.

    Parameters:
    query (str): Text of the query to search for in the documents.
    corpus_embeddings_df (polars.DataFrame): One row per block of text. Must contain the metadata columns `file` and `doc_block_indx`; every other column is treated as an embedding feature.
    model (sentence_transformers.SentenceTransformer): The model used to encode the query.

    Returns:
    polars.DataFrame: One row per file with the columns `file`, `score` (the highest cosine similarity among that file's blocks) and `rank-sbert` (1-based rank, best match first).
    """
    metadata_columns = ['file', 'doc_block_indx']

    # Pass the target shape positionally: the `shape=` keyword of np.reshape only
    # exists in recent NumPy releases, while the positional form works everywhere.
    query_embeddings = np.reshape(model.encode(query), (1, -1))

    # Everything that is not metadata is an embedding feature.
    corpus_matrix = corpus_embeddings_df.select(pl.exclude(metadata_columns)).to_numpy()
    sbert_scores = cosine_similarity(query_embeddings, corpus_matrix)

    # Score every block, then keep only the best block score for each file.
    best_score_per_file = (
        pl.DataFrame(
            {
                'score': np.reshape(sbert_scores, -1),
                'file': corpus_embeddings_df['file'],
                'doc_block_indx': corpus_embeddings_df['doc_block_indx'],
            }
        )
        .group_by("file")
        .agg(pl.col("score").max())
    )

    # sort the results, attach the 1-based rank and return
    ranked_df = best_score_per_file.sort("score", descending=True)
    ranks = list(range(1, ranked_df.height + 1))
    return ranked_df.with_columns(pl.Series("rank-sbert", ranks))
def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Build a single-argument search function bound to a fixed corpus and model.

    Parameters:
    corpus_embeddings_df (polars.DataFrame): Embeddings of the corpus blocks, as expected by `sbert_query`.
    model (sentence_transformers.SentenceTransformer): The model used to estimate embeddings.

    Returns:
    Callable[[str], pl.DataFrame]: Function that takes a query string and returns the result of `sbert_query` for that query against the bound corpus and model.
    """

    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Search the bound corpus for the query.

        Parameters:
        query (str): The query with which to search the corpus.

        Returns:
        polars.DataFrame: Corpus documents ranked by their match to the query.
        """
        # The corpus and the model are captured from the enclosing scope.
        ranked_matches = sbert_query(
            query=query,
            corpus_embeddings_df=corpus_embeddings_df,
            model=model,
        )
        return ranked_matches

    return do_sbert_query
def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

    Parameters:
    embeddings_dir (str): Directory to search. Every entry in it whose name starts with `block-embeddings` is read as a CSV.

    Returns:
    polars.DataFrame: Data frame of the vector space embeddings for all documents in the corpus. Size is (paragraphs, features) plus two columns of metadata (`file` and `doc_block_indx` [aka within-document paragraph index].)

    Raises:
    ValueError: If no file matching the pattern is found in `embeddings_dir`.
    """
    # import the block embeddings
    pattern = os.path.join(embeddings_dir, "block-embeddings") + "*"

    # glob returns matches in arbitrary order; sort so the row order of the
    # concatenated frame is the same on every run and every filesystem.
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail with a message that names the pattern instead of letting the
        # concatenation of an empty list fail with an unrelated-looking error.
        raise ValueError(f"No embeddings files found matching {pattern!r}")

    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))

    return pl.concat(block_embeddings_list, how='vertical')
def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.
    The corpus embeddings are loaded from the CSV files in a directory.

    Parameters:
    model_name (str): Name of model used to calculate embeddings.
    embeddings_dir (str): Directory holding the corpus embeddings CSVs; passed to `load_embeddings_dfs`.
    device (str): Device on which to do the calculations.

    Returns:
    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device,
    # rather than loading it on the auto-detected device and moving it afterwards.
    sentence_model = SentenceTransformer(model_name, device=device)

    # import the embeddings CSVs
    block_embeddings_df = load_embeddings_dfs(embeddings_dir)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)
def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.
    The corpus embeddings are loaded from a single Parquet file.

    Parameters:
    model_name (str): Name of model used to calculate embeddings.
    embeddings_df_path (str): Path of the Parquet file holding the corpus embeddings data frame.
    device (str): Device on which to do the calculations.

    Returns:
    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device,
    # rather than loading it on the auto-detected device and moving it afterwards.
    sentence_model = SentenceTransformer(model_name, device=device)

    # import the embeddings data frame from the Parquet file
    block_embeddings_df = pl.read_parquet(embeddings_df_path)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)