Spaces:

wbrooks
/

CoUL-document-search

Running

App Files Files Community

CoUL-document-search / src /embeddings_search.py

wbrooks

removed debugging messages now that search is working

b6127b6 6 days ago

raw

history blame contribute delete

6.01 kB

	# This script defines functions that search the corpus for blocks that are similar to the query.
	# Loading the corpus embeddings had to be changed for deployment in production (a Parquet
	# file instead of the CSVs) because my CSVs took too much space for the free tier of
	# HuggingFace Spaces.

	# import packages
	import numpy as np
	import polars as pl
	from sklearn.metrics.pairwise import cosine_similarity
	from sentence_transformers import SentenceTransformer
	import glob
	from collections.abc import Callable
	import os


	def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
	    """
	    Rank the files of the corpus by the cosine similarity between the query and their best-matching block.

	    Parameters:
	    query (str): Text of the query to search for in the documents.
	    corpus_embeddings_df (polars.DataFrame): One row per block of text. The columns `file` and `doc_block_indx` are metadata; every other column is used as an embedding feature.
	    model (sentence_transformers.SentenceTransformer): The model used to encode the query.

	    Returns:
	    polars.DataFrame: One row per file with the columns `file`, `score` (the highest cosine similarity among that file's blocks) and `rank-sbert` (1 is the best match), sorted by descending score.
	    """
	    # Pass the target shape positionally: the `shape=` keyword of np.reshape is only
	    # accepted by NumPy >= 2.1, while the positional form works on every release.
	    query_embeddings = np.reshape(model.encode(query), (1, -1))

	    # Everything except the two metadata columns is an embedding feature.
	    corpus_features = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx']))
	    sbert_scores = cosine_similarity(query_embeddings, corpus_features)

	    # Collapse block-level scores to one score per file: the score of its best block.
	    best_per_file = pl.DataFrame(
	        {
	            'score': np.reshape(sbert_scores, -1),
	            'file': corpus_embeddings_df['file'],
	            'doc_block_indx': corpus_embeddings_df['doc_block_indx'],
	        }
	    ).group_by("file").agg(pl.col("score").max())

	    # Ranks are attached after sorting, so rank 1 lands on the highest score.
	    ranks = pl.Series("rank-sbert", list(range(1, best_per_file.shape[0] + 1)))
	    return best_per_file.sort("score", descending=True).with_columns(ranks)


	def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
	    """
	    Build a single-argument search function bound to a fixed corpus and model.

	    Parameters:
	    corpus_embeddings_df (polars.DataFrame): Embeddings of the corpus that every search will be compared against.
	    model (sentence_transformers.SentenceTransformer): The model that every search will use to encode its query.

	    Returns:
	    Callable[[str], pl.DataFrame]: Function that takes a query string and returns the result of `sbert_query` for that query, the bound corpus and the bound model.
	    """

	    def do_sbert_query(query: str) -> pl.DataFrame:
	        """
	        Search the bound corpus for the query.

	        Parameters:
	        query (str): The query with which to search the corpus.

	        Returns:
	        polars.DataFrame: Whatever `sbert_query` returns for this query.
	        """
	        ranked = sbert_query(
	            query=query,
	            corpus_embeddings_df=corpus_embeddings_df,
	            model=model,
	        )
	        return ranked

	    return do_sbert_query


	def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
	    """
	    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

	    Parameters:
	    embeddings_dir (str): Directory to search. Every file in it whose name starts with `block-embeddings` is read as a CSV.

	    Returns:
	    polars.DataFrame: The contents of all matching files, stacked vertically in sorted filename order.

	    Raises:
	    ValueError: If no file in `embeddings_dir` has a name starting with `block-embeddings`.
	    """
	    # glob returns matches in arbitrary order; sort so the rows are stacked in the
	    # same order on every run and every platform.
	    files = sorted(glob.glob(os.path.join(embeddings_dir, "block-embeddings*")))

	    # Fail with a message that names the directory instead of letting the
	    # concatenation of an empty list fail further down.
	    if not files:
	        raise ValueError(f"No files matching 'block-embeddings*' found in directory: {embeddings_dir!r}")

	    block_embeddings_list = []
	    for filename in files:
	        print("Reading:", filename)
	        block_embeddings_list.append(pl.read_csv(filename))

	    return pl.concat(block_embeddings_list, how='vertical')



	def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
	    """
	    Create a function that compares query text to the corpus by matching vector space embeddings.

	    The corpus embeddings are loaded from the CSV files in a directory.

	    Parameters:
	    model_name (str): Name of the model used to calculate embeddings.
	    embeddings_dir (str): Directory holding the CSV files of corpus block embeddings, passed on to `load_embeddings_dfs`.
	    device (str): Device on which to do the calculations.

	    Returns:
	    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.

	    """
	    # Instantiate the sentence-transformer model directly on the requested device,
	    # rather than constructing it on the library's default device and moving it afterwards.
	    sentence_model = SentenceTransformer(model_name, device=device)

	    # import the embeddings CSVs
	    block_embeddings_df = load_embeddings_dfs(embeddings_dir)

	    # call the factory to make the search function and return it
	    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)



	def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
	    """
	    Create a function that compares query text to the corpus by matching vector space embeddings.

	    The corpus embeddings are loaded from a single Parquet file.

	    Parameters:
	    model_name (str): Name of the model used to calculate embeddings.
	    embeddings_df_path (str): Path of the Parquet file holding the corpus block embeddings.
	    device (str): Device on which to do the calculations.

	    Returns:
	    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.

	    """
	    # Instantiate the sentence-transformer model directly on the requested device,
	    # rather than constructing it on the library's default device and moving it afterwards.
	    sentence_model = SentenceTransformer(model_name, device=device)

	    # import the embeddings data frame from the Parquet file
	    block_embeddings_df = pl.read_parquet(embeddings_df_path)

	    # call the factory to make the search function and return it
	    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)