File size: 6,010 Bytes
68fd999
 
 
 
c795cd4
 
 
 
68fd999
 
 
 
c795cd4
 
68fd999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c795cd4
68fd999
 
 
 
 
 
6b6def4
68fd999
 
6b6def4
 
68fd999
 
 
6b6def4
68fd999
 
 
6b6def4
68fd999
 
 
6b6def4
68fd999
 
 
c795cd4
68fd999
 
 
 
 
 
bd1c23b
 
 
c795cd4
 
68fd999
 
 
c795cd4
68fd999
 
c795cd4
68fd999
 
 
c795cd4
68fd999
 
c795cd4
68fd999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c795cd4
68fd999
 
c795cd4
68fd999
 
c795cd4
 
 
68fd999
 
 
c795cd4
68fd999
 
 
 
c795cd4
68fd999
 
c795cd4
68fd999
b6127b6
68fd999
 
b6127b6
68fd999
 
b6127b6
68fd999
 
c795cd4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# This script defines functions that search the corpus for blocks that are similar to the query.
# Loading embeddings of the query had to be changed for deployment in production because
# my CSVs took too much space for the free tier of HuggingFace spaces.

# import packages
import numpy as np
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import glob
from collections.abc import Callable
import os


def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Calculate the cosine similarity of the query to each block of text from the corpus.

    Each corpus file may contribute several blocks; a file's score is the maximum
    similarity over its blocks, so one strong matching paragraph is enough to rank
    that file highly.

    Parameters:
        query (str): Text of the query to search for in the documents.
        corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each document in the corpus in the shape (documents, features), plus `file` and `doc_block_indx` metadata columns.
        model (sentence_transformers.SentenceTransformer): The model used to encode the sentences.

    Returns:
        polars.DataFrame: One row per file with columns `score`, `file`, and `rank-sbert`, sorted by descending score.
    """
    # Use the ndarray.reshape method rather than np.reshape(..., shape=...):
    # the `shape` keyword only exists in NumPy >= 2.1 (older versions spell it
    # `newshape`), so the method form is portable across NumPy versions.
    query_embeddings = model.encode(query).reshape(1, -1)

    # Similarity of the query against every block (metadata columns excluded).
    sbert_scores = cosine_similarity(
        query_embeddings,
        corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx']))
    )

    # Collapse block-level scores to one score per file: the best-matching block wins.
    # (`doc_block_indx` is not needed here — the aggregation would drop it anyway.)
    scores_df = pl.DataFrame(
        {
            'score': sbert_scores.ravel(),
            'file': corpus_embeddings_df['file']
        }).group_by("file").agg(pl.col("score").max())

    # sort the results, attach a 1-based rank column, and return
    return scores_df.sort("score", descending = True).with_columns(
        pl.Series("rank-sbert", [i + 1 for i in range(scores_df.shape[0])]))


def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Build a single-argument search function bound to a fixed corpus and model.

    Parameters:
        corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each document in the corpus in the shape (documents, features).
        model (sentence_transformers.SentenceTransformer): The model used to estimate embeddings.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """

    def search(query: str) -> pl.DataFrame:
        """
        Rank the corpus documents against the given query.

        Parameters:
            query (str): The query with which to search the corpus.

        Returns:
            polars.DataFrame: Corpus documents ranked by their match to the query.
        """
        # Delegate to sbert_query with the corpus and model captured by closure.
        return sbert_query(query, corpus_embeddings_df, model)

    return search


def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

    Parameters:
        embeddings_dir (str): Directory containing the per-chunk embeddings CSVs. Files are matched with the pattern `<embeddings_dir>/block-embeddings*`.

    Returns:
        polars.DataFrame: Data frame of the vector space embeddings for all documents in the corpus. Size is (paragraphs, features) plus two columns of metadata (`file` and `doc_block_indx` [aka within-document paragraph index].)

    Raises:
        FileNotFoundError: If no files match the pattern — previously this fell through to `pl.concat([])`, which raises an opaque error.
    """

    # Find the block-embeddings CSVs; sort so the load (and hence row) order is
    # deterministic across platforms — glob.glob makes no ordering guarantee.
    pattern = os.path.join(embeddings_dir, "block-embeddings") + "*"
    files = sorted(glob.glob(pattern))

    if not files:
        raise FileNotFoundError(f"No embeddings files matched pattern: {pattern}")

    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))

    return pl.concat(block_embeddings_list, how = 'vertical')



def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    Parameters:
        model_name (str): Name of the sentence-transformer model used to calculate embeddings.
        embeddings_dir (str): Directory containing the per-chunk embeddings CSVs, loaded via `load_embeddings_dfs`.
        device (str): Device on which to do the calculations (e.g. "cpu").

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.

    """
    # Instantiate the sentence-transformer model on the requested device:
    sentence_model = SentenceTransformer(model_name).to(device = device)

    # import the embeddings CSVs
    block_embeddings_df = load_embeddings_dfs(embeddings_dir)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df = block_embeddings_df, model = sentence_model)



def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    Unlike `create_embeddings_search_function`, this variant loads the corpus
    embeddings from a single Parquet file rather than a directory of CSVs
    (Parquet keeps the deployment footprint small — see the module header).

    Parameters:
        model_name (str): Name of the sentence-transformer model used to calculate embeddings.
        embeddings_df_path (str): Path to the Parquet file holding the corpus embeddings DataFrame, shape (documents, features) plus metadata columns.
        device (str): Device on which to do the calculations (e.g. "cpu").

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.

    """

    # Instantiate the sentence-transformer model on the requested device:
    sentence_model = SentenceTransformer(model_name).to(device = device)

    # import the embeddings from the single Parquet file
    block_embeddings_df = pl.read_parquet(embeddings_df_path)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df = block_embeddings_df, model = sentence_model)