# This script defines functions that search the corpus for blocks that are similar to the query.
# Loading embeddings of the query had to be changed for deployment in production because
# my CSVs took too much space for the free tier of HuggingFace spaces.

# import packages
import numpy as np
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import glob
from collections.abc import Callable
import os


def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Rank the files of the corpus by how well their best-matching block of text matches the query.

    The query is encoded with `model`, compared to every block embedding by cosine similarity,
    and each file is then scored by the maximum similarity among its blocks.

    Parameters:
        query (str): Text of the query to search for in the documents.
        corpus_embeddings_df (polars.DataFrame): One row per block of text. Must contain a `file` column; every column other than `file` and `doc_block_indx` is treated as an embedding feature.
        model (sentence_transformers.SentenceTransformer): The model used to encode the query. Presumably the same model that produced the corpus embeddings -- the code does not verify this.

    Returns:
        polars.DataFrame: One row per file with the columns `file`, `score` (highest cosine similarity among the file's blocks) and `rank-sbert` (1-based rank, best match first), sorted by descending `score`.
    """
    # Pass the target shape positionally: the `shape=` keyword is only accepted by
    # recent NumPy releases, whereas the positional form works on every version.
    query_embeddings = np.reshape(model.encode(query), (1, -1))

    # Everything that is not metadata is an embedding feature.
    feature_columns = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx']))
    sbert_scores = cosine_similarity(query_embeddings, feature_columns.to_numpy())

    # Score each file by its single best block. The block index is not needed
    # for that aggregation, so it is left out of the intermediate frame.
    best_block_per_file = pl.DataFrame(
        {
            'score': np.reshape(sbert_scores, -1),
            'file': corpus_embeddings_df['file'],
        }).group_by("file").agg(pl.col("score").max())

    # sort the results, attach the 1-based rank, and return
    ranked = best_block_per_file.sort("score", descending=True)
    return ranked.with_columns(pl.Series("rank-sbert", list(range(1, ranked.height + 1))))


def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Bind a corpus and a model into a search function that only needs the query text.

    Parameters:
        corpus_embeddings_df (polars.DataFrame): Embeddings of the corpus, forwarded unchanged to `sbert_query` on every call.
        model (sentence_transformers.SentenceTransformer): The model forwarded to `sbert_query` to encode each query.

    Returns:
        Callable[[str], pl.DataFrame]: Function that takes a query string and returns whatever `sbert_query` returns for that query, the bound corpus and the bound model.
    """

    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Search the bound corpus for the query using the bound model.

        Parameters:
            query (str): The query with which to search the corpus.

        Returns:
            polars.DataFrame: The result of `sbert_query` for this query.
        """
        ranked_matches = sbert_query(
            query=query,
            corpus_embeddings_df=corpus_embeddings_df,
            model=model,
        )
        return ranked_matches

    return do_sbert_query


def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading the embeddings CSVs in a directory.

    Parameters:
        embeddings_dir (str): Directory that holds the embeddings files. Only entries whose names start with `block-embeddings` are read, each one as a CSV.

    Returns:
        polars.DataFrame: Vertical concatenation of all the files that were read, in sorted path order. Expected to hold one row per paragraph with the embedding features plus two columns of metadata (`file` and `doc_block_indx` [aka within-document paragraph index]); the schema is not validated here.

    Raises:
        ValueError: If nothing in `embeddings_dir` matches the `block-embeddings*` pattern.
    """

    # import the block embeddings
    pattern = os.path.join(embeddings_dir, "block-embeddings") + "*"

    # glob returns matches in arbitrary order; sort so the row order of the
    # combined data frame is reproducible across runs and platforms.
    files = sorted(glob.glob(pattern))

    # Fail with an actionable message instead of letting the concatenation
    # choke on an empty list.
    if not files:
        raise ValueError(f"No embeddings files found matching {pattern!r}")

    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))

    return pl.concat(block_embeddings_list, how='vertical')


def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Build a search function from a model name and a directory of embeddings CSVs.

    Parameters:
        model_name (str): Name passed to `SentenceTransformer` to load the model.
        embeddings_dir (str): Directory passed to `load_embeddings_dfs` to load the corpus embeddings.
        device (str): Device the model is moved to.

    Returns:
        Callable[[str], pl.DataFrame]: The function produced by `sbert_query_factory` for the loaded model and embeddings; it takes a query string.

    """
    # Load the model first and move it to the requested device
    encoder = SentenceTransformer(model_name).to(device = device)

    # then read the corpus embeddings from the CSV files
    corpus_df = load_embeddings_dfs(embeddings_dir)

    # finally bind both into the single-argument search function
    search_function = sbert_query_factory(corpus_embeddings_df = corpus_df, model = encoder)
    return search_function


def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Build a search function from a model name and a single Parquet file of embeddings.

    Parameters:
        model_name (str): Name passed to `SentenceTransformer` to load the model.
        embeddings_df_path (str): Path of the Parquet file read with `polars.read_parquet` to obtain the corpus embeddings.
        device (str): Device the model is moved to.

    Returns:
        Callable[[str], pl.DataFrame]: The function produced by `sbert_query_factory` for the loaded model and embeddings; it takes a query string.

    """

    # Load the model first and move it to the requested device
    encoder = SentenceTransformer(model_name).to(device = device)

    # then read the corpus embeddings from the Parquet file
    corpus_df = pl.read_parquet(embeddings_df_path)

    # finally bind both into the single-argument search function
    search_function = sbert_query_factory(corpus_embeddings_df = corpus_df, model = encoder)
    return search_function