opensearch-project
/

opensearch-semantic-highlighter-v1

@@ -1,3 +1,18 @@
 # opensearch-semantic-highlighter
 ## Overview
@@ -93,7 +108,7 @@ def prepare_input_features(
 # example highlighting case, from OpenSearch documentation
 query = "When does OpenSearch use text reanalysis for highlighting?"
-document = "To highlight the search terms, the highlighter needs the start and end character offsets of each term. The offsets mark the term’s position in the original text. The highlighter can obtain the offsets from the following sources: Postings: When documents are indexed, OpenSearch creates an inverted search index—a core data structure used to search for documents. Postings represent the inverted search index and store the mapping of each analyzed term to the list of documents in which it occurs. If you set the index_options parameter to offsets when mapping a text field, OpenSearch adds each term’s start and end character offsets to the inverted index. During highlighting, the highlighter reruns the original query directly on the postings to locate each term. Thus, storing offsets makes highlighting more efficient for large fields because it does not require reanalyzing the text. Storing term offsets requires additional disk space, but uses less disk space than storing term vectors. Text reanalysis: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene’s query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields."
 # sentence-level parsing
 sentence_ids = []

+---
+language: en
+license: apache-2.0
+library_name: transformers
+tags:
+- opensearch
+- semantic-search
+- highlighting
+- sentence-highlighter
+- bert
+- text-classification
+- pytorch
+pipeline_tag: text-classification
+---
 # opensearch-semantic-highlighter
 ## Overview
 # example highlighting case, from OpenSearch documentation
 query = "When does OpenSearch use text reanalysis for highlighting?"
+document = "To highlight the search terms, the highlighter needs the start and end character offsets of each term. The offsets mark the term's position in the original text. The highlighter can obtain the offsets from the following sources: Postings: When documents are indexed, OpenSearch creates an inverted search index—a core data structure used to search for documents. Postings represent the inverted search index and store the mapping of each analyzed term to the list of documents in which it occurs. If you set the index_options parameter to offsets when mapping a text field, OpenSearch adds each term's start and end character offsets to the inverted index. During highlighting, the highlighter reruns the original query directly on the postings to locate each term. Thus, storing offsets makes highlighting more efficient for large fields because it does not require reanalyzing the text. Storing term offsets requires additional disk space, but uses less disk space than storing term vectors. Text reanalysis: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene's query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields."
 # sentence-level parsing
 sentence_ids = []