"""Bulk-index property listings into OpenSearch together with text embeddings.

Reads a newline-delimited JSON file of property records, builds one search
document per record (including an embedding of the most recent description),
and sends all documents to OpenSearch in a single bulk request.
"""
import json
import os

from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer

INDEX_NAME = "datafiniti_props"
DATA_FILE = "datafiniti_properties_sunnyvale_400.json"

model = SentenceTransformer("all-MiniLM-L6-v2")

host = "localhost"
port = 9200

# NOTE(security): the fallback below is a hard-coded credential kept only so
# existing setups keep working. Prefer setting OPENSEARCH_ADMIN_PASSWORD in the
# environment and rotate this value if it was ever used for a real cluster.
OPENSEARCH_ADMIN_PASSWORD = os.getenv("OPENSEARCH_ADMIN_PASSWORD", "yw7L5u9nLs3a")
auth = (
    "admin",
    OPENSEARCH_ADMIN_PASSWORD,
)

# Create the client with SSL/TLS enabled, but certificate and hostname
# verification disabled.
client = OpenSearch(
    hosts=[{"host": host, "port": port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)


def _build_document(record):
    """Turn one parsed property record into ``(document_id, document)``.

    The document id is the full address string (street, city, province and
    the first five characters of the postal code). The description used is
    the entry of ``record["descriptions"]`` with the greatest ``dateSeen``.

    Raises:
        KeyError: a required field is missing from the record.
        IndexError: the record has an empty ``descriptions`` list.
        TypeError, ValueError: a field has an unexpected type or value
            (for example a non-numeric ``numBathroom``).
    """
    bathrooms = int(record["numBathroom"])
    beds = record["numBedroom"]
    price = record["mostRecentPriceAmount"]
    size = record["floorSizeValue"]
    address = ", ".join(
        [
            record["address"],
            record["city"],
            record["province"],
            record["postalCode"][:5],
        ]
    )

    # Newest description first; only that one is indexed.
    descriptions = sorted(
        record["descriptions"], key=lambda x: x["dateSeen"], reverse=True
    )
    description = descriptions[0]["value"]

    document = {
        "bathrooms": bathrooms,
        "bedrooms": beds,
        "listingPrice": price,
        "squareFootage": size,
        "address": address,
        "publicDescription": description,
        "publicDescriptionKnn": model.encode(description).tolist(),
    }
    return address, document


bulk_body = []
skipped = 0

with open(DATA_FILE, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        try:
            # json.JSONDecodeError is a ValueError, so malformed lines are
            # skipped and reported like any other unusable record.
            record = json.loads(line)
            print(f'indexing {record["address"]}')
            doc_id, row = _build_document(record)
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # Best-effort ingestion: drop the record, but say which and why
            # instead of swallowing every exception silently.
            skipped += 1
            print(f"skipping line {line_number}: {exc!r}")
            continue

        # Bulk format: an action line followed by the document source.
        bulk_body.append({"create": {"_index": INDEX_NAME, "_id": doc_id}})
        bulk_body.append(row)

if skipped:
    print(f"skipped {skipped} record(s) that could not be converted")

if bulk_body:
    response = client.bulk(
        index=INDEX_NAME,
        body=bulk_body,
    )
    # The bulk call can succeed overall while individual items fail (for
    # example a "create" for an id that already exists), so surface those.
    if response.get("errors"):
        failed = [
            item
            for item in response.get("items", [])
            if "error" in item.get("create", {})
        ]
        print(f"bulk request completed with {len(failed)} failed item(s)")
else:
    # An empty bulk body is not a valid request; there is nothing to send.
    print("no documents to index")