Wiki demo doesn't work for me

So I tried the wiki demo on the example page.


It just fails to add the documents, and I've got no idea why since everything looks fine.

The Marqo Docker container itself runs fine on the second GPU:

 sudo docker run --rm --name marqo --gpus '"device=1"' -it -p 8882:8882 marqoai/marqo:latest
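
For reference, the demo's Python client then just points at that container; a minimal sketch (the URL below is py-marqo's default and assumes the port mapping above):

import marqo

# the container maps 8882:8882, which matches the client's default URL
mq = marqo.Client(url="http://localhost:8882")
print(mq.get_indexes())  # quick connectivity check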

I shortened the number of entries in the .json file to 500, and after it finished going through the 10 batches of 50, the documents did not get added.
When I search for them, I receive an error traceback:
[image: error traceback]

Modified code:

Oh, and simply adding documents without the .json file:

responses = client.index(index_name).add_documents([
    {
        "Text1": "The Travels of Marco Polo",
        "Text2": "A 13th-century travelogue describing the travels of Polo",
        "_id": "Document 1",
    },
    {
        "Text1": "Extravehicular Mobility Unit (EMU)",
        "Text2": "The EMU is a spacesuit that provides environmental protection",
        "_id": "Document 2",
    },
])
Does work for me.
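
For what it's worth, one way to see why a batched add silently drops documents is to inspect the add_documents responses for per-item errors; a rough sketch (the helper below is hypothetical, and the "errors"/"items"/"status" fields follow the add_documents response format, which may vary slightly between versions):

# hypothetical helper: print any documents that failed to index
def print_failed_items(responses):
    # with client_batch_size set, the client returns one response per batch
    batches = responses if isinstance(responses, list) else [responses]
    for i, batch in enumerate(batches):
        if batch.get("errors"):
            for item in batch.get("items", []):
                if item.get("status", 200) >= 400:
                    print(f"batch {i}: doc {item.get('_id')} failed: {item}")

print_failed_items(responses)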

Hi @DuckY-Y, we are in the process of updating these examples for our 2.0.0 release, so thanks for spotting this issue. Could you try this updated script?

#####################################################
### STEP 0. Import and define any helper functions
#####################################################

import marqo
import numpy as np
import json
import pprint
import copy
import math


def read_json(filename: str) -> list:
    # reads the json file and drops the unused docDate field
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    for d in data:
        del d["docDate"]
    return data


def replace_title(data: dict) -> dict:
    # removes the "- Wikipedia" suffix from the title for better matching
    data["title"] = data["title"].replace("- Wikipedia", "")
    return data


def split_big_docs(data, field="content", char_len=5e4):
    # there are some large documents which can cause issues for some users
    new_data = []
    for dat in data:
        content = dat[field]
        N = len(content)

        if N >= char_len:
            n_chunks = math.ceil(N / char_len)
            new_content = np.array_split(list(content), n_chunks)

            for _content in new_content:
                new_dat = copy.deepcopy(dat)
                new_dat[field] = "".join(_content)
                new_data.append(new_dat)
        else:
            new_data.append(dat)
    return new_data


#####################################################
### STEP 1. load the data
#####################################################

# download the json formatted simplewiki from here -
# https://www.kaggle.com/datasets/louisgeisler/simple-wiki?resource=download
# or from
# https://drive.google.com/file/d/1OEqXeIdqaZb6BwzKIgw8G_sDi91fBawt/view?usp=sharing
dataset_file = "simplewiki.json"

# get the data
data = read_json(dataset_file)
# clean up the title
data = [replace_title(d) for d in data]
# split big ones to make it easier for users on all hardware
data = split_big_docs(data)
print(f"loaded data with {len(data)} entries")

#####################################################
### STEP 2. start Marqo
#####################################################

# Follow the instructions here https://github.com/marqo-ai/marqo

#####################################################
### STEP 3. index some data with marqo
#####################################################

# we use an index name. the index name needs to be lower case.
index_name = "marqo-simplewiki-demo-all"

# setup the client
mq = marqo.Client()

# we create the index. Note if it already exists an error will occur
# as you cannot overwrite an existing index
# try:
#     mq.delete_index(index_name)
# except:
#     pass

# we create the index and can set the model we want to use
mq.create_index(index_name, model="sentence-transformers/all-MiniLM-L6-v2")

device = "cpu"

responses = mq.index(index_name).add_documents(
    data, device=device, client_batch_size=20, tensor_fields=["content"]
)

# optionally take a look at the responses
pprint.pprint(responses)

#######################################
### STEP 4. Searching with marqo ######
#######################################


# after indexing we can search using both keyword (lexical) and neural search
# this will perform neural search across all indexed fields

# lets create a query
query = "what is air made of?"

results = mq.index(index_name).search(query)

# we can check the results - lets look at the top hit
pprint.pprint(results["hits"][0])

# we also get highlighting which tells us why this article was returned
pprint.pprint(results["hits"][0]["_highlights"])

# the search can also be restricted to specific fields (not shown here);
# we run the same query again
results = mq.index(index_name).search(query)

# we can check the results - lets look at the top hit
pprint.pprint(results["hits"][0])

# we use lexical search instead of tensor search
results = mq.index(index_name).search(
    query, search_method="LEXICAL"
)

# we can check the results - lets look at the top hit
pprint.pprint(results["hits"][0])


# lets create another query
query = "what is a cube?"

results = mq.index(index_name).search(query)

# we can check the results - lets look at the top hit
pprint.pprint(results["hits"][0])

# we also get highlighting which tells us why this article was returned
pprint.pprint(results["hits"][0]["_highlights"])

Thanks!
That did the trick


Do be warned: the current simplewiki.json file has about 188,000 entries, and with a batch size of 20 averaging roughly 1.5 s per batch, indexing takes a bit less than 4 hours… I guess this does make for a great benchmark and a realistic real-world use case.
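
For anyone re-running this, the estimate is just batch count times average batch latency; a quick back-of-the-envelope check (the 1.5 s per batch is the rough average observed above, not a guarantee):

n_docs = 188_000
batch_size = 20
seconds_per_batch = 1.5  # rough observed average

n_batches = -(-n_docs // batch_size)  # ceiling division -> 9400 batches
hours = n_batches * seconds_per_batch / 3600
print(f"{n_batches} batches, ~{hours:.1f} hours")  # ~3.9 hours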


Due to the text chunking that Marqo does internally, it should create a few hundred thousand vectors in the index (a vector for each text chunk in each document; it's been a while since I ran the whole thing, so I can't remember the exact number).
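
If you want a rough estimate without re-indexing, you can approximate the chunk count from the loaded data; a sketch, assuming sentence-based chunking of about 2 sentences per chunk (an assumption here; your index's text preprocessing settings may differ):

import re

SENTENCES_PER_CHUNK = 2  # assumed chunk size; check your index settings

def estimate_chunks(text):
    # naive sentence split; Marqo's own splitter will differ somewhat
    sentences = [s for s in re.split(r"[.!?]+", text) if s.strip()]
    return max(1, -(-len(sentences) // SENTENCES_PER_CHUNK))

# `data` is the list loaded in STEP 1 of the script above
total_vectors = sum(estimate_chunks(d["content"]) for d in data)
print(f"roughly {total_vectors} vectors expected")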