Skip to content

IR module

Document Retrieval module.

bm25(query, documents, k=5, b=0.75, k1=1.5)

Implement BM25 algorithm to retrieve top k similar documents to a query.

Parameters:

- query (str): The query string.
- documents (list): A list of document strings.
- k (int, optional): The number of top similar documents to retrieve. Defaults to 5.
- b (float, optional): The parameter controlling the impact of document length on the BM25 score. Defaults to 0.75.
- k1 (float, optional): The parameter controlling the impact of term frequency on the BM25 score. Defaults to 1.5.

Source code in intellikit/ir.py
def bm25(query, documents, k=5, b=0.75, k1=1.5):
    """
    Rank documents against a query with the Okapi BM25 scoring function.

    Text is lower-cased and split on whitespace. A document's score is the
    sum, over the query tokens, of ``idf(t) * tf * (k1 + 1) / (tf + norm)``
    where ``tf`` is the token's count in the document and
    ``norm = k1 * (1 - b + b * doc_length / avg_doc_length)``.
    A token repeated in the query contributes once per occurrence.

    Parameters:
    query (str): The query string.
    documents (list): A list of document strings.
    k (int, optional): The number of top similar documents to retrieve. Defaults to 5.
    b (float, optional): The parameter controlling the impact of document length on BM25 score. Defaults to 0.75.
    k1 (float, optional): The parameter controlling the impact of term frequency on BM25 score. Defaults to 1.5.

    Returns:
    list: A list of (document, score) tuples for the top k documents, ordered
    from highest to lowest score. Ties keep the input order. An empty list is
    returned when there are no documents or k is not positive.
    """
    # Tokenize query and documents
    query_tokens = query.lower().split()
    document_tokens = [doc.lower().split() for doc in documents]
    n_docs = len(document_tokens)
    if n_docs == 0:
        return []

    # Term-frequency matrix restricted to the query tokens:
    # one row per document, one column per query token.
    tf = np.array(
        [[tokens.count(term) for term in query_tokens] for tokens in document_tokens],
        dtype=float,
    ).reshape(n_docs, len(query_tokens))

    # Document frequency: number of documents that contain each query token.
    df = np.count_nonzero(tf, axis=0)

    # Non-negative IDF variant (log(1 + ...)); the unsmoothed form goes
    # negative for tokens present in more than half of the documents, which
    # would rank matching documents below non-matching ones.
    idf = np.log(1.0 + (n_docs - df + 0.5) / (df + 0.5))

    # Length normalisation; if every document is empty the ratio is undefined,
    # so fall back to a neutral ratio of 1.
    doc_lengths = np.array([len(tokens) for tokens in document_tokens], dtype=float)
    avg_doc_length = doc_lengths.mean()
    if avg_doc_length > 0:
        length_ratio = doc_lengths / avg_doc_length
    else:
        length_ratio = np.ones(n_docs)
    length_norm = k1 * (1 - b + b * length_ratio)

    # Per-token contributions; only cells with tf > 0 are evaluated so a zero
    # denominator (possible when k1 == 0) can never produce NaN.
    numerator = idf * tf * (k1 + 1)
    denominator = tf + length_norm[:, np.newaxis]
    contributions = np.divide(
        numerator, denominator, out=np.zeros_like(tf), where=tf > 0
    )
    scores = contributions.sum(axis=1)

    # Stable sort: documents with equal scores stay in input order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # A non-positive k yields no results.
    return [(documents[index], score) for index, score in ranked[:max(k, 0)]]

cosine_similarity(v1, v2)

Compute the cosine similarity between two vectors.

Parameters:

- v1 (numpy.ndarray): The first vector.
- v2 (numpy.ndarray): The second vector.

Source code in intellikit/ir.py
def cosine_similarity(v1, v2):
    """
    Compute the cosine similarity between two vectors.

    Parameters:
    v1 (numpy.ndarray): The first vector.
    v2 (numpy.ndarray): The second vector.

    Returns:
    float: The cosine similarity between the two vectors, or 0.0 when either
    vector has zero length (the cosine is undefined there, and returning NaN
    would make any ranking built on the result unreliable).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Guard the 0/0 case instead of emitting NaN and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

sentence_transformers_retrieval(query, documents, k=5, model_name='paraphrase-MiniLM-L6-v2')

Apply sentence transformers to retrieve top k similar documents.

Parameters:

Name Type Description Default
query text

A search query for a document.

required
documents list

A list of documents for the search.

required
k int

Number of top results to return. Defaults to 5.

5
model_name str

Open source sentence model to use. Defaults to 'paraphrase-MiniLM-L6-v2'.

'paraphrase-MiniLM-L6-v2'

Returns:

Type Description
list

A list of (document, cosine similarity score) tuples for the top k documents, ordered from most to least similar to the search query.

Source code in intellikit/ir.py
def sentence_transformers_retrieval(query, documents, k=5, model_name='paraphrase-MiniLM-L6-v2'):
    """
    Retrieve the top k documents most similar to a query using sentence embeddings.

    The query and every document are encoded with the named sentence
    transformer model, and documents are ranked by the cosine similarity
    between their embedding and the query embedding.

    Args:
        query (text): A search query for a document.
        documents (list): A list of documents for the search.
        k (int, optional): Number of top results to return. Defaults to 5.
        model_name (str, optional): Open source sentence model to use. Defaults to 'paraphrase-MiniLM-L6-v2'.

    Returns:
        list: A list of (document, similarity score) tuples, ordered from
        most to least similar, containing at most k entries.
    """
    # The model is instantiated on every call from the given model_name.
    encoder = SentenceTransformer(model_name)

    # Embed the query (as a one-element batch) and then the documents.
    query_vec = encoder.encode([query])[0]
    doc_vecs = encoder.encode(documents)

    # Pair each document position with its similarity to the query, then
    # order the pairs from highest to lowest similarity.
    ranked = sorted(
        (
            (position, cosine_similarity(query_vec, doc_vec))
            for position, doc_vec in enumerate(doc_vecs)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep at most k results; a non-positive k yields an empty list.
    cutoff = max(0, min(k, len(ranked)))
    return [(documents[position], score) for position, score in ranked[:cutoff]]

vector_space_model(query, documents, k=5)

Calculates the top k similar documents to a given query using the Vector Space Model.

Parameters:

Name Type Description Default
query str

The query string.

required
documents list

A list of document strings.

required
k int

The number of similar documents to return. Defaults to 5.

5

Returns:

Type Description
list

A list of tuples containing the top k similar documents and their similarity scores.

Source code in intellikit/ir.py
def vector_space_model(query, documents, k=5):
    """
    Calculates the top k similar documents to a given query using the Vector Space Model.

    Text is lower-cased and split on whitespace. The query and each document
    are turned into TF-IDF vectors over the shared vocabulary and ranked by
    cosine similarity. The IDF weight is the smoothed form
    ``log((1 + N) / (1 + df)) + 1``, where ``N`` is the number of documents
    and ``df`` the number of documents containing the term, so every weight
    is strictly positive.

    Args:
        query (str): The query string.
        documents (list): A list of document strings.
        k (int, optional): The number of similar documents to return. Defaults to 5.

    Returns:
        list: A list of tuples containing the top k similar documents and their
        similarity scores, ordered from most to least similar. Ties keep the
        input order. A document or query with an all-zero vector scores 0.0.
        An empty list is returned when there are no documents or k is not
        positive.
    """
    # Tokenize query and documents
    query_tokens = query.lower().split()
    document_tokens = [doc.lower().split() for doc in documents]
    n_docs = len(document_tokens)
    if n_docs == 0:
        return []

    # Map each distinct term to a column index (constant-time lookups instead
    # of repeated list.index scans).
    term_index = {}
    for tokens in [query_tokens] + document_tokens:
        for token in tokens:
            term_index.setdefault(token, len(term_index))
    n_terms = len(term_index)

    # Term frequency (TF) matrix: one row per document, one column per term.
    tf_matrix = np.zeros((n_docs, n_terms))
    for row, tokens in enumerate(document_tokens):
        for token in tokens:
            tf_matrix[row, term_index[token]] += 1

    # Document frequency: number of documents in which each term occurs.
    df_vector = np.count_nonzero(tf_matrix, axis=0)

    # Smoothed inverse document frequency (always > 0).
    idf_vector = np.log((1 + n_docs) / (1 + df_vector)) + 1

    # TF-IDF vectors for the documents and for the query.
    tfidf_matrix = tf_matrix * idf_vector
    query_vector = np.zeros(n_terms)
    for token in query_tokens:
        query_vector[term_index[token]] += 1
    query_vector *= idf_vector

    # Cosine similarity of the query against every document at once; rows
    # where either norm is zero are left at 0.0 instead of becoming NaN.
    dot_products = tfidf_matrix.dot(query_vector)
    norm_products = np.linalg.norm(tfidf_matrix, axis=1) * np.linalg.norm(query_vector)
    similarities = np.divide(
        dot_products, norm_products, out=np.zeros(n_docs), where=norm_products > 0
    )

    # Stable sort: documents with equal similarity stay in input order.
    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)

    # A non-positive k yields no results.
    return [(documents[index], sim) for index, sim in ranked[:max(k, 0)]]