Document Search System

In [1]:

Copied!





#Loading libraries. Ensure that Levenshtein and sentence_transformers are already installed
# !pip install Levenshtein
# !pip install sentence_transformers
import intellikit as ik
import pandas as pd
import numpy as np
#Loading libraries. Ensure that Levenshtein and sentence_transformers are already installed
# !pip install Levenshtein
# !pip install sentence_transformers
import intellikit as ik
import pandas as pd
import numpy as np

/home/runner/.local/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from tqdm.autonotebook import tqdm, trange

In [2]:

Copied!





# Example corpus: five short sentences that the retrieval models below
# rank against the query.
documents = [
    "This is an example document about natural language processing.",
    "Python is a popular programming language for machine learning tasks.",
    "Machine learning algorithms can be implemented in various programming languages.",
    "Natural language processing helps computers understand human language.",
    "Deep learning is a subset of machine learning algorithms.",
]

# Free-text query that every retrieval call in the following cells reuses.
query = "programming languages for machine learning"

In [3]:

Copied!





# Rank the corpus against the query with the Vector Space Model (TF-IDF)
# and keep the k=3 best matches. The call is made once; the recorded output
# of this cell shows a single run.
top_similar_docs_vsm = ik.vector_space_model(query, documents, k=3)

# Each result unpacks as a (document text, similarity) pair.
print("Top 3 similar documents:")
for doc, similarity in top_similar_docs_vsm:
    print(f"Similarity: {similarity:.4f} - Document: {doc}")

Top 3 similar documents:
Similarity: 0.3770 - Document: Python is a popular programming language for machine learning tasks.
Similarity: 0.2767 - Document: Deep learning is a subset of machine learning algorithms.
Similarity: 0.2705 - Document: Machine learning algorithms can be implemented in various programming languages.

In [4]:

Copied!





# Rank the corpus against the query with the BM25 model and keep the k=3
# best matches. The call is made once; the recorded output of this cell
# shows a single run.
top_similar_docs_bm25 = ik.bm25(query, documents, k=3)

# NOTE(review): in the recorded output of this cell the two highest-scored
# documents share no word with the query, and the "Python is a popular
# programming language..." document is not returned at all. That looks
# inconsistent with term-overlap scoring -- confirm how ik.bm25 computes and
# orders its scores before relying on this ranking.

# Each result unpacks as a (document text, score) pair.
print("Top 3 similar documents using BM25:")
for doc, score in top_similar_docs_bm25:
    print(f"Score: {score:.4f} - Document: {doc}")

Top 3 similar documents using BM25:
Score: 21.7943 - Document: This is an example document about natural language processing.
Score: 20.3793 - Document: Natural language processing helps computers understand human language.
Score: 19.3249 - Document: Machine learning algorithms can be implemented in various programming languages.

In [5]:

Copied!





# Rank the corpus against the query using Sentence Transformers.
# model_name selects the pre-trained model; "paraphrase-MiniLM-L6-v2" from
# the Hugging Face Hub is used here, and other pre-trained models can be
# passed instead. The call is made once; the recorded output of this cell
# shows a single run.
top_similar_docs_st = ik.sentence_transformers_retrieval(query, documents, k=3, model_name='paraphrase-MiniLM-L6-v2')

# Each result unpacks as a (document text, similarity) pair.
print("Top 3 similar documents using Sentence Transformers:")
for doc, similarity in top_similar_docs_st:
    print(f"Similarity: {similarity:.4f} - Document: {doc}")

Top 3 similar documents using Sentence Transformers:
Similarity: 0.7972 - Document: Machine learning algorithms can be implemented in various programming languages.
Similarity: 0.6459 - Document: Python is a popular programming language for machine learning tasks.
Similarity: 0.5925 - Document: Deep learning is a subset of machine learning algorithms.

In [6]:

Copied!





# Build a small example DataFrame for this task: one row per document,
# carrying an id, a title, and the text that will be searched.
data = {
    'document_id': list(range(1, 6)),
    'document_title': [
        "Document 1",
        "Document 2",
        "Document 3",
        "Document 4",
        "Document 5",
    ],
    'document_text': [
        "This is the text of Document 1.",
        "Document 2 contains some example text.",
        "The text in Document 3 is different from others.",
        "Document 4 has unique content.",
        "This is a sample text for Document 5.",
    ],
}

# Column order follows the key order of `data`.
df = pd.DataFrame(data=data)

def prepare_data(df: pd.DataFrame,
                 text_col: str = 'document_text',
                 title_col: str = 'document_title') -> tuple:
    """Extract the document texts and titles from a DataFrame as plain lists.

    Args:
        df: DataFrame holding one document per row.
        text_col: Name of the column with the document text. Defaults to
            'document_text'.
        title_col: Name of the column with the document title. Defaults to
            'document_title'.

    Returns:
        A ``(documents, titles)`` tuple of two lists in row order, so that
        ``titles[i]`` is the title of ``documents[i]``.

    Raises:
        KeyError: If either requested column is absent from ``df``.
    """
    # Check both columns up front so the error names everything that is
    # missing instead of failing on the first lookup only.
    missing = [col for col in (text_col, title_col) if col not in df.columns]
    if missing:
        raise KeyError(f"prepare_data: DataFrame is missing column(s) {missing}")

    documents = df[text_col].tolist()
    titles = df[title_col].tolist()
    return documents, titles

# Example usage
documents, titles = prepare_data(df)

# Map each document text to its title once, instead of scanning `documents`
# for every hit. Iterating in reverse lets the earliest row win for a
# repeated text, which matches what titles[documents.index(doc)] returned.
title_for_text = dict(zip(reversed(documents), reversed(titles)))

# Test the BM25 function
query = "example text"
top_similar_docs_bm25 = ik.bm25(query, documents, k=3)

# NOTE(review): the recorded output of this cell does not list Document 2,
# the only text containing both query words, among the BM25 top 3 --
# confirm how ik.bm25 computes and orders its scores.
print("Top 3 similar documents using BM25:")
for doc, score in top_similar_docs_bm25:
    doc_title = title_for_text[doc]
    print(f"Score: {score:.4f} - Document Title: {doc_title}")

# Test the Vector Space Model function
top_similar_docs_vsm = ik.vector_space_model(query, documents, k=3)
print("\nTop 3 similar documents using the Vector Space Model:")
for doc, similarity in top_similar_docs_vsm:
    doc_title = title_for_text[doc]
    print(f"Similarity: {similarity:.4f} - Document Title: {doc_title}")

Top 3 similar documents using BM25:
Score: 17.9712 - Document Title: Document 3
Score: 16.8036 - Document Title: Document 5
Score: 15.4860 - Document Title: Document 1

Top 3 similar documents using the Vector Space Model:
Similarity: 0.1745 - Document Title: Document 2
Similarity: 0.1601 - Document Title: Document 1
Similarity: 0.1488 - Document Title: Document 5