# InformationRetrieval/Codes/Code-1.1.py

from collections import Counter, defaultdict

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the "punkt" tokenizer data and the
# "stopwords" corpus. The downloads run every time this module is loaded.
for _resource in ("punkt", "stopwords"):
    nltk.download(_resource)

def process(file):
    """Build a stemmed term-frequency table for a text file.

    The file is read in full, tokenized with ``word_tokenize``, lower-cased,
    stripped of English stop words, and the remaining tokens are reduced to
    their Porter stems before being counted.

    Tokens are whatever ``word_tokenize`` yields; no punctuation filtering
    is applied here. The file is opened with the platform default encoding.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each stem to its number of occurrences.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(file) as handle:
        raw = handle.read()

    stop_words = set(stopwords.words('english'))
    porter = nltk.PorterStemmer()

    # Stop words must be dropped BEFORE stemming: the stop list holds
    # unstemmed words, and stemming can alter a stop word so that it no
    # longer matches the list and slips through the filter.
    lowered = (token.lower() for token in word_tokenize(raw))
    stems = [porter.stem(word) for word in lowered if word not in stop_words]

    # Keep the original return type (defaultdict with int default) so that
    # callers indexing unseen words still get 0.
    return defaultdict(int, Counter(stems))

def cos_sim(a, b):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector (array-like of numbers), same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm
        (including empty vectors) the similarity is undefined; ``0.0`` is
        returned instead of dividing by zero and producing ``nan``.
    """
    dot_product = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    # A zero-norm vector has no direction; avoid the 0/0 division.
    if denominator == 0:
        return 0.0

    return dot_product / denominator

def getSimilarity(dict1, dict2):
    """Compute the cosine similarity of two term-frequency dictionaries.

    Both dictionaries are projected onto a shared vocabulary (the union of
    their keys); a word missing from one dictionary contributes a count of 0
    for that document. The resulting count vectors are passed to ``cos_sim``.

    Args:
        dict1: Mapping of word -> count for the first document.
        dict2: Mapping of word -> count for the second document.

    Returns:
        The value returned by ``cos_sim`` for the two count vectors.
    """
    # Each word must appear exactly once in the vocabulary; listing shared
    # words twice would double their weight and distort the similarity.
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    vocabulary = list(dict.fromkeys([*dict1, *dict2]))

    # Use the builtin int as dtype: the np.int alias was removed from NumPy.
    # .get() is used so that defaultdict inputs are not mutated by lookups.
    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=int)
    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=int)

    return cos_sim(v1, v2)

if __name__ == '__main__':
    # Script entry point: compare the two hard-coded input files.
    first_counts = process("text1.txt")
    second_counts = process("text2.txt")

    score = getSimilarity(first_counts, second_counts)
    print("Similarity between two text documents:", score)