{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
   "metadata": {},
   "source": [
    "# Practical-1.1\n",
    "\n",
    "Problem Statement: Write a program to Compute Similarity between two text documents.\n",
    "\n",
    "Approach: tokenize each document, drop stop words and punctuation, stem the remaining words, and compare the resulting term-frequency vectors with cosine similarity.\n",
    "\n",
    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efe12052-a191-4760-9a75-a08d82b3d334",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "from collections import Counter\n",
    "\n",
    "import nltk\n",
    "import numpy as np\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f6a9c2e-7b41-4d58-9e0a-5c1d2b7e8f34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: the two documents to compare\n",
    "DOC1_PATH = \"text1.txt\"\n",
    "DOC2_PATH = \"text2.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download necessary NLTK data (quiet: no log noise when already up to date)\n",
    "for resource in (\"punkt\", \"punkt_tab\", \"stopwords\"):\n",
    "    nltk.download(resource, quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print contents of the two documents\n",
    "def print_file_content(file, encoding=\"utf-8\"):\n",
    "    \"\"\"Print the full text of the file at `file`, decoded with `encoding`.\"\"\"\n",
    "    with open(file, \"r\", encoding=encoding) as f:\n",
    "        print(f.read())\n",
    "\n",
    "for path in (DOC1_PATH, DOC2_PATH):\n",
    "    print(f\"Contents of {path}:\")\n",
    "    print_file_content(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process(file, encoding=\"utf-8\"):\n",
    "    \"\"\"Return the term-frequency counts of the document stored at `file`.\n",
    "\n",
    "    Pipeline: tokenize -> lowercase -> drop stop words and punctuation-only\n",
    "    tokens -> Porter-stem -> count.\n",
    "\n",
    "    Stop words are removed *before* stemming: the NLTK stop-word list holds\n",
    "    unstemmed forms, so a stemmed stop word (`this` -> `thi`) would no longer\n",
    "    match the list and would leak into the counts.\n",
    "\n",
    "    Args:\n",
    "        file: Path of the text file to read.\n",
    "        encoding: Text encoding used to decode the file.\n",
    "\n",
    "    Returns:\n",
    "        A `collections.Counter` mapping each stem to its number of occurrences.\n",
    "    \"\"\"\n",
    "    # Read the file; the context manager closes the handle even on error\n",
    "    with open(file, encoding=encoding) as f:\n",
    "        raw = f.read()\n",
    "\n",
    "    # Tokenize the raw text and normalise case\n",
    "    words = [token.lower() for token in word_tokenize(raw)]\n",
    "\n",
    "    # Remove stop words and tokens without any letter or digit (e.g. `.`)\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    content_words = [\n",
    "        w for w in words\n",
    "        if w not in stop_words and any(ch.isalnum() for ch in w)\n",
    "    ]\n",
    "\n",
    "    # Stem the remaining tokens and count them\n",
    "    porter = nltk.PorterStemmer()\n",
    "    return Counter(porter.stem(w) for w in content_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cos_sim(a, b):\n",
    "    \"\"\"Return the cosine similarity of the vectors `a` and `b`.\n",
    "\n",
    "    Args:\n",
    "        a: First vector (array-like of numbers).\n",
    "        b: Second vector, same length as `a`.\n",
    "\n",
    "    Returns:\n",
    "        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm\n",
    "        (e.g. an empty document), where the ratio would otherwise be 0/0.\n",
    "    \"\"\"\n",
    "    a = np.asarray(a, dtype=float)\n",
    "    b = np.asarray(b, dtype=float)\n",
    "\n",
    "    denominator = np.linalg.norm(a) * np.linalg.norm(b)\n",
    "    if denominator == 0:\n",
    "        return 0.0\n",
    "\n",
    "    return np.dot(a, b) / denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSimilarity(dict1, dict2):\n",
    "    \"\"\"Return the cosine similarity of two word-count dictionaries.\n",
    "\n",
    "    Args:\n",
    "        dict1: Mapping word -> count for the first document.\n",
    "        dict2: Mapping word -> count for the second document.\n",
    "\n",
    "    Returns:\n",
    "        The cosine similarity of the two count vectors; 0.0 when either\n",
    "        dictionary is empty.\n",
    "    \"\"\"\n",
    "    # Each word gets exactly one vector position. Listing the keys of both\n",
    "    # dictionaries one after the other would give shared words two positions\n",
    "    # and inflate the similarity, hence the set union.\n",
    "    vocabulary = list(set(dict1).union(dict2))\n",
    "\n",
    "    # Create vectors for the dictionaries over the common vocabulary\n",
    "    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=float)\n",
    "    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=float)\n",
    "\n",
    "    return cos_sim(v1, v2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    dict1 = process(DOC1_PATH)\n",
    "    dict2 = process(DOC2_PATH)\n",
    "\n",
    "    print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}