InformationRetrieval/Notebooks/Code-1.1/Code-1.1 (Document Similarity).ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
   "metadata": {},
   "source": [
    "# Practical-1.1\n",
    "\n",
    "Problem Statement: Write a program to Compute Similarity between two text documents.\n",
    "\n",
    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "efe12052-a191-4760-9a75-a08d82b3d334",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "import numpy as np\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package punkt_tab to\n",
      "[nltk_data]     /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download necessary NLTK data\n",
    "nltk.download(\"punkt\")\n",
    "nltk.download(\"stopwords\")\n",
    "nltk.download('punkt_tab')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Contents of text1.txt:\n",
      "This is a sample document. It contains text for testing the similarity.\n",
      "\n",
      "\n",
      "Contents of text2.txt:\n",
      "This document is a sample. It includes text to test the similarity.\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print contents of the two documents\n",
    "def print_file_content(file):\n",
    "    with open(file, 'r') as f:\n",
    "        content = f.read()\n",
    "        print(content)\n",
    "\n",
    "print(\"Contents of text1.txt:\")\n",
    "print_file_content(\"text1.txt\")\n",
    "print(\"Contents of text2.txt:\")\n",
    "print_file_content(\"text2.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process(file):\n",
    "    # Read the file\n",
    "    raw = open(file).read()\n",
    "    \n",
    "    # Tokenize the raw text\n",
    "    tokens = word_tokenize(raw)\n",
    "    words = [w.lower() for w in tokens]\n",
    "    \n",
    "    # Stem the tokens\n",
    "    porter = nltk.PorterStemmer()\n",
    "    stemmed_tokens = [porter.stem(t) for t in words]\n",
    "\n",
    "    # Removing stop words\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]\n",
    "    \n",
    "    # Count words\n",
    "    count = nltk.defaultdict(int)\n",
    "    for word in filtered_tokens:\n",
    "        count[word] += 1\n",
    "        \n",
    "    return count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cos_sim(a, b):\n",
    "    dot_product = np.dot(a, b)\n",
    "    norm_a = np.linalg.norm(a)\n",
    "    norm_b = np.linalg.norm(b)\n",
    "    \n",
    "    return dot_product / (norm_a * norm_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSimilarity(dict1, dict2):\n",
    "    all_words_list = []\n",
    "    \n",
    "    # Collect all unique words from both dictionaries\n",
    "    for key in dict1:\n",
    "        all_words_list.append(key)\n",
    "        \n",
    "    for key in dict2:\n",
    "        all_words_list.append(key)\n",
    "        \n",
    "    all_words_list_size = len(all_words_list)\n",
    "    v1 = np.zeros(all_words_list_size, dtype=int)  # Changed np.int to int\n",
    "    v2 = np.zeros(all_words_list_size, dtype=int)  # Changed np.int to int\n",
    "    \n",
    "    # Create vectors for the dictionaries\n",
    "    for i, key in enumerate(all_words_list):\n",
    "        v1[i] = dict1.get(key, 0)\n",
    "        v2[i] = dict2.get(key, 0)\n",
    "        \n",
    "    return cos_sim(v1, v2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Similarity between two text documents: 0.9523809523809523\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    dict1 = process(\"text1.txt\")\n",
    "    dict2 = process(\"text2.txt\")\n",
    "    \n",
    "    print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}