Added jupyter notebooks for 1.1, 1.2, 1.3.

2025-10-12 22:55:54 +05:30
parent a4f5326402
commit 68587b84e0
5 changed files with 994 additions and 0 deletions
@@ -0,0 +1,230 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
+   "metadata": {},
+   "source": [
+    "# Practical-1.1\n",
+    "\n",
+    "Problem Statement: Write a program to Compute Similarity between two text documents.\n",
+    "\n",
+    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "efe12052-a191-4760-9a75-a08d82b3d334",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import libraries\n",
+    "from collections import Counter\n",
+    "\n",
+    "import nltk\n",
+    "import numpy as np\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.tokenize import word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /home/nonroot/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt_tab to\n",
+      "[nltk_data]     /home/nonroot/nltk_data...\n",
+      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Download necessary NLTK data\n",
+    "# Each call is a no-op when the package is already installed (see the saved stderr output).\n",
+    "# NOTE(review): punkt / punkt_tab are presumably the tokenizer data behind word_tokenize and\n",
+    "# stopwords the corpus behind stopwords.words('english') -- confirm against the installed NLTK.\n",
+    "nltk.download(\"punkt\")\n",
+    "nltk.download(\"stopwords\")\n",
+    "nltk.download('punkt_tab')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Contents of text1.txt:\n",
+      "This is a sample document. It contains text for testing the similarity.\n",
+      "\n",
+      "\n",
+      "Contents of text2.txt:\n",
+      "This document is a sample. It includes text to test the similarity.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print contents of the two documents\n",
+    "def print_file_content(file):\n",
+    "    \"\"\"Read the whole of `file` as text and print it to standard output.\"\"\"\n",
+    "    with open(file, 'r') as handle:\n",
+    "        print(handle.read())\n",
+    "\n",
+    "# Show each input file under a label so the texts being compared are visible in the output.\n",
+    "print(\"Contents of text1.txt:\")\n",
+    "print_file_content(\"text1.txt\")\n",
+    "print(\"Contents of text2.txt:\")\n",
+    "print_file_content(\"text2.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process(file):\n",
+    "    \"\"\"Build a term-frequency table for the text file at `file`.\n",
+    "\n",
+    "    The text is tokenized, lower-cased, filtered against NLTK's English\n",
+    "    stop-word list and then Porter-stemmed.\n",
+    "\n",
+    "    Args:\n",
+    "        file: Path of the text file to read.\n",
+    "\n",
+    "    Returns:\n",
+    "        A Counter (dict subclass) mapping each stemmed token to the number\n",
+    "        of times it occurs. Punctuation tokens are kept and counted.\n",
+    "    \"\"\"\n",
+    "    # Read the file; the context manager guarantees the handle is closed.\n",
+    "    with open(file) as f:\n",
+    "        raw = f.read()\n",
+    "\n",
+    "    # Tokenize the raw text and normalise case\n",
+    "    words = [token.lower() for token in word_tokenize(raw)]\n",
+    "\n",
+    "    # Remove stop words BEFORE stemming: the stemmer rewrites some of them\n",
+    "    # ('this' becomes 'thi'), and the rewritten form no longer matches the list.\n",
+    "    stop_words = set(stopwords.words('english'))\n",
+    "    content_words = [w for w in words if w not in stop_words]\n",
+    "\n",
+    "    # Stem the remaining tokens and count each stem\n",
+    "    porter = nltk.PorterStemmer()\n",
+    "    return Counter(porter.stem(w) for w in content_words)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cos_sim(a, b):\n",
+    "    \"\"\"Return the cosine similarity of two equal-length numeric vectors.\n",
+    "\n",
+    "    Args:\n",
+    "        a: First vector (any 1-D array-like).\n",
+    "        b: Second vector, same length as `a`.\n",
+    "\n",
+    "    Returns:\n",
+    "        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm.\n",
+    "    \"\"\"\n",
+    "    norm_product = np.linalg.norm(a) * np.linalg.norm(b)\n",
+    "\n",
+    "    # An all-zero vector (e.g. a document with no countable terms) has no\n",
+    "    # direction; report no similarity instead of dividing by zero (nan).\n",
+    "    if norm_product == 0:\n",
+    "        return 0.0\n",
+    "\n",
+    "    return np.dot(a, b) / norm_product"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getSimilarity(dict1, dict2):\n",
+    "    \"\"\"Return the cosine similarity of two term-frequency tables.\n",
+    "\n",
+    "    Args:\n",
+    "        dict1: Mapping of term -> count for the first document.\n",
+    "        dict2: Mapping of term -> count for the second document.\n",
+    "\n",
+    "    Returns:\n",
+    "        The value of cos_sim() for the two count vectors.\n",
+    "    \"\"\"\n",
+    "    # Each distinct term gets exactly one vector position. Listing the keys of\n",
+    "    # both dicts back to back would give shared terms two positions and so\n",
+    "    # count them twice in the dot product and the norms.\n",
+    "    # dict.fromkeys de-duplicates while keeping first-seen order.\n",
+    "    vocabulary = list(dict.fromkeys(list(dict1) + list(dict2)))\n",
+    "\n",
+    "    # Create vectors for the dictionaries (0 for a term a document lacks)\n",
+    "    v1 = np.array([dict1.get(term, 0) for term in vocabulary], dtype=int)\n",
+    "    v2 = np.array([dict2.get(term, 0) for term in vocabulary], dtype=int)\n",
+    "\n",
+    "    return cos_sim(v1, v2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Similarity between two text documents: 0.9523809523809523\n"
+     ]
+    }
+   ],
+   "source": [
+    "# A notebook cell runs with __name__ == '__main__' (the saved output shows this branch executed),\n",
+    "# so the guard only matters if the code is exported and imported as a module.\n",
+    "if __name__ == '__main__':\n",
+    "    # Term-frequency tables for the two documents\n",
+    "    dict1 = process(\"text1.txt\")\n",
+    "    dict2 = process(\"text2.txt\")\n",
+    "    \n",
+    "    print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,2 @@
+This is a sample document. It contains text for testing the similarity.
+
@@ -0,0 +1,2 @@
+This document is a sample. It includes text to test the similarity.
+
@@ -0,0 +1,616 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "af1d39a1-915d-44e2-b06f-49777bfe4cf6",
+   "metadata": {},
+   "source": [
+    "# Practical-1.2\n",
+    "\n",
+    "Problem Statement: Implement Page Rank Algorithm.\n",
+    "\n",
+    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fcd4c298-e888-44ee-93d9-b9d3f3a9b05f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import libraries\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "6d446fd6-e2ab-46d4-b9ee-ea1baa3e0b76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Constants for PageRank\n",
+    "# threshold: iteration stops once the summed absolute change in the rank vector falls below it.\n",
+    "# beta: weight of the link-following term beta * M @ r; the remaining (1 - beta) scales the uniform term.\n",
+    "threshold = 1e-13\n",
+    "beta = 0.85"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "25966376-d37f-41ef-a1ca-adbdf5831bd3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Spider Trap Network represented as adjacency matrix\n",
+    "# Columns are normalised later, so column j is read as the out-links of node j\n",
+    "# (A[i][j] == 1 means node j links to node i). The last node links only to itself,\n",
+    "# which is presumably the spider trap the title refers to.\n",
+    "A = [\n",
+    "    [0, 0, 1, 0],\n",
+    "    [1, 0, 0, 0],\n",
+    "    [1, 1, 0, 0],\n",
+    "    [1, 1, 0, 1]\n",
+    "]\n",
+    "\n",
+    "# Convert adjacency matrix to a numpy array\n",
+    "# (float dtype so the normalised columns can hold fractions)\n",
+    "arr = np.array(A, dtype=float)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e9932efe-ba91-4bd8-9e1b-aa96ea1fbc5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Summation of columns:  [3.0, 2.0, 1.0, 1.0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate summation of columns\n",
+    "# s[j] is the total of column j of the adjacency array.\n",
+    "s = [np.sum(arr[:, col]) for col in range(len(A))]\n",
+    "\n",
+    "print(\"Summation of columns: \", s)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "5f41e472-4f23-4a83-ac92-737581dd566c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Column stochastic probability matrix, M:\n",
+      "[[0.         0.         1.         0.        ]\n",
+      " [0.33333333 0.         0.         0.        ]\n",
+      " [0.33333333 0.5        0.         0.        ]\n",
+      " [0.33333333 0.5        0.         1.        ]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create the column stochastic probability matrix, M\n",
+    "# Work on a copy so the raw adjacency array `arr` stays intact.\n",
+    "M = arr.copy()\n",
+    "for col, total in enumerate(s):\n",
+    "    # A column summing to zero is left as all zeros (prevents division by zero)\n",
+    "    if total != 0:\n",
+    "        M[:, col] /= total\n",
+    "\n",
+    "print(\"Column stochastic probability matrix, M:\")\n",
+    "print(M)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "e0c63b43-1825-4edb-873b-bab9d2e2f3d3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initial rank vector:\n",
+      "[[0.25]\n",
+      " [0.25]\n",
+      " [0.25]\n",
+      " [0.25]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Initialize rank vector\n",
+    "# Every node starts with the same rank 1/n, stored as an (n, 1) column vector.\n",
+    "r = np.ones((len(M), 1)) / len(M)\n",
+    "print(\"Initial rank vector:\")\n",
+    "print(r)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f540571b-5fd7-4ced-a8a5-7daeb4625f18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate the uniform rank contribution\n",
+    "# Built from the node count rather than from `r`: the iteration cell overwrites `r`,\n",
+    "# so a version derived from `r` stops being uniform if this cell is re-run afterwards.\n",
+    "uniformR = (1.0 - beta) * (np.ones((len(M), 1)) / len(M))\n",
+    "\n",
+    "# Starting point for the first iteration\n",
+    "r_prev = r.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "b0d7f809-f901-4bf0-9676-ea4ea976a33a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration:  1\n",
+      "The rank vector: \n",
+      "[[0.25      ]\n",
+      " [0.10833333]\n",
+      " [0.21458333]\n",
+      " [0.42708333]]\n",
+      "Iteration:  2\n",
+      "The rank vector: \n",
+      "[[0.21989583]\n",
+      " [0.10833333]\n",
+      " [0.154375  ]\n",
+      " [0.51739583]]\n",
+      "Iteration:  3\n",
+      "The rank vector: \n",
+      "[[0.16871875]\n",
+      " [0.09980382]\n",
+      " [0.14584549]\n",
+      " [0.58563194]]\n",
+      "Iteration:  4\n",
+      "The rank vector: \n",
+      "[[0.16146866]\n",
+      " [0.08530365]\n",
+      " [0.12772027]\n",
+      " [0.62550742]]\n",
+      "Iteration:  5\n",
+      "The rank vector: \n",
+      "[[0.14606223]\n",
+      " [0.08324945]\n",
+      " [0.1195035 ]\n",
+      " [0.65118481]]\n",
+      "Iteration:  6\n",
+      "The rank vector: \n",
+      "[[0.13907798]\n",
+      " [0.0788843 ]\n",
+      " [0.11426532]\n",
+      " [0.66777241]]\n",
+      "Iteration:  7\n",
+      "The rank vector: \n",
+      "[[0.13462552]\n",
+      " [0.07690543]\n",
+      " [0.11043125]\n",
+      " [0.6780378 ]]\n",
+      "Iteration:  8\n",
+      "The rank vector: \n",
+      "[[0.13136657]\n",
+      " [0.0756439 ]\n",
+      " [0.1083287 ]\n",
+      " [0.68466083]]\n",
+      "Iteration:  9\n",
+      "The rank vector: \n",
+      "[[0.1295794 ]\n",
+      " [0.07472053]\n",
+      " [0.10686918]\n",
+      " [0.68883089]]\n",
+      "Iteration:  10\n",
+      "The rank vector: \n",
+      "[[0.12833881]\n",
+      " [0.07421416]\n",
+      " [0.10597039]\n",
+      " [0.69147664]]\n",
+      "Iteration:  11\n",
+      "The rank vector: \n",
+      "[[0.12757483]\n",
+      " [0.07386266]\n",
+      " [0.10540368]\n",
+      " [0.69315883]]\n",
+      "Iteration:  12\n",
+      "The rank vector: \n",
+      "[[0.12709313]\n",
+      " [0.0736462 ]\n",
+      " [0.10503783]\n",
+      " [0.69422284]]\n",
+      "Iteration:  13\n",
+      "The rank vector: \n",
+      "[[0.12678216]\n",
+      " [0.07350972]\n",
+      " [0.10480936]\n",
+      " [0.69489877]]\n",
+      "Iteration:  14\n",
+      "The rank vector: \n",
+      "[[0.12658795]\n",
+      " [0.07342161]\n",
+      " [0.10466324]\n",
+      " [0.69532719]]\n",
+      "Iteration:  15\n",
+      "The rank vector: \n",
+      "[[0.12646376]\n",
+      " [0.07336659]\n",
+      " [0.10457077]\n",
+      " [0.69559889]]\n",
+      "Iteration:  16\n",
+      "The rank vector: \n",
+      "[[0.12638516]\n",
+      " [0.0733314 ]\n",
+      " [0.1045122 ]\n",
+      " [0.69577125]]\n",
+      "Iteration:  17\n",
+      "The rank vector: \n",
+      "[[0.12633537]\n",
+      " [0.07330913]\n",
+      " [0.10447497]\n",
+      " [0.69588053]]\n",
+      "Iteration:  18\n",
+      "The rank vector: \n",
+      "[[0.12630373]\n",
+      " [0.07329502]\n",
+      " [0.1044514 ]\n",
+      " [0.69594985]]\n",
+      "Iteration:  19\n",
+      "The rank vector: \n",
+      "[[0.12628369]\n",
+      " [0.07328606]\n",
+      " [0.10443644]\n",
+      " [0.69599382]]\n",
+      "Iteration:  20\n",
+      "The rank vector: \n",
+      "[[0.12627097]\n",
+      " [0.07328038]\n",
+      " [0.10442695]\n",
+      " [0.6960217 ]]\n",
+      "Iteration:  21\n",
+      "The rank vector: \n",
+      "[[0.12626291]\n",
+      " [0.07327678]\n",
+      " [0.10442094]\n",
+      " [0.69603938]]\n",
+      "Iteration:  22\n",
+      "The rank vector: \n",
+      "[[0.1262578 ]\n",
+      " [0.07327449]\n",
+      " [0.10441712]\n",
+      " [0.69605059]]\n",
+      "Iteration:  23\n",
+      "The rank vector: \n",
+      "[[0.12625455]\n",
+      " [0.07327304]\n",
+      " [0.1044147 ]\n",
+      " [0.6960577 ]]\n",
+      "Iteration:  24\n",
+      "The rank vector: \n",
+      "[[0.1262525 ]\n",
+      " [0.07327212]\n",
+      " [0.10441317]\n",
+      " [0.69606221]]\n",
+      "Iteration:  25\n",
+      "The rank vector: \n",
+      "[[0.12625119]\n",
+      " [0.07327154]\n",
+      " [0.10441219]\n",
+      " [0.69606508]]\n",
+      "Iteration:  26\n",
+      "The rank vector: \n",
+      "[[0.12625036]\n",
+      " [0.07327117]\n",
+      " [0.10441158]\n",
+      " [0.69606689]]\n",
+      "Iteration:  27\n",
+      "The rank vector: \n",
+      "[[0.12624984]\n",
+      " [0.07327094]\n",
+      " [0.10441118]\n",
+      " [0.69606804]]\n",
+      "Iteration:  28\n",
+      "The rank vector: \n",
+      "[[0.12624951]\n",
+      " [0.07327079]\n",
+      " [0.10441094]\n",
+      " [0.69606877]]\n",
+      "Iteration:  29\n",
+      "The rank vector: \n",
+      "[[0.1262493 ]\n",
+      " [0.07327069]\n",
+      " [0.10441078]\n",
+      " [0.69606923]]\n",
+      "Iteration:  30\n",
+      "The rank vector: \n",
+      "[[0.12624916]\n",
+      " [0.07327063]\n",
+      " [0.10441068]\n",
+      " [0.69606953]]\n",
+      "Iteration:  31\n",
+      "The rank vector: \n",
+      "[[0.12624908]\n",
+      " [0.0732706 ]\n",
+      " [0.10441062]\n",
+      " [0.69606971]]\n",
+      "Iteration:  32\n",
+      "The rank vector: \n",
+      "[[0.12624902]\n",
+      " [0.07327057]\n",
+      " [0.10441057]\n",
+      " [0.69606983]]\n",
+      "Iteration:  33\n",
+      "The rank vector: \n",
+      "[[0.12624899]\n",
+      " [0.07327056]\n",
+      " [0.10441055]\n",
+      " [0.69606991]]\n",
+      "Iteration:  34\n",
+      "The rank vector: \n",
+      "[[0.12624897]\n",
+      " [0.07327055]\n",
+      " [0.10441053]\n",
+      " [0.69606995]]\n",
+      "Iteration:  35\n",
+      "The rank vector: \n",
+      "[[0.12624895]\n",
+      " [0.07327054]\n",
+      " [0.10441052]\n",
+      " [0.69606998]]\n",
+      "Iteration:  36\n",
+      "The rank vector: \n",
+      "[[0.12624894]\n",
+      " [0.07327054]\n",
+      " [0.10441052]\n",
+      " [0.69607   ]]\n",
+      "Iteration:  37\n",
+      "The rank vector: \n",
+      "[[0.12624894]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607001]]\n",
+      "Iteration:  38\n",
+      "The rank vector: \n",
+      "[[0.12624894]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607002]]\n",
+      "Iteration:  39\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  40\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  41\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  42\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  43\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  44\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  45\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  46\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  47\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607003]]\n",
+      "Iteration:  48\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  49\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  50\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  51\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  52\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  53\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  54\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  55\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  56\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  57\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  58\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  59\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  60\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  61\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  62\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  63\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  64\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n",
+      "Iteration:  65\n",
+      "The rank vector: \n",
+      "[[0.12624893]\n",
+      " [0.07327053]\n",
+      " [0.10441051]\n",
+      " [0.69607004]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# PageRank iterations\n",
+    "# Repeats r = beta * M r_prev + uniformR for at most 1000 rounds, printing every step.\n",
+    "for iteration in range(1, 1001):\n",
+    "    print(\"Iteration: \", iteration)\n",
+    "    r = beta * (M @ r_prev) + uniformR\n",
+    "    print(\"The rank vector: \")\n",
+    "    print(r)\n",
+    "\n",
+    "    # Stop once the summed absolute change since the previous step is below the threshold\n",
+    "    diff = np.abs(r - r_prev).sum()\n",
+    "    if diff < threshold:\n",
+    "        break\n",
+    "    r_prev = r.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "9fddbce3-0f30-4912-bfaa-f71a2d00d385",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The final rank vector: \n",
+      "[0.12624893 0.07327053 0.10441051 0.69607004]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Display the final rank vector\n",
+    "# r is an (n, 1) column vector; taking column 0 prints it as a flat row.\n",
+    "print(\"The final rank vector: \")\n",
+    "print(r[:, 0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bcbaa397-957c-4e79-b68a-e2070ee11baf",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,144 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ca2da52f-4a43-4db5-bf5d-54bd3506f81e",
+   "metadata": {},
+   "source": [
+    "# Practical-1.3\n",
+    "\n",
+    "Problem Statement: Write a program for Pre-processing of a Text Document: stop word removal.\n",
+    "\n",
+    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f9085aa3-6fc3-432c-8a96-5e6dcb89a900",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import libraries\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.tokenize import word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "81c78019-0857-4e4a-8235-8d2db97de214",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /home/nonroot/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Download NLTK Resources\n",
+    "# punkt_tab is fetched too: recent NLTK releases load it in word_tokenize, so without it\n",
+    "# this notebook only runs on machines where that data happens to be cached already.\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('punkt_tab')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "58c711bf-c052-4314-8103-5f6ce43d41c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stop word removal function\n",
+    "def remove_stop_words(text):\n",
+    "    \"\"\"Return `text` with English stop words dropped.\n",
+    "\n",
+    "    The text is split with word_tokenize, tokens whose lower-cased form is in\n",
+    "    NLTK's English stop-word list are discarded, and the surviving tokens\n",
+    "    (original casing kept) are joined with single spaces.\n",
+    "    \"\"\"\n",
+    "    tokens = word_tokenize(text)\n",
+    "    english_stop_words = set(stopwords.words('english'))\n",
+    "\n",
+    "    kept = []\n",
+    "    for token in tokens:\n",
+    "        if token.lower() in english_stop_words:\n",
+    "            continue\n",
+    "        kept.append(token)\n",
+    "\n",
+    "    return ' '.join(kept)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "fb409348-1737-48ac-baad-7a9024914b57",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Original Text:\n",
+      "This is an example of a text document that needs stop word removal\n",
+      "\n",
+      "Preprocessed Text:\n",
+      "example text document needs stop word removal\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Main function\n",
+    "# Demonstrates remove_stop_words on a fixed sentence, printing the text before and after.\n",
+    "if __name__ == \"__main__\":\n",
+    "    input_text = \"This is an example of a text document that needs stop word removal\"\n",
+    "    preprocessed_text = remove_stop_words(input_text)\n",
+    "    print(\"Original Text:\")\n",
+    "    print(input_text)\n",
+    "    print(\"\\nPreprocessed Text:\")\n",
+    "    print(preprocessed_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54566bef-20a0-494b-9299-500417834bfd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
				`@@ -0,0 +1,2 @@`
				`This is a sample document. It contains text for testing the similarity.`
				`@@ -0,0 +1,2 @@`
				`This document is a sample. It includes text to test the similarity.`