diff --git a/Notebooks/Code-1.1/Code-1.1 (Document Similarity).ipynb b/Notebooks/Code-1.1/Code-1.1 (Document Similarity).ipynb new file mode 100644 index 0000000..1753a80 --- /dev/null +++ b/Notebooks/Code-1.1/Code-1.1 (Document Similarity).ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "946b7d7c-1e3a-4421-83ac-48c77a022c18", + "metadata": {}, + "source": [ + "# Practical-1.1\n", + "\n", + "Problem Statement: Write a program to Compute Similarity between two text documents.\n", + "\n", + "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "efe12052-a191-4760-9a75-a08d82b3d334", + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "import numpy as np\n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /home/nonroot/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt_tab to\n", + "[nltk_data] /home/nonroot/nltk_data...\n", + "[nltk_data] Package punkt_tab is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download necessary NLTK data\n", + "nltk.download(\"punkt\")\n", + "nltk.download(\"stopwords\")\n", + "nltk.download('punkt_tab')" + ] + }, + { + 
# Print contents of the two documents
def print_file_content(file):
    """Print the full text of *file* to standard output.

    Args:
        file: Path of the text file to display.
    """
    with open(file, 'r') as f:
        content = f.read()
    print(content)


# Guarded like the final cell of this notebook: inside Jupyter __name__ is
# '__main__', so the demo still runs there, while the helper functions stay
# importable on machines where the sample files are absent.
if __name__ == '__main__':
    print("Contents of text1.txt:")
    print_file_content("text1.txt")
    print("Contents of text2.txt:")
    print_file_content("text2.txt")


def process(file):
    """Build a term-frequency table for the text stored in *file*.

    The text is tokenized, lower-cased, stripped of English stop words and
    finally stemmed with the Porter stemmer.

    Args:
        file: Path of the text file to analyse.

    Returns:
        A ``defaultdict(int)`` mapping each stemmed token to the number of
        times it occurs in the file.
    """
    # Local import keeps this function independent of the notebook's import
    # cell; the standard-library class replaces the incidental
    # ``nltk.defaultdict`` re-export.
    from collections import defaultdict

    # Read the file; the context manager closes the handle deterministically
    with open(file) as f:
        raw = f.read()

    # Tokenize the raw text
    tokens = word_tokenize(raw)
    words = [w.lower() for w in tokens]

    # Remove stop words BEFORE stemming: the stop list holds unstemmed forms,
    # so a stemmed stop word (e.g. "this" -> "thi") would no longer match it.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in words if w not in stop_words]

    # Stem the remaining tokens and count them
    # NOTE(review): punctuation tokens produced by the tokenizer are still
    # counted as terms, exactly as before.
    porter = nltk.PorterStemmer()
    count = defaultdict(int)
    for word in filtered_tokens:
        count[porter.stem(word)] += 1

    return count


def cos_sim(a, b):
    """Return the cosine similarity of the vectors *a* and *b*.

    Args:
        a: First vector (any 1-D array-like of numbers).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm (the quotient would otherwise be nan).
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    denominator = norm_a * norm_b

    # A zero vector has no direction; report "no similarity" instead of nan
    if denominator == 0:
        return 0.0

    return np.dot(a, b) / denominator


def getSimilarity(dict1, dict2):
    """Return the cosine similarity of two term-frequency dictionaries.

    Args:
        dict1: Mapping of term -> count for the first document.
        dict2: Mapping of term -> count for the second document.

    Returns:
        The cosine similarity of the two count vectors built over the union
        of the terms of both dictionaries.
    """
    # Union of the terms, each exactly once, in first-seen order. Listing a
    # shared term twice would give it two vector components and inflate the
    # similarity score.
    all_words_list = list(dict.fromkeys([*dict1, *dict2]))

    # Create vectors for the dictionaries over the shared vocabulary
    v1 = np.array([dict1.get(key, 0) for key in all_words_list], dtype=int)
    v2 = np.array([dict2.get(key, 0) for key in all_words_list], dtype=int)

    return cos_sim(v1, v2)


if __name__ == '__main__':
    dict1 = process("text1.txt")
    dict2 = process("text2.txt")

    print("Similarity between two text documents:", getSimilarity(dict1, dict2))
# Import libraries
import numpy as np

# Constants for PageRank
threshold = 1e-13  # stop once the L1 change between two iterations is below this
beta = 0.85        # weight of the link-following term in each update


def column_stochastic(adjacency):
    """Return the column-stochastic matrix derived from *adjacency*.

    Every column is divided by its sum. A column whose sum is zero (a
    dangling node: no non-zero entry in that column) is replaced by the
    uniform distribution ``1/n`` so that every column of the result sums to
    one and no rank mass is lost during the iterations.

    Args:
        adjacency: Square array-like; the caller's input is not modified.

    Returns:
        A new float ``ndarray`` of the same shape.
    """
    matrix = np.array(adjacency, dtype=float)  # np.array copies its input
    column_sums = matrix.sum(axis=0)

    dangling = column_sums == 0
    matrix[:, dangling] = 1.0 / len(matrix)
    column_sums[dangling] = 1.0  # these columns are already normalized

    # Broadcasting divides column j by column_sums[j]
    return matrix / column_sums


# Spider Trap Network represented as adjacency matrix
A = [
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [1, 1, 0, 0],
    [1, 1, 0, 1]
]

# Convert adjacency matrix to a numpy array
arr = np.array(A, dtype=float)

# Calculate summation of columns (plain floats, so the list prints the same
# way on every NumPy version)
s = arr.sum(axis=0).tolist()

print("Summation of columns: ", s)

# Create the column stochastic probability matrix, M
M = column_stochastic(arr)

print("Column stochastic probability matrix, M:")
print(M)

# Initialize rank vector: every page starts with the same rank 1/n
r = np.full((len(M), 1), 1.0 / len(M))
print("Initial rank vector:")
print(r)

# Calculate the uniform rank contribution; it is the same in every iteration
uniformR = (1.0 - beta) * r
r_prev = r.copy()

# PageRank iterations
for i in range(1, 1001):
    print("Iteration: ", i)
    r = beta * np.matmul(M, r_prev) + uniformR
    print("The rank vector: ")
    print(r)

    # L1 distance between two successive rank vectors
    diff = np.sum(abs(r - r_prev))
    if diff < threshold:
        break
    r_prev = r.copy()
else:
    # Only reached when the loop ran out of iterations without a break
    print("Warning: PageRank did not converge; last change was", diff)

# Display the final rank vector
print("The final rank vector: ")
print(r[:, 0])
# Import libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK Resources
nltk.download('punkt')
nltk.download('stopwords')
# Same resource set as the document-similarity notebook, which also calls
# word_tokenize. NOTE(review): newer NLTK releases appear to load the
# tokenizer data from 'punkt_tab' - confirm against the installed version.
nltk.download('punkt_tab')


# Stop word removal function
def remove_stop_words(text):
    """Return *text* with English stop words removed.

    Args:
        text: The input string to pre-process.

    Returns:
        The remaining tokens joined by single spaces. Tokens keep their
        original casing; only the stop-word comparison is case-insensitive.
    """
    # Tokenizing the text into words
    words = word_tokenize(text)

    # Defining the English stop words (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))

    # Removing stop words from the text
    filtered_words = [word for word in words if word.lower() not in stop_words]

    return ' '.join(filtered_words)


# Main function
if __name__ == "__main__":
    input_text = "This is an example of a text document that needs stop word removal"
    preprocessed_text = remove_stop_words(input_text)
    print("Original Text:")
    print(input_text)
    print("\nPreprocessed Text:")
    print(preprocessed_text)