Added jupyter notebooks for 1.1, 1.2, 1.3.

This commit is contained in:
K
2025-10-12 22:55:54 +05:30
parent a4f5326402
commit 68587b84e0
5 changed files with 994 additions and 0 deletions
@@ -0,0 +1,230 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
"metadata": {},
"source": [
"# Practical-1.1\n",
"\n",
"Problem Statement: Write a program to Compute Similarity between two text documents.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "efe12052-a191-4760-9a75-a08d82b3d334",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"import numpy as np\n",
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download necessary NLTK data\n",
"nltk.download(\"punkt\")\n",
"nltk.download(\"stopwords\")\n",
"nltk.download('punkt_tab')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Contents of text1.txt:\n",
"This is a sample document. It contains text for testing the similarity.\n",
"\n",
"\n",
"Contents of text2.txt:\n",
"This document is a sample. It includes text to test the similarity.\n",
"\n",
"\n"
]
}
],
"source": [
"# Print contents of the two documents\n",
"def print_file_content(file):\n",
" with open(file, 'r') as f:\n",
" content = f.read()\n",
" print(content)\n",
"\n",
"print(\"Contents of text1.txt:\")\n",
"print_file_content(\"text1.txt\")\n",
"print(\"Contents of text2.txt:\")\n",
"print_file_content(\"text2.txt\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
"metadata": {},
"outputs": [],
"source": [
"def process(file):\n",
" # Read the file\n",
" raw = open(file).read()\n",
" \n",
" # Tokenize the raw text\n",
" tokens = word_tokenize(raw)\n",
" words = [w.lower() for w in tokens]\n",
" \n",
" # Stem the tokens\n",
" porter = nltk.PorterStemmer()\n",
" stemmed_tokens = [porter.stem(t) for t in words]\n",
"\n",
" # Removing stop words\n",
" stop_words = set(stopwords.words('english'))\n",
" filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]\n",
" \n",
" # Count words\n",
" count = nltk.defaultdict(int)\n",
" for word in filtered_tokens:\n",
" count[word] += 1\n",
" \n",
" return count"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
"metadata": {},
"outputs": [],
"source": [
"def cos_sim(a, b):\n",
" dot_product = np.dot(a, b)\n",
" norm_a = np.linalg.norm(a)\n",
" norm_b = np.linalg.norm(b)\n",
" \n",
" return dot_product / (norm_a * norm_b)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
"metadata": {},
"outputs": [],
"source": [
"def getSimilarity(dict1, dict2):\n",
" all_words_list = []\n",
" \n",
" # Collect all unique words from both dictionaries\n",
" for key in dict1:\n",
" all_words_list.append(key)\n",
" \n",
" for key in dict2:\n",
" all_words_list.append(key)\n",
" \n",
" all_words_list_size = len(all_words_list)\n",
" v1 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
" v2 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
" \n",
" # Create vectors for the dictionaries\n",
" for i, key in enumerate(all_words_list):\n",
" v1[i] = dict1.get(key, 0)\n",
" v2[i] = dict2.get(key, 0)\n",
" \n",
" return cos_sim(v1, v2)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Similarity between two text documents: 0.9523809523809523\n"
]
}
],
"source": [
"if __name__ == '__main__':\n",
" dict1 = process(\"text1.txt\")\n",
" dict2 = process(\"text2.txt\")\n",
" \n",
" print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
]
},
{
"cell_type": "markdown",
"id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+2
View File
@@ -0,0 +1,2 @@
This is a sample document. It contains text for testing the similarity.
+2
View File
@@ -0,0 +1,2 @@
This document is a sample. It includes text to test the similarity.
@@ -0,0 +1,616 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "af1d39a1-915d-44e2-b06f-49777bfe4cf6",
"metadata": {},
"source": [
"# Practical-1.2\n",
"\n",
"Problem Statement: Implement Page Rank Algorithm.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fcd4c298-e888-44ee-93d9-b9d3f3a9b05f",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "6d446fd6-e2ab-46d4-b9ee-ea1baa3e0b76",
"metadata": {},
"outputs": [],
"source": [
"# Constants for PageRank\n",
"threshold = 1e-13\n",
"beta = 0.85"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "25966376-d37f-41ef-a1ca-adbdf5831bd3",
"metadata": {},
"outputs": [],
"source": [
"# Spider Trap Network represented as adjacency matrix\n",
"A = [\n",
" [0, 0, 1, 0],\n",
" [1, 0, 0, 0],\n",
" [1, 1, 0, 0],\n",
" [1, 1, 0, 1]\n",
"]\n",
"\n",
"# Convert adjacency matrix to a numpy array\n",
"arr = np.array(A, dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e9932efe-ba91-4bd8-9e1b-aa96ea1fbc5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Summation of columns: [3.0, 2.0, 1.0, 1.0]\n"
]
}
],
"source": [
"# Calculate summation of columns\n",
"s = []\n",
"for i in range(len(A)):\n",
" s.append(np.sum(arr[:, i]))\n",
"\n",
"print(\"Summation of columns: \", s)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "5f41e472-4f23-4a83-ac92-737581dd566c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column stochastic probability matrix, M:\n",
"[[0. 0. 1. 0. ]\n",
" [0.33333333 0. 0. 0. ]\n",
" [0.33333333 0.5 0. 0. ]\n",
" [0.33333333 0.5 0. 1. ]]\n"
]
}
],
"source": [
"# Create the column stochastic probability matrix, M\n",
"M = arr.copy()\n",
"for j in range(len(A)):\n",
" if s[j] != 0: # Prevent division by zero\n",
" M[:, j] = M[:, j] / s[j]\n",
"\n",
"print(\"Column stochastic probability matrix, M:\")\n",
"print(M)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e0c63b43-1825-4edb-873b-bab9d2e2f3d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial rank vector:\n",
"[[0.25]\n",
" [0.25]\n",
" [0.25]\n",
" [0.25]]\n"
]
}
],
"source": [
"# Initialize rank vector\n",
"r = (1.0 + np.zeros([len(M), 1])) / len(M)\n",
"print(\"Initial rank vector:\")\n",
"print(r)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f540571b-5fd7-4ced-a8a5-7daeb4625f18",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the uniform rank contribution\n",
"uniformR = (1.0 - beta) * r\n",
"r_prev = r.copy()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b0d7f809-f901-4bf0-9676-ea4ea976a33a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration: 1\n",
"The rank vector: \n",
"[[0.25 ]\n",
" [0.10833333]\n",
" [0.21458333]\n",
" [0.42708333]]\n",
"Iteration: 2\n",
"The rank vector: \n",
"[[0.21989583]\n",
" [0.10833333]\n",
" [0.154375 ]\n",
" [0.51739583]]\n",
"Iteration: 3\n",
"The rank vector: \n",
"[[0.16871875]\n",
" [0.09980382]\n",
" [0.14584549]\n",
" [0.58563194]]\n",
"Iteration: 4\n",
"The rank vector: \n",
"[[0.16146866]\n",
" [0.08530365]\n",
" [0.12772027]\n",
" [0.62550742]]\n",
"Iteration: 5\n",
"The rank vector: \n",
"[[0.14606223]\n",
" [0.08324945]\n",
" [0.1195035 ]\n",
" [0.65118481]]\n",
"Iteration: 6\n",
"The rank vector: \n",
"[[0.13907798]\n",
" [0.0788843 ]\n",
" [0.11426532]\n",
" [0.66777241]]\n",
"Iteration: 7\n",
"The rank vector: \n",
"[[0.13462552]\n",
" [0.07690543]\n",
" [0.11043125]\n",
" [0.6780378 ]]\n",
"Iteration: 8\n",
"The rank vector: \n",
"[[0.13136657]\n",
" [0.0756439 ]\n",
" [0.1083287 ]\n",
" [0.68466083]]\n",
"Iteration: 9\n",
"The rank vector: \n",
"[[0.1295794 ]\n",
" [0.07472053]\n",
" [0.10686918]\n",
" [0.68883089]]\n",
"Iteration: 10\n",
"The rank vector: \n",
"[[0.12833881]\n",
" [0.07421416]\n",
" [0.10597039]\n",
" [0.69147664]]\n",
"Iteration: 11\n",
"The rank vector: \n",
"[[0.12757483]\n",
" [0.07386266]\n",
" [0.10540368]\n",
" [0.69315883]]\n",
"Iteration: 12\n",
"The rank vector: \n",
"[[0.12709313]\n",
" [0.0736462 ]\n",
" [0.10503783]\n",
" [0.69422284]]\n",
"Iteration: 13\n",
"The rank vector: \n",
"[[0.12678216]\n",
" [0.07350972]\n",
" [0.10480936]\n",
" [0.69489877]]\n",
"Iteration: 14\n",
"The rank vector: \n",
"[[0.12658795]\n",
" [0.07342161]\n",
" [0.10466324]\n",
" [0.69532719]]\n",
"Iteration: 15\n",
"The rank vector: \n",
"[[0.12646376]\n",
" [0.07336659]\n",
" [0.10457077]\n",
" [0.69559889]]\n",
"Iteration: 16\n",
"The rank vector: \n",
"[[0.12638516]\n",
" [0.0733314 ]\n",
" [0.1045122 ]\n",
" [0.69577125]]\n",
"Iteration: 17\n",
"The rank vector: \n",
"[[0.12633537]\n",
" [0.07330913]\n",
" [0.10447497]\n",
" [0.69588053]]\n",
"Iteration: 18\n",
"The rank vector: \n",
"[[0.12630373]\n",
" [0.07329502]\n",
" [0.1044514 ]\n",
" [0.69594985]]\n",
"Iteration: 19\n",
"The rank vector: \n",
"[[0.12628369]\n",
" [0.07328606]\n",
" [0.10443644]\n",
" [0.69599382]]\n",
"Iteration: 20\n",
"The rank vector: \n",
"[[0.12627097]\n",
" [0.07328038]\n",
" [0.10442695]\n",
" [0.6960217 ]]\n",
"Iteration: 21\n",
"The rank vector: \n",
"[[0.12626291]\n",
" [0.07327678]\n",
" [0.10442094]\n",
" [0.69603938]]\n",
"Iteration: 22\n",
"The rank vector: \n",
"[[0.1262578 ]\n",
" [0.07327449]\n",
" [0.10441712]\n",
" [0.69605059]]\n",
"Iteration: 23\n",
"The rank vector: \n",
"[[0.12625455]\n",
" [0.07327304]\n",
" [0.1044147 ]\n",
" [0.6960577 ]]\n",
"Iteration: 24\n",
"The rank vector: \n",
"[[0.1262525 ]\n",
" [0.07327212]\n",
" [0.10441317]\n",
" [0.69606221]]\n",
"Iteration: 25\n",
"The rank vector: \n",
"[[0.12625119]\n",
" [0.07327154]\n",
" [0.10441219]\n",
" [0.69606508]]\n",
"Iteration: 26\n",
"The rank vector: \n",
"[[0.12625036]\n",
" [0.07327117]\n",
" [0.10441158]\n",
" [0.69606689]]\n",
"Iteration: 27\n",
"The rank vector: \n",
"[[0.12624984]\n",
" [0.07327094]\n",
" [0.10441118]\n",
" [0.69606804]]\n",
"Iteration: 28\n",
"The rank vector: \n",
"[[0.12624951]\n",
" [0.07327079]\n",
" [0.10441094]\n",
" [0.69606877]]\n",
"Iteration: 29\n",
"The rank vector: \n",
"[[0.1262493 ]\n",
" [0.07327069]\n",
" [0.10441078]\n",
" [0.69606923]]\n",
"Iteration: 30\n",
"The rank vector: \n",
"[[0.12624916]\n",
" [0.07327063]\n",
" [0.10441068]\n",
" [0.69606953]]\n",
"Iteration: 31\n",
"The rank vector: \n",
"[[0.12624908]\n",
" [0.0732706 ]\n",
" [0.10441062]\n",
" [0.69606971]]\n",
"Iteration: 32\n",
"The rank vector: \n",
"[[0.12624902]\n",
" [0.07327057]\n",
" [0.10441057]\n",
" [0.69606983]]\n",
"Iteration: 33\n",
"The rank vector: \n",
"[[0.12624899]\n",
" [0.07327056]\n",
" [0.10441055]\n",
" [0.69606991]]\n",
"Iteration: 34\n",
"The rank vector: \n",
"[[0.12624897]\n",
" [0.07327055]\n",
" [0.10441053]\n",
" [0.69606995]]\n",
"Iteration: 35\n",
"The rank vector: \n",
"[[0.12624895]\n",
" [0.07327054]\n",
" [0.10441052]\n",
" [0.69606998]]\n",
"Iteration: 36\n",
"The rank vector: \n",
"[[0.12624894]\n",
" [0.07327054]\n",
" [0.10441052]\n",
" [0.69607 ]]\n",
"Iteration: 37\n",
"The rank vector: \n",
"[[0.12624894]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607001]]\n",
"Iteration: 38\n",
"The rank vector: \n",
"[[0.12624894]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607002]]\n",
"Iteration: 39\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 40\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 41\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 42\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 43\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 44\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 45\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 46\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 47\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 48\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 49\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 50\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 51\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 52\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 53\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 54\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 55\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 56\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 57\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 58\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 59\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 60\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 61\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 62\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 63\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 64\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 65\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n"
]
}
],
"source": [
"# PageRank iterations\n",
"for i in range(1, 1001):\n",
" print(\"Iteration: \", i)\n",
" r = beta * np.matmul(M, r_prev) + uniformR\n",
" print(\"The rank vector: \")\n",
" print(r)\n",
"\n",
" diff = np.sum(abs(r - r_prev))\n",
" if diff < threshold:\n",
" break\n",
" r_prev = r.copy()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "9fddbce3-0f30-4912-bfaa-f71a2d00d385",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The final rank vector: \n",
"[0.12624893 0.07327053 0.10441051 0.69607004]\n"
]
}
],
"source": [
"# Display the final rank vector\n",
"print(\"The final rank vector: \")\n",
"print(r[:, 0])"
]
},
{
"cell_type": "markdown",
"id": "bcbaa397-957c-4e79-b68a-e2070ee11baf",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+144
View File
@@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ca2da52f-4a43-4db5-bf5d-54bd3506f81e",
"metadata": {},
"source": [
"# Code-1.3\n",
"\n",
"Problem Statement: Write a program for Pre-processing of a Text Document: stop word removal.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f9085aa3-6fc3-432c-8a96-5e6dcb89a900",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "81c78019-0857-4e4a-8235-8d2db97de214",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download NLTK Resources\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "58c711bf-c052-4314-8103-5f6ce43d41c0",
"metadata": {},
"outputs": [],
"source": [
"# Stop word removal function\n",
"def remove_stop_words(text):\n",
" # Tokenizing the text into words\n",
" words = word_tokenize(text)\n",
" \n",
" # Defining the English stop words\n",
" stop_words = set(stopwords.words('english'))\n",
" \n",
" # Removing stop words from the text\n",
" filtered_words = [word for word in words if word.lower() not in stop_words]\n",
" \n",
" return ' '.join(filtered_words)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fb409348-1737-48ac-baad-7a9024914b57",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original Text:\n",
"This is an example of a text document that needs stop word removal\n",
"\n",
"Preprocessed Text:\n",
"example text document needs stop word removal\n"
]
}
],
"source": [
"# Main function\n",
"if __name__ == \"__main__\":\n",
" input_text = \"This is an example of a text document that needs stop word removal\"\n",
" preprocessed_text = remove_stop_words(input_text)\n",
" print(\"Original Text:\")\n",
" print(input_text)\n",
" print(\"\\nPreprocessed Text:\")\n",
" print(preprocessed_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54566bef-20a0-494b-9299-500417834bfd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}