{ "cells": [ { "cell_type": "markdown", "id": "ca2da52f-4a43-4db5-bf5d-54bd3506f81e", "metadata": {}, "source": [ "# Code-1.3\n", "\n", "Problem Statement: Write a program for Pre-processing of a Text Document: stop word removal.\n", "\n", "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n", "\n", "---" ] }, { "cell_type": "code", "execution_count": 1, "id": "f9085aa3-6fc3-432c-8a96-5e6dcb89a900", "metadata": {}, "outputs": [], "source": [ "# Import libraries\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize" ] }, { "cell_type": "code", "execution_count": 2, "id": "81c78019-0857-4e4a-8235-8d2db97de214", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /home/nonroot/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Download NLTK Resources\n", "nltk.download('punkt')\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": 3, "id": "58c711bf-c052-4314-8103-5f6ce43d41c0", "metadata": {}, "outputs": [], "source": [ "# Stop word removal function\n", "def remove_stop_words(text):\n", " # Tokenizing the text into words\n", " words = word_tokenize(text)\n", " \n", " # Defining the English stop words\n", " stop_words = set(stopwords.words('english'))\n", " \n", " # Removing stop words from the text\n", " filtered_words = [word for word in words if word.lower() not in stop_words]\n", " \n", " return ' '.join(filtered_words)" ] }, { "cell_type": "code", "execution_count": 5, "id": "fb409348-1737-48ac-baad-7a9024914b57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original Text:\n", "This is an example of a text document that needs stop word removal\n", "\n", "Preprocessed Text:\n", "example text document needs stop word removal\n" ] } ], "source": [ "# Main function\n", "if __name__ == \"__main__\":\n", " input_text = \"This is an example of a text document that needs stop word removal\"\n", " preprocessed_text = remove_stop_words(input_text)\n", " print(\"Original Text:\")\n", " print(input_text)\n", " print(\"\\nPreprocessed Text:\")\n", " print(preprocessed_text)" ] }, { "cell_type": "code", "execution_count": null, "id": "54566bef-20a0-494b-9299-500417834bfd", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.20" } }, "nbformat": 4, "nbformat_minor": 5 }