{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6ed65106-23d2-4261-bf81-2a5b4b5ec60e",
   "metadata": {},
   "source": [
    "# Notebook-A7 (Text Analytics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "127a35f5-2434-4f6a-8904-edb4fb4f6f29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries (explicit names instead of wildcard imports, so it is clear where each name comes from)\n",
    "import re\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "555c6d33-75bb-4033-a9fa-60e145527464",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download resources\n",
    "# Only the resources this notebook uses are fetched.\n",
    "# nltk.download('all') also works, but WARNING: ABOUT 2GBs.\n",
    "NLTK_RESOURCES = [\n",
    "    'punkt',                           # Punkt models for splitting text into sentences or words\n",
    "    'punkt_tab',                       # Punkt models in the tab-separated format used by newer NLTK releases\n",
    "    'stopwords',                       # Common stop words\n",
    "    'wordnet',                         # WordNet lexical database (used by WordNetLemmatizer)\n",
    "    'omw-1.4',                         # Open Multilingual WordNet; some NLTK releases need it alongside wordnet\n",
    "    'averaged_perceptron_tagger',      # Part-of-speech (POS) tagger\n",
    "    'averaged_perceptron_tagger_eng',  # POS tagger resource name used by newer NLTK releases\n",
    "]\n",
    "for resource in NLTK_RESOURCES:\n",
    "    nltk.download(resource, quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "ba67d90c-5711-496a-bf81-a5aef68a01bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write text to perform preprocessing on\n",
    "text = \"Hello everyone! I am first name last name. I am a loyal KSKA Git user all the way from Sangamwadi Empire. I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain. For every smart contract, I lose one strand of my hair. In my free time, which by the way, I barely get, I like to swim.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "fa8d4d18-ba91-4ced-9522-849be18aba6a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hello everyone!', 'I am first name last name.', 'I am a loyal KSKA Git user all the way from Sangamwadi Empire.', 'I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain.', 'For every smart contract, I lose one strand of my hair.', 'In my free time, which by the way, I barely get, I like to swim.']\n"
     ]
    }
   ],
   "source": [
    "# Sentence tokenization\n",
    "sentences = sent_tokenize(text)\n",
    "print(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "a53cc954-e60f-41b8-8e15-09fdc5b80328",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hello', 'everyone', '!', 'I', 'am', 'first', 'name', 'last', 'name', '.', 'I', 'am', 'a', 'loyal', 'KSKA', 'Git', 'user', 'all', 'the', 'way', 'from', 'Sangamwadi', 'Empire', '.', 'I', 'have', 'considerable', 'knowledge', 'about', 'life', ',', 'Python', ',', 'C++', ',', 'Java', ',', 'Rust', ',', 'Golang', 'and', 'Blockchain', '.', 'For', 'every', 'smart', 'contract', ',', 'I', 'lose', 'one', 'strand', 'of', 'my', 'hair', '.', 'In', 'my', 'free', 'time', ',', 'which', 'by', 'the', 'way', ',', 'I', 'barely', 'get', ',', 'I', 'like', 'to', 'swim', '.']\n"
     ]
    }
   ],
   "source": [
    "# Word tokenization\n",
    "words = word_tokenize(text)\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f259c7c3-9e94-42cb-bb94-a81176dc3126",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After removing punctuation from text:\n",
      " Hello everyone I am first name last name I am a loyal KSKA Git user all the way from Sangamwadi Empire I have considerable knowledge about life Python C Java Rust Golang and Blockchain For every smart contract I lose one strand of my hair In my free time which by the way I barely get I like to swim \n"
     ]
    }
   ],
   "source": [
    "# Removing punctuation\n",
    "# NOTE: the pattern keeps ASCII letters only, so digits and symbols are blanked out too (e.g. 'C++' becomes 'C').\n",
    "# The result gets its own name so that `text` stays intact and this cell can be re-run safely.\n",
    "text_letters_only = re.sub('[^a-zA-Z]', ' ', text)\n",
    "print(\"After removing punctuation from text:\\n\", text_letters_only)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "092b3e63-3161-4b23-be8d-ff83a829205f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Removing stop words\n",
    "stop_words = set(stopwords.words('english'))\n",
    "# A set of strings prints in an arbitrary order that can differ between runs, so print it sorted\n",
    "print(\"Stop words:\\n\", sorted(stop_words))\n",
    "print(\"==============================================================\")\n",
    "tokens = word_tokenize(text_letters_only.lower())\n",
    "filtered_text = [word for word in tokens if word not in stop_words]\n",
    "print(\"Tokenized Sentence:\\n\", tokens)\n",
    "print(\"\\nFiltered Sentence:\\n\", filtered_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1b6d55c9-5724-4abb-bcb2-4fc5d27cbe12",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "write\n",
      "write\n",
      "wrote\n",
      "write\n",
      "read\n",
      "read\n"
     ]
    }
   ],
   "source": [
    "# Stemming\n",
    "words_to_stem = [\"write\", \"writing\", \"wrote\", \"writes\", \"reading\", \"reads\"]\n",
    "stemmer = PorterStemmer()  # strips common suffixes by rule; irregular forms such as 'wrote' are left unchanged\n",
    "for word in words_to_stem:\n",
    "    print(stemmer.stem(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e8f62c1-d4ae-48a8-8d3e-a86366ed7972",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lemmatization\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "lemma_text = \"studies studying cries cry\"\n",
    "lemma_tokens = word_tokenize(lemma_text)\n",
    "print(\"Text is:\\t\", lemma_tokens)\n",
    "for word in lemma_tokens:\n",
    "    # lemmatize() treats the word as a noun unless a part of speech is given,\n",
    "    # which leaves verb forms such as 'studying' untouched; the second line shows the verb reading\n",
    "    print(\"Lemma for {} is {}\".format(word, lemmatizer.lemmatize(word)))\n",
    "    print(\"Lemma for {} as a verb is {}\".format(word, lemmatizer.lemmatize(word, pos='v')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c2ff8017-d03a-412f-b1f8-e2fb0b70bfca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Hello', 'NNP'),\n",
       " ('everyone', 'NN'),\n",
       " ('this', 'DT'),\n",
       " ('is', 'VBZ'),\n",
       " ('a', 'DT'),\n",
       " ('sample', 'JJ'),\n",
       " ('text', 'NN'),\n",
       " ('!', '.'),\n",
       " ('Earth', 'NN'),\n",
       " ('.', '.')]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# POS Tagging\n",
    "pos_text = \"Hello everyone this is a sample text! Earth.\"\n",
    "pos_tokens = word_tokenize(pos_text)\n",
    "nltk.pos_tag(pos_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "1d71007a-a8cb-45ab-af27-ec69b6826ddd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TF-IDF\n",
      "['an' 'example' 'frequency' 'is' 'meow' 'of' 'term' 'this'] [[0.1767767 0.1767767 0.1767767 0.1767767 0.88388348 0.1767767\n",
      " 0.1767767 0.1767767 ]]\n"
     ]
    }
   ],
   "source": [
    "# TF-IDF (Term Frequency & Inverse Document Frequency)\n",
    "new_sentence = \"This is an example of term frequency. Meow meow meow meow meow!\"\n",
    "\n",
    "def calculate_tfIdf(document):\n",
    "    \"\"\"Fit a TfidfVectorizer on `document` and return (tf-idf matrix, feature names).\n",
    "\n",
    "    `document` is a list of text documents; a single string is treated as a one-document corpus.\n",
    "    \"\"\"\n",
    "    if isinstance(document, str):\n",
    "        document = [document]\n",
    "    vectorizer = TfidfVectorizer()\n",
    "    tf_matrix = vectorizer.fit_transform(document)\n",
    "    features_names = vectorizer.get_feature_names_out()\n",
    "    return tf_matrix, features_names\n",
    "\n",
    "# Wrap the new_sentence in a list\n",
    "# (with a single document the IDF factor is the same for every term, so the scores reflect term frequency only)\n",
    "document = [new_sentence]\n",
    "tf_matrix, feature_names = calculate_tfIdf(document)\n",
    "\n",
    "print('TF-IDF')\n",
    "print(feature_names, tf_matrix.toarray())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b34b21ba-46d6-4bad-b001-aa9962cc17b0",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}