DataScienceAndBigDataAnalytics/Notebooks/Notebook-A7 (Text Analytics).ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6ed65106-23d2-4261-bf81-2a5b4b5ec60e",
   "metadata": {},
   "source": [
    "# Notebook-A7 (Text Analytics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "127a35f5-2434-4f6a-8904-edb4fb4f6f29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import nltk\n",
    "from nltk.tokenize import *\n",
    "from nltk.corpus import *\n",
    "from nltk.stem import *\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "555c6d33-75bb-4033-a9fa-60e145527464",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download only the NLTK resources this notebook uses.\n",
    "# nltk.download('all') would also work, but it fetches every corpus and model (about 2 GB).\n",
    "NLTK_RESOURCES = [\n",
    "    'punkt',                           # Punkt models for splitting text into sentences or words\n",
    "    'punkt_tab',                       # Punkt models in the tabular file format that newer NLTK releases load\n",
    "    'stopwords',                       # Common stop words\n",
    "    'wordnet',                         # WordNet database used by WordNetLemmatizer\n",
    "    'omw-1.4',                         # Open Multilingual Wordnet; some NLTK releases need it next to wordnet\n",
    "    'averaged_perceptron_tagger',      # Part-of-speech (POS) tagger model\n",
    "    'averaged_perceptron_tagger_eng',  # English POS tagger model that newer NLTK releases load\n",
    "]\n",
    "for resource in NLTK_RESOURCES:\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "ba67d90c-5711-496a-bf81-a5aef68a01bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample paragraph that the preprocessing cells below operate on.\n",
    "# Adjacent string literals are joined by Python into one string, one sentence per line.\n",
    "text = (\n",
    "    \"Hello everyone! \"\n",
    "    \"I am first name last name. \"\n",
    "    \"I am a loyal KSKA Git user all the way from Sangamwadi Empire. \"\n",
    "    \"I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain. \"\n",
    "    \"For every smart contract, I lose one strand of my hair. \"\n",
    "    \"In my free time, which by the way, I barely get, I like to swim.\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "fa8d4d18-ba91-4ced-9522-849be18aba6a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hello everyone!', 'I am first name last name.', 'I am a loyal KSKA Git user all the way from Sangamwadi Empire.', 'I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain.', 'For every smart contract, I lose one strand of my hair.', 'In my free time, which by the way, I barely get, I like to swim.']\n"
     ]
    }
   ],
   "source": [
    "# Sentence tokenization: split the paragraph into a list of sentence strings\n",
    "var1 = nltk.sent_tokenize(text)\n",
    "print(var1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "a53cc954-e60f-41b8-8e15-09fdc5b80328",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hello', 'everyone', '!', 'I', 'am', 'first', 'name', 'last', 'name', '.', 'I', 'am', 'a', 'loyal', 'KSKA', 'Git', 'user', 'all', 'the', 'way', 'from', 'Sangamwadi', 'Empire', '.', 'I', 'have', 'considerable', 'knowledge', 'about', 'life', ',', 'Python', ',', 'C++', ',', 'Java', ',', 'Rust', ',', 'Golang', 'and', 'Blockchain', '.', 'For', 'every', 'smart', 'contract', ',', 'I', 'lose', 'one', 'strand', 'of', 'my', 'hair', '.', 'In', 'my', 'free', 'time', ',', 'which', 'by', 'the', 'way', ',', 'I', 'barely', 'get', ',', 'I', 'like', 'to', 'swim', '.']\n"
     ]
    }
   ],
   "source": [
    "# Word tokenization: split the paragraph into word and punctuation tokens\n",
    "var2 = nltk.word_tokenize(text)\n",
    "print(var2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f259c7c3-9e94-42cb-bb94-a81176dc3126",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After removing punctuation from text:\n",
      " Hello everyone  I am first name last name  I am a loyal KSKA Git user all the way from Sangamwadi Empire  I have considerable knowledge about life  Python  C    Java  Rust  Golang and Blockchain  For every smart contract  I lose one strand of my hair  In my free time  which by the way  I barely get  I like to swim \n"
     ]
    }
   ],
   "source": [
    "# Removing punctuation: every character that is not an ASCII letter (digits and\n",
    "# symbols included) is replaced by a space, so \"C++,\" is reduced to \"C\" plus spaces\n",
    "non_letters = re.compile('[^a-zA-Z]')\n",
    "text = non_letters.sub(' ', text)\n",
    "print(\"After removing punctuation from text:\\n\", text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "092b3e63-3161-4b23-be8d-ff83a829205f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stop words:\n",
      " {'do', 'most', 'more', 'am', 'aren', 'other', \"shouldn't\", 's', 'now', 'again', 'here', 'off', \"we're\", 'during', 'haven', 'above', \"we've\", 'our', \"he'll\", 'whom', 'ain', 'is', \"she'll\", 'once', \"that'll\", \"needn't\", 'shan', 'weren', 'been', 'doing', 'wasn', 'needn', 'any', 'not', \"aren't\", \"won't\", 'myself', 'couldn', 'by', 'were', 'no', \"he's\", \"shan't\", 'very', \"i'd\", 'y', 'm', 'your', 'against', 'are', 'she', \"hasn't\", \"she'd\", \"you'll\", 'because', 'mightn', 'their', \"they'd\", 'nor', 'having', 'into', 'so', \"it's\", 'don', 'who', \"haven't\", 'his', 'what', 'why', 'we', 'i', \"i'm\", 'hadn', 'over', 'and', 'her', 'to', 'ma', 'a', 'it', \"isn't\", 'under', 'o', 'until', 'an', 'same', 'them', 'did', \"they're\", 'ourselves', 'as', 'its', \"wasn't\", 'doesn', 'just', 'yourselves', 'll', 'down', 'itself', \"i've\", 'should', 'shouldn', \"mightn't\", 'on', 'these', 'or', 'only', 'd', 'hasn', 'about', 'wouldn', \"couldn't\", 're', 'mustn', 'with', \"you'd\", 'few', 'in', 'the', 'out', \"don't\", 'him', \"wouldn't\", 'can', 'through', 'from', 'those', 'for', 'didn', 'you', 'below', 'up', 'themselves', \"didn't\", 'too', 'being', 'of', 'further', 'some', \"we'd\", \"i'll\", \"it'll\", 'while', \"doesn't\", \"mustn't\", 'that', 've', 'if', 'be', 'yourself', 'he', \"hadn't\", 'how', 'than', 'was', 'will', 'before', 'my', 't', 'theirs', 'at', \"weren't\", \"should've\", 'won', \"you're\", 'own', 'isn', \"you've\", 'such', 'himself', \"she's\", 'all', 'me', 'but', \"they'll\", \"he'd\", 'after', \"they've\", 'then', 'this', 'both', 'hers', 'herself', 'ours', \"it'd\", 'which', 'where', \"we'll\", 'each', 'between', 'there', 'yours', 'had', 'have', 'has', 'when', 'does', 'they'}\n",
      "==============================================================\n",
      "Tokenized Sentence:\n",
      " ['hello', 'everyone', 'i', 'am', 'first', 'name', 'last', 'name', 'i', 'am', 'a', 'loyal', 'kska', 'git', 'user', 'all', 'the', 'way', 'from', 'sangamwadi', 'empire', 'i', 'have', 'considerable', 'knowledge', 'about', 'life', 'python', 'c', 'java', 'rust', 'golang', 'and', 'blockchain', 'for', 'every', 'smart', 'contract', 'i', 'lose', 'one', 'strand', 'of', 'my', 'hair', 'in', 'my', 'free', 'time', 'which', 'by', 'the', 'way', 'i', 'barely', 'get', 'i', 'like', 'to', 'swim']\n",
      "\n",
      "Filtered Sentence:\n",
      " ['hello', 'everyone', 'first', 'name', 'last', 'name', 'loyal', 'kska', 'git', 'user', 'way', 'sangamwadi', 'empire', 'considerable', 'knowledge', 'life', 'python', 'c', 'java', 'rust', 'golang', 'blockchain', 'every', 'smart', 'contract', 'lose', 'one', 'strand', 'hair', 'free', 'time', 'way', 'barely', 'get', 'like', 'swim']\n"
     ]
    }
   ],
   "source": [
    "# Removing stop words\n",
    "# A set keeps the membership test in the filter below fast.\n",
    "var3 = set(stopwords.words('english'))\n",
    "# Print the stop words sorted: a set has no fixed order, so printing it directly\n",
    "# can produce a differently ordered output on every fresh kernel.\n",
    "print(\"Stop words:\\n\", sorted(var3))\n",
    "print(\"==============================================================\")\n",
    "# Lower-case first so capitalised words such as \"I\" or \"For\" match the lower-case stop-word list.\n",
    "tokens = word_tokenize(text.lower())\n",
    "filtered_text = [word for word in tokens if word not in var3]\n",
    "print(\"Tokenized Sentence:\\n\", tokens)\n",
    "print(\"\\nFiltered Sentence:\\n\", filtered_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1b6d55c9-5724-4abb-bcb2-4fc5d27cbe12",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "write\n",
      "write\n",
      "wrote\n",
      "write\n",
      "read\n",
      "read\n"
     ]
    }
   ],
   "source": [
    "# Stemming\n",
    "var = [\"write\", \"writing\", \"wrote\", \"writes\", \"reading\", \"reads\"]\n",
    "# PorterStemmer strips suffixes by rule; an irregular form such as \"wrote\" comes back unchanged\n",
    "ps = PorterStemmer()\n",
    "for root_word in map(ps.stem, var):\n",
    "    print(root_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6e8f62c1-d4ae-48a8-8d3e-a86366ed7972",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Text is:\t ['studies', 'studying', 'cries', 'cry']\n",
      "Lemma for studies is study\n",
      "Lemma for studying is studying\n",
      "Lemma for cries is cry\n",
      "Lemma for cry is cry\n"
     ]
    }
   ],
   "source": [
    "# Lemmatization\n",
    "wordnet_lemmatizer = WordNetLemmatizer()\n",
    "# Use a dedicated name so the notebook-wide `text` variable is not overwritten.\n",
    "lemma_text = \"studies studying cries cry\"\n",
    "tt = nltk.word_tokenize(lemma_text)\n",
    "print(\"Text is:\\t\", tt)\n",
    "for w in tt:\n",
    "    # lemmatize() treats each word as a noun unless a `pos` argument is given,\n",
    "    # which is why \"studying\" is returned unchanged.\n",
    "    print(\"Lemma for {} is {}\".format(w, wordnet_lemmatizer.lemmatize(w)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c2ff8017-d03a-412f-b1f8-e2fb0b70bfca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Hello', 'NNP'),\n",
       " ('everyone', 'NN'),\n",
       " ('this', 'DT'),\n",
       " ('is', 'VBZ'),\n",
       " ('a', 'DT'),\n",
       " ('sample', 'JJ'),\n",
       " ('text', 'NN'),\n",
       " ('!', '.'),\n",
       " ('Earth', 'NN'),\n",
       " ('.', '.')]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# POS Tagging\n",
    "# Separate names for the sentence and its tokens: reusing `text` for both would turn\n",
    "# the notebook-wide `text` string into a list and break earlier cells when re-run.\n",
    "pos_text = \"Hello everyone this is a sample text! Earth.\"\n",
    "pos_tokens = nltk.word_tokenize(pos_text)\n",
    "# The last expression is displayed by Jupyter as a list of (token, tag) pairs.\n",
    "nltk.pos_tag(pos_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "1d71007a-a8cb-45ab-af27-ec69b6826ddd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TF-IDF\n",
      "['an' 'example' 'frequency' 'is' 'meow' 'of' 'term' 'this'] [[0.1767767  0.1767767  0.1767767  0.1767767  0.88388348 0.1767767\n",
      "  0.1767767  0.1767767 ]]\n"
     ]
    }
   ],
   "source": [
    "# TF-IDF (Term Frequency & Inverse Document Frequency)\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "new_sentence = \"This is an example of term frequency. Meow meow meow meow meow!\"\n",
    "\n",
    "def calculate_tfIdf(document):\n",
    "    \"\"\"Compute TF-IDF weights for a collection of text documents.\n",
    "\n",
    "    Args:\n",
    "        document: An iterable of document strings. A single string is also\n",
    "            accepted and treated as a collection holding one document.\n",
    "\n",
    "    Returns:\n",
    "        A (tf_matrix, features_names) tuple: the matrix produced by\n",
    "        TfidfVectorizer.fit_transform and the terms reported by\n",
    "        get_feature_names_out().\n",
    "    \"\"\"\n",
    "    # TfidfVectorizer raises an error for a bare string, so wrap it in a list.\n",
    "    if isinstance(document, str):\n",
    "        document = [document]\n",
    "    vectorizer = TfidfVectorizer()\n",
    "    tf_matrix = vectorizer.fit_transform(document)\n",
    "    features_names = vectorizer.get_feature_names_out()\n",
    "    return tf_matrix, features_names\n",
    "\n",
    "# Wrap the new_sentence in a list\n",
    "document = [new_sentence]\n",
    "tf_matrix, feature_names = calculate_tfIdf(document)\n",
    "\n",
    "# With a single document every term gets the same IDF, so the printed weights\n",
    "# are the term counts after TfidfVectorizer's default L2 normalisation.\n",
    "print('TF-IDF')\n",
    "print(feature_names, tf_matrix.toarray())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b34b21ba-46d6-4bad-b001-aa9962cc17b0",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}