Files
DataScienceAndBigDataAnalytics/Notebooks/Notebook-A7 (Text Analytics).ipynb

309 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "6ed65106-23d2-4261-bf81-2a5b4b5ec60e",
"metadata": {},
"source": [
"# Notebook-A7 (Text Analytics)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "127a35f5-2434-4f6a-8904-edb4fb4f6f29",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import nltk\n",
"from nltk.tokenize import *\n",
"from nltk.corpus import *\n",
"from nltk.stem import *\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "555c6d33-75bb-4033-a9fa-60e145527464",
"metadata": {},
"outputs": [],
"source": [
"# Download resources\n",
"nltk.download('all') # WARNING: ABOUT 2GBs\n",
"\n",
"\"\"\"\n",
"OR YOU COULD DOWNLOAD ONLY SPECIFIC RESOURCES\n",
"nltk.download('punkt') # For splitting text into sentences or words\n",
"nltk.download('stopwords') # Common stop words\n",
"nltk.download('wordnet') # Synonyms\n",
"nltk.download('averaged_perceptron_tagger') # part-of-speech (POS) tagger\n",
"nltk.download('punkt_tab') # For tokenizing text that is formatted in tabular form\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ba67d90c-5711-496a-bf81-a5aef68a01bd",
"metadata": {},
"outputs": [],
"source": [
"# Write text to perform preprocessing on\n",
"text = \"Hello everyone! I am first name last name. I am a loyal KSKA Git user all the way from Sangamwadi Empire. I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain. For every smart contract, I lose one strand of my hair. In my free time, which by the way, I barely get, I like to swim.\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "fa8d4d18-ba91-4ced-9522-849be18aba6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Hello everyone!', 'I am first name last name.', 'I am a loyal KSKA Git user all the way from Sangamwadi Empire.', 'I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain.', 'For every smart contract, I lose one strand of my hair.', 'In my free time, which by the way, I barely get, I like to swim.']\n"
]
}
],
"source": [
"# Sentence tokenization\n",
"var1 = sent_tokenize(text)\n",
"print(var1)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a53cc954-e60f-41b8-8e15-09fdc5b80328",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Hello', 'everyone', '!', 'I', 'am', 'first', 'name', 'last', 'name', '.', 'I', 'am', 'a', 'loyal', 'KSKA', 'Git', 'user', 'all', 'the', 'way', 'from', 'Sangamwadi', 'Empire', '.', 'I', 'have', 'considerable', 'knowledge', 'about', 'life', ',', 'Python', ',', 'C++', ',', 'Java', ',', 'Rust', ',', 'Golang', 'and', 'Blockchain', '.', 'For', 'every', 'smart', 'contract', ',', 'I', 'lose', 'one', 'strand', 'of', 'my', 'hair', '.', 'In', 'my', 'free', 'time', ',', 'which', 'by', 'the', 'way', ',', 'I', 'barely', 'get', ',', 'I', 'like', 'to', 'swim', '.']\n"
]
}
],
"source": [
"# Word tokenization\n",
"var2 = word_tokenize(text)\n",
"print(var2)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f259c7c3-9e94-42cb-bb94-a81176dc3126",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"After removing punctuation from text:\n",
" Hello everyone I am first name last name I am a loyal KSKA Git user all the way from Sangamwadi Empire I have considerable knowledge about life Python C Java Rust Golang and Blockchain For every smart contract I lose one strand of my hair In my free time which by the way I barely get I like to swim \n"
]
}
],
"source": [
"# Removing punctuation\n",
"text = re.sub('[^a-zA-Z]',' ',text)\n",
"print(\"After removing punctuation from text:\\n\", text)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "092b3e63-3161-4b23-be8d-ff83a829205f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Stop words:\n",
" {'do', 'most', 'more', 'am', 'aren', 'other', \"shouldn't\", 's', 'now', 'again', 'here', 'off', \"we're\", 'during', 'haven', 'above', \"we've\", 'our', \"he'll\", 'whom', 'ain', 'is', \"she'll\", 'once', \"that'll\", \"needn't\", 'shan', 'weren', 'been', 'doing', 'wasn', 'needn', 'any', 'not', \"aren't\", \"won't\", 'myself', 'couldn', 'by', 'were', 'no', \"he's\", \"shan't\", 'very', \"i'd\", 'y', 'm', 'your', 'against', 'are', 'she', \"hasn't\", \"she'd\", \"you'll\", 'because', 'mightn', 'their', \"they'd\", 'nor', 'having', 'into', 'so', \"it's\", 'don', 'who', \"haven't\", 'his', 'what', 'why', 'we', 'i', \"i'm\", 'hadn', 'over', 'and', 'her', 'to', 'ma', 'a', 'it', \"isn't\", 'under', 'o', 'until', 'an', 'same', 'them', 'did', \"they're\", 'ourselves', 'as', 'its', \"wasn't\", 'doesn', 'just', 'yourselves', 'll', 'down', 'itself', \"i've\", 'should', 'shouldn', \"mightn't\", 'on', 'these', 'or', 'only', 'd', 'hasn', 'about', 'wouldn', \"couldn't\", 're', 'mustn', 'with', \"you'd\", 'few', 'in', 'the', 'out', \"don't\", 'him', \"wouldn't\", 'can', 'through', 'from', 'those', 'for', 'didn', 'you', 'below', 'up', 'themselves', \"didn't\", 'too', 'being', 'of', 'further', 'some', \"we'd\", \"i'll\", \"it'll\", 'while', \"doesn't\", \"mustn't\", 'that', 've', 'if', 'be', 'yourself', 'he', \"hadn't\", 'how', 'than', 'was', 'will', 'before', 'my', 't', 'theirs', 'at', \"weren't\", \"should've\", 'won', \"you're\", 'own', 'isn', \"you've\", 'such', 'himself', \"she's\", 'all', 'me', 'but', \"they'll\", \"he'd\", 'after', \"they've\", 'then', 'this', 'both', 'hers', 'herself', 'ours', \"it'd\", 'which', 'where', \"we'll\", 'each', 'between', 'there', 'yours', 'had', 'have', 'has', 'when', 'does', 'they'}\n",
"==============================================================\n",
"Tokenized Sentence:\n",
" ['hello', 'everyone', 'i', 'am', 'first', 'name', 'last', 'name', 'i', 'am', 'a', 'loyal', 'kska', 'git', 'user', 'all', 'the', 'way', 'from', 'sangamwadi', 'empire', 'i', 'have', 'considerable', 'knowledge', 'about', 'life', 'python', 'c', 'java', 'rust', 'golang', 'and', 'blockchain', 'for', 'every', 'smart', 'contract', 'i', 'lose', 'one', 'strand', 'of', 'my', 'hair', 'in', 'my', 'free', 'time', 'which', 'by', 'the', 'way', 'i', 'barely', 'get', 'i', 'like', 'to', 'swim']\n",
"\n",
"Filtered Sentence:\n",
" ['hello', 'everyone', 'first', 'name', 'last', 'name', 'loyal', 'kska', 'git', 'user', 'way', 'sangamwadi', 'empire', 'considerable', 'knowledge', 'life', 'python', 'c', 'java', 'rust', 'golang', 'blockchain', 'every', 'smart', 'contract', 'lose', 'one', 'strand', 'hair', 'free', 'time', 'way', 'barely', 'get', 'like', 'swim']\n"
]
}
],
"source": [
"# Removing stop words\n",
"var3 = set(stopwords.words('english'))\n",
"print(\"Stop words:\\n\", var3)\n",
"print(\"==============================================================\")\n",
"tokens = word_tokenize(text.lower())\n",
"filtered_text = []\n",
"for word in tokens:\n",
" if word not in var3:\n",
" filtered_text.append(word)\n",
"print(\"Tokenized Sentence:\\n\", tokens)\n",
"print(\"\\nFiltered Sentence:\\n\", filtered_text)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "1b6d55c9-5724-4abb-bcb2-4fc5d27cbe12",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"write\n",
"write\n",
"wrote\n",
"write\n",
"read\n",
"read\n"
]
}
],
"source": [
"# Stemmatization\n",
"var = [\"write\", \"writing\", \"wrote\", \"writes\",\"reading\",\"reads\"]\n",
"ps = PorterStemmer() # brings word to its root form\n",
"for w in var:\n",
" root_word = ps.stem(w)\n",
" print(root_word)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6e8f62c1-d4ae-48a8-8d3e-a86366ed7972",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Text is:\t ['studies', 'studying', 'cries', 'cry']\n",
"Lemma for studies is study\n",
"Lemma for studying is studying\n",
"Lemma for cries is cry\n",
"Lemma for cry is cry\n"
]
}
],
"source": [
"# Lemmatization\n",
"wordnet_lemmatizer = WordNetLemmatizer()\n",
"text = \"studies studying cries cry\"\n",
"tt = nltk.word_tokenize(text)\n",
"print(\"Text is:\\t\", tt)\n",
"for w in tt:\n",
" print(\"Lemma for {} is {}\".format(w, wordnet_lemmatizer.lemmatize(w)))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c2ff8017-d03a-412f-b1f8-e2fb0b70bfca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Hello', 'NNP'),\n",
" ('everyone', 'NN'),\n",
" ('this', 'DT'),\n",
" ('is', 'VBZ'),\n",
" ('a', 'DT'),\n",
" ('sample', 'JJ'),\n",
" ('text', 'NN'),\n",
" ('!', '.'),\n",
" ('Earth', 'NN'),\n",
" ('.', '.')]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# POS Tagging\n",
"text = \"Hello everyone this is a sample text! Earth.\"\n",
"text = nltk.word_tokenize(text)\n",
"nltk.pos_tag(text)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "1d71007a-a8cb-45ab-af27-ec69b6826ddd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TF-IDF\n",
"['an' 'example' 'frequency' 'is' 'meow' 'of' 'term' 'this'] [[0.1767767 0.1767767 0.1767767 0.1767767 0.88388348 0.1767767\n",
" 0.1767767 0.1767767 ]]\n"
]
}
],
"source": [
"# TF-IDF (Term Frequency & Inverse Document Frequency)\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"new_sentence = \"This is an example of term frequency. Meow meow meow meow meow!\"\n",
"\n",
"def calculate_tfIdf(document):\n",
" tokenizer = TfidfVectorizer()\n",
" tf_matrix = tokenizer.fit_transform(document)\n",
" features_names = tokenizer.get_feature_names_out()\n",
" return tf_matrix, features_names\n",
"\n",
"# Wrap the new_sentence in a list\n",
"document = [new_sentence]\n",
"tf_matrix, feature_names = calculate_tfIdf(document)\n",
"\n",
"print('TF-IDF')\n",
"print(feature_names, tf_matrix.toarray())"
]
},
{
"cell_type": "markdown",
"id": "b34b21ba-46d6-4bad-b001-aa9962cc17b0",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}