Added code for assignment A7 (text analytics) and linked in README.
This commit is contained in:
parent
9758d9f711
commit
1db9288cd7
157
Codes/Code-A7 (Text Analytics).md
Normal file
@@ -0,0 +1,157 @@
# A7 - Text Analytics

---

## Prerequisites

- In the same directory as this Jupyter notebook, create a text file (e.g. `simple.txt`) that contains some random text; a minimal way to create one from inside the notebook is sketched below.
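If you do not already have such a file, the following sketch creates one (the filename and the sample text are just placeholders, not part of the original assignment):

```python3
# Write a small placeholder file next to the notebook
sample_text = (
    "Text analytics turns raw text into structured data. "
    "Tokenization splits text into sentences and words. "
    "Stopword removal, stemming and lemmatization normalise those words."
)
with open('simple.txt', 'w') as f:
    f.write(sample_text)
```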
---

1. Import libraries

```python3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
```

2. Open the text file and download all NLTK resources

```python3
file = open('simple.txt', 'r')
nltk.download('all')   # downloads every NLTK corpus and model (large; see the lighter alternative below)
print(file)            # prints the file object, not its contents
```
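Downloading every NLTK resource is slow and large. A lighter alternative (a sketch; the exact resource names can vary slightly between NLTK versions) is to fetch only what the later steps need:

```python3
# Only the resources the steps below rely on
for resource in ['punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4']:
    nltk.download(resource)
```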

3. Read the content of the opened text file

```python3
content = file.read()
print(content)
```

4. Import the sentence tokenizer from the NLTK library

```python3
from nltk.tokenize import sent_tokenize
```

5. Tokenize the content into sentences and print the result

```python3
sentence = sent_tokenize(content)
print(sentence)
```

6. Use a regular expression tokenizer to extract words from the content

```python3
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"\w+")   # raw string: match runs of word characters
words = tokenizer.tokenize(content)
print(words)
```

7. Use a regular expression tokenizer to extract the whitespace characters from the content

```python3
tokenizer = RegexpTokenizer(r"\s")    # raw string: match single whitespace characters
whitespace = tokenizer.tokenize(content)
print(whitespace)
```
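As an aside, `RegexpTokenizer` can also treat the pattern as a separator via its `gaps` parameter, splitting the text on whitespace instead of returning the whitespace itself; a small sketch reusing the same `content`:

```python3
# gaps=True treats the pattern as a delimiter, so the returned tokens are the pieces between whitespace
gap_tokenizer = RegexpTokenizer(r"\s+", gaps=True)
print(gap_tokenizer.tokenize(content))
```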

8. Import stopwords and the word tokenizer from the NLTK library

```python3
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
```

9. Retrieve and print the set of English stopwords

```python3
stopWords = set(stopwords.words('english'))
print(stopWords)
```

10. Tokenize each sentence, filter out stopwords, and perform POS tagging on the filtered words

```python3
for sen in sentence:
    Words = word_tokenize(sen)
    filteredWords = [word.lower() for word in Words if word.lower() not in stopWords]
    print(f"Words without stopwords: {filteredWords}")
    print(f"Words with stopwords: {Words}")
    print(f"POS tagging: {nltk.pos_tag(filteredWords)}")
```
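For reference, `nltk.pos_tag` returns a list of `(token, tag)` pairs using Penn Treebank tags. A quick hand-made check (the tokens here are illustrative, not from `simple.txt`, and the exact tags depend on the tagger model):

```python3
# Shape of the output: a list of (token, POS-tag) tuples
print(nltk.pos_tag(['quick', 'brown', 'fox', 'jumps']))
# e.g. [('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ')]
```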

11. Print the POS tagging of the filtered words again (outside the loop, `filteredWords` only holds the last sentence's words)

```python3
print(f"POS tagging: {nltk.pos_tag(filteredWords)}")
```

12. Import stemming and lemmatization tools from the NLTK library

```python3
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
```

13. Apply stemming to each word and print the original and stemmed forms

```python3
stemmer = PorterStemmer()
for word in Words:   # Words still holds the tokens of the last sentence from step 10
    print(f"{word} - After stemming = {stemmer.stem(word)}")
```

14. Apply lemmatization to each word and print the original and lemmatized forms

```python3
lemmatizer = WordNetLemmatizer()
for word in Words:
    print(f"{word}: {lemmatizer.lemmatize(word)}")
```
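To see how the two normalisations differ, here is a small hand-picked comparison (the word list is illustrative, not taken from `simple.txt`), reusing the `stemmer` and `lemmatizer` created above:

```python3
# Stemming chops suffixes heuristically; lemmatization maps to dictionary forms (noun by default)
for word in ['studies', 'running', 'better']:
    print(f"{word}: stem={stemmer.stem(word)}, lemma={lemmatizer.lemmatize(word)}")
# e.g. studies: stem=studi, lemma=study
```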

15. Create a new sentence by joining the first three sentences from the original content

```python3
sentence = sentence[:3]
new_sentence = [' '.join(sentence)]   # join with spaces so sentence boundaries are not mashed together
new_sentence
```

16. Import the TfidfVectorizer from the sklearn library for text feature extraction

```python3
from sklearn.feature_extraction.text import TfidfVectorizer
```

17. Define a function to calculate the TF-IDF matrix and feature names from a document

```python3
def calculate_tfIdf(document):
    vectorizer = TfidfVectorizer()
    tf_matrix = vectorizer.fit_transform(document)
    feature_names = vectorizer.get_feature_names_out()
    return tf_matrix, feature_names
```

18. Assign the newly created sentence to the document variable for TF-IDF calculation

```python3
document = new_sentence
```

19. Calculate and print the TF-IDF matrix and feature names for the document

```python3
tf_matrix, feature_names = calculate_tfIdf(document)
print('TFIDF')
feature_names, tf_matrix.toarray()
```
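The raw tuple above is not easy to read. Since `pandas` is already imported in step 1, an optional way (not part of the original assignment) to view the TF-IDF weight of each term is:

```python3
# Rows = documents, columns = vocabulary terms, values = TF-IDF weights
tfidf_df = pd.DataFrame(tf_matrix.toarray(), columns=feature_names)
print(tfidf_df)
```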

---

@@ -17,8 +17,9 @@
1. [Code-A5 (Data Analytics-2)](Codes/Code-A5%20%28Data%20Analytics-2%29.md)
-2. [Code-A9 (Data Visualisation-2)](Codes/Code-A9%20%28Data%20Visualisation-2%29.md)
-3. [Code-A10 (Data Visualisation-3)](Codes/Code-A10%20%28Data%20Visualisation-3%29.md)
+2. [Code-A7 (Text Analytics)](Codes/Code-A7%20%28Text%20Analytics%29.md)
+3. [Code-A9 (Data Visualisation-2)](Codes/Code-A9%20%28Data%20Visualisation-2%29.md)
+4. [Code-A10 (Data Visualisation-3)](Codes/Code-A10%20%28Data%20Visualisation-3%29.md)

### Notebooks