add code blocks for practical 2b; classification.
This commit is contained in:
@@ -0,0 +1,202 @@
|
||||
# Practical-2b (Classification using Deep Neural Network - IMDB Dataset)
|
||||
|
||||
Problem Statement: Binary classification using Deep Neural Networks Example: Classify movie reviews into positive" reviews and "negative" reviews, just based on the text content of the reviews. Use IMDB dataset
|
||||
|
||||
> [!NOTE]
|
||||
> Dataset available in [Datasets](../Datasets/IMDB%20Dataset.csv) directory.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisities
|
||||
|
||||
1. Install packages using `pip`: `pip install tensorflow keras pandas numpy scikit-learn matplotlib seaborn` (`tensorflow` requires Python 3.9 - 3.12)
|
||||
2. Copy the `IMDB Dataset.csv` dataset in the same directory as the Jupyter notebook.
|
||||
|
||||
## Steps
|
||||
|
||||
1. Import Libraries
|
||||
2. Load Dataset
|
||||
3. Exploratory Data Analysis (EDA)
|
||||
4. Data Cleaning - Strip HTML Tags
|
||||
5. Encode Labels and Separate Features
|
||||
6. Tokenize and Pad Text Sequences
|
||||
7. Split into Training and Testing Sets
|
||||
8. Build the Neural Network Model
|
||||
9. Compile the Model
|
||||
10. Train the Model
|
||||
11. Evaluate the Model on Test Data
|
||||
12. Plot Training vs Validation Accuracy
|
||||
13. Plot Training vs Validation Loss
|
||||
14. Confusion Matrix and Classification Report
|
||||
|
||||
---
|
||||
|
||||
## Code
|
||||
|
||||
### 1. Import Libraries:
|
||||
|
||||
```python3
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.metrics import confusion_matrix, classification_report
|
||||
from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
```
|
||||
|
||||
### 2. Load Dataset:
|
||||
|
||||
```python3
|
||||
data = pd.read_csv('IMDB Dataset.csv')
|
||||
print(data.head())
|
||||
```
|
||||
|
||||
### 3. Exploratory Data Analysis (EDA):
|
||||
|
||||
```python3
|
||||
print("Shape:", data.shape)
|
||||
print("\nMissing Values:\n", data.isnull().sum())
|
||||
print("\nClass Distribution:\n", data['sentiment'].value_counts())
|
||||
|
||||
# Visualize class distribution
|
||||
sns.countplot(x='sentiment', data=data)
|
||||
plt.title('Sentiment Class Distribution')
|
||||
plt.show()
|
||||
|
||||
# Sample reviews
|
||||
print("\nSample positive review:\n", data[data['sentiment'] == 'positive']['review'].iloc[0][:300])
|
||||
print("\nSample negative review:\n", data[data['sentiment'] == 'negative']['review'].iloc[0][:300])
|
||||
```
|
||||
|
||||
### 4. Data Cleaning - Strip HTML Tags:
|
||||
|
||||
```python3
|
||||
def clean_text(text):
|
||||
text = re.sub(r'<.*?>', '', text) # remove HTML tags like <br />
|
||||
text = text.lower().strip() # lowercase and trim whitespace
|
||||
return text
|
||||
|
||||
data['review'] = data['review'].apply(clean_text)
|
||||
print("Sample cleaned review:\n", data['review'].iloc[0][:300])
|
||||
```
|
||||
|
||||
### 5. Encode Labels and Separate Features:
|
||||
|
||||
```python3
|
||||
label_encoder = LabelEncoder()
|
||||
data['sentiment'] = label_encoder.fit_transform(data['sentiment']) # positive=1, negative=0
|
||||
|
||||
X = data['review'].values # input: review text
|
||||
y = data['sentiment'].values # output: 0 or 1
|
||||
```
|
||||
|
||||
### 6. Tokenize and Pad Text Sequences:
|
||||
|
||||
```python3
|
||||
vocab_size = 10000 # keep only top 10,000 most frequent words
|
||||
max_length = 200 # truncate/pad all reviews to 200 words
|
||||
|
||||
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>') # <OOV> handles unknown words
|
||||
tokenizer.fit_on_texts(X) # build word index from training text
|
||||
|
||||
sequences = tokenizer.texts_to_sequences(X) # convert each word to its integer index
|
||||
padded_sequences = pad_sequences(sequences, maxlen=max_length,
|
||||
padding='post', truncating='post') # pad/truncate to fixed length
|
||||
```
|
||||
|
||||
### 7. Split into Training and Testing Sets:
|
||||
|
||||
```python3
|
||||
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)
|
||||
```
|
||||
|
||||
### 8. Build the Neural Network Model:
|
||||
|
||||
```python3
|
||||
model = Sequential()
|
||||
model.add(Embedding(vocab_size, 16)) # maps each word index to a 16-dim vector
|
||||
model.add(GlobalAveragePooling1D()) # averages all word vectors into one vector
|
||||
model.add(Dense(24, activation='relu')) # hidden layer: 24 neurons
|
||||
model.add(Dense(1, activation='sigmoid')) # output: probability between 0 and 1 (binary)
|
||||
|
||||
model.summary()
|
||||
```
|
||||
|
||||
### 9. Compile the Model:
|
||||
|
||||
```python3
|
||||
# binary_crossentropy: standard loss for binary classification; sigmoid output
|
||||
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
|
||||
```
|
||||
|
||||
### 10. Train the Model:
|
||||
|
||||
```python3
|
||||
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
|
||||
```
|
||||
|
||||
### 11. Evaluate the Model on Test Data:
|
||||
|
||||
```python3
|
||||
loss, accuracy = model.evaluate(X_test, y_test)
|
||||
print(f"Test Loss: {loss:.4f}")
|
||||
print(f"Test Accuracy: {accuracy*100:.2f}%")
|
||||
```
|
||||
|
||||
### 12. Plot Training vs Validation Accuracy:
|
||||
|
||||
```python3
|
||||
plt.plot(history.history['accuracy'], label='Training Accuracy')
|
||||
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
|
||||
plt.title('Model Accuracy Over Epochs')
|
||||
plt.ylabel('Accuracy')
|
||||
plt.xlabel('Epoch')
|
||||
plt.legend()
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
### 13. Plot Training vs Validation Loss:
|
||||
|
||||
```python3
|
||||
plt.plot(history.history['loss'], label='Training Loss')
|
||||
plt.plot(history.history['val_loss'], label='Validation Loss')
|
||||
plt.title('Model Loss Over Epochs')
|
||||
plt.ylabel('Loss')
|
||||
plt.xlabel('Epoch')
|
||||
plt.legend()
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
### 14. Confusion Matrix and Classification Report:
|
||||
|
||||
```python3
|
||||
y_pred = (model.predict(X_test) > 0.5).astype(int) # threshold 0.5: prob > 0.5 = positive
|
||||
|
||||
cm = confusion_matrix(y_test, y_pred)
|
||||
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
|
||||
xticklabels=['Negative', 'Positive'],
|
||||
yticklabels=['Negative', 'Positive'])
|
||||
plt.title('Confusion Matrix')
|
||||
plt.ylabel('Actual')
|
||||
plt.xlabel('Predicted')
|
||||
plt.show()
|
||||
|
||||
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Miscellaneous
|
||||
|
||||
- [Dataset source](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
|
||||
|
||||
---
|
||||
|
||||
Reference in New Issue
Block a user