From b54e56669d9ddfe7686f22635e9277d977caf592 Mon Sep 17 00:00:00 2001 From: Kshitij Date: Sun, 3 May 2026 23:13:22 +0530 Subject: [PATCH] add code blocks for practical 2b; classification. --- Codes/Code-2b.md | 202 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 Codes/Code-2b.md diff --git a/Codes/Code-2b.md b/Codes/Code-2b.md new file mode 100644 index 0000000..38f902b --- /dev/null +++ b/Codes/Code-2b.md @@ -0,0 +1,202 @@ +# Practical-2b (Classification using Deep Neural Network - IMDB Dataset) + +Problem Statement: Binary classification using Deep Neural Networks. Example: Classify movie reviews into "positive" reviews and "negative" reviews, just based on the text content of the reviews. Use the IMDB dataset. + +> [!NOTE] +> Dataset available in [Datasets](../Datasets/IMDB%20Dataset.csv) directory. + +--- + +## Prerequisites + +1. Install packages using `pip`: `pip install tensorflow keras pandas numpy scikit-learn matplotlib seaborn` (`tensorflow` requires Python 3.9 - 3.12) +2. Copy the `IMDB Dataset.csv` dataset into the same directory as the Jupyter notebook. + +## Steps + +1. Import Libraries +2. Load Dataset +3. Exploratory Data Analysis (EDA) +4. Data Cleaning - Strip HTML Tags +5. Encode Labels and Separate Features +6. Tokenize and Pad Text Sequences +7. Split into Training and Testing Sets +8. Build the Neural Network Model +9. Compile the Model +10. Train the Model +11. Evaluate the Model on Test Data +12. Plot Training vs Validation Accuracy +13. Plot Training vs Validation Loss +14. Confusion Matrix and Classification Report + +--- + +## Code + +### 1. 
Import Libraries: + +```python3 +import re +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +from sklearn.metrics import confusion_matrix, classification_report +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D +from tensorflow.keras.preprocessing.text import Tokenizer +from tensorflow.keras.preprocessing.sequence import pad_sequences +``` + +### 2. Load Dataset: + +```python3 +data = pd.read_csv('IMDB Dataset.csv') +print(data.head()) +``` + +### 3. Exploratory Data Analysis (EDA): + +```python3 +print("Shape:", data.shape) +print("\nMissing Values:\n", data.isnull().sum()) +print("\nClass Distribution:\n", data['sentiment'].value_counts()) + +# Visualize class distribution +sns.countplot(x='sentiment', data=data) +plt.title('Sentiment Class Distribution') +plt.show() + +# Sample reviews +print("\nSample positive review:\n", data[data['sentiment'] == 'positive']['review'].iloc[0][:300]) +print("\nSample negative review:\n", data[data['sentiment'] == 'negative']['review'].iloc[0][:300]) +``` + +### 4. Data Cleaning - Strip HTML Tags: + +```python3 +def clean_text(text): + text = re.sub(r'<.*?>', '', text) # remove HTML tags like <br />
+ text = text.lower().strip() # lowercase and trim whitespace + return text + +data['review'] = data['review'].apply(clean_text) +print("Sample cleaned review:\n", data['review'].iloc[0][:300]) +``` + +### 5. Encode Labels and Separate Features: + +```python3 +label_encoder = LabelEncoder() +data['sentiment'] = label_encoder.fit_transform(data['sentiment']) # positive=1, negative=0 + +X = data['review'].values # input: review text +y = data['sentiment'].values # output: 0 or 1 +``` + +### 6. Tokenize and Pad Text Sequences: + +```python3 +vocab_size = 10000 # keep only top 10,000 most frequent words +max_length = 200 # truncate/pad all reviews to 200 words + +tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>') # handles unknown words +tokenizer.fit_on_texts(X) # build word index from all reviews (fit before the train/test split) + +sequences = tokenizer.texts_to_sequences(X) # convert each word to its integer index +padded_sequences = pad_sequences(sequences, maxlen=max_length, + padding='post', truncating='post') # pad/truncate to fixed length +``` + +### 7. Split into Training and Testing Sets: + +```python3 +X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42) +``` + +### 8. Build the Neural Network Model: + +```python3 +model = Sequential() +model.add(Embedding(vocab_size, 16)) # maps each word index to a 16-dim vector +model.add(GlobalAveragePooling1D()) # averages all word vectors into one vector +model.add(Dense(24, activation='relu')) # hidden layer: 24 neurons +model.add(Dense(1, activation='sigmoid')) # output: probability between 0 and 1 (binary) + +model.summary() +``` + +### 9. Compile the Model: + +```python3 +# binary_crossentropy: standard loss for binary classification; sigmoid output +model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +``` + +### 10. Train the Model: + +```python3 +history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2) +``` + +### 11. 
Evaluate the Model on Test Data: + +```python3 +loss, accuracy = model.evaluate(X_test, y_test) +print(f"Test Loss: {loss:.4f}") +print(f"Test Accuracy: {accuracy*100:.2f}%") +``` + +### 12. Plot Training vs Validation Accuracy: + +```python3 +plt.plot(history.history['accuracy'], label='Training Accuracy') +plt.plot(history.history['val_accuracy'], label='Validation Accuracy') +plt.title('Model Accuracy Over Epochs') +plt.ylabel('Accuracy') +plt.xlabel('Epoch') +plt.legend() +plt.grid(True) +plt.show() +``` + +### 13. Plot Training vs Validation Loss: + +```python3 +plt.plot(history.history['loss'], label='Training Loss') +plt.plot(history.history['val_loss'], label='Validation Loss') +plt.title('Model Loss Over Epochs') +plt.ylabel('Loss') +plt.xlabel('Epoch') +plt.legend() +plt.grid(True) +plt.show() +``` + +### 14. Confusion Matrix and Classification Report: + +```python3 +y_pred = (model.predict(X_test) > 0.5).astype(int) # threshold 0.5: prob > 0.5 = positive + +cm = confusion_matrix(y_test, y_pred) +sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', + xticklabels=['Negative', 'Positive'], + yticklabels=['Negative', 'Positive']) +plt.title('Confusion Matrix') +plt.ylabel('Actual') +plt.xlabel('Predicted') +plt.show() + +print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])) +``` + +--- + +## Miscellaneous + +- [Dataset source](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) + +--- +