Compare commits
7 Commits
a0d06838c2
...
a588629812
| Author | SHA1 | Date | |
|---|---|---|---|
|
a588629812
|
|||
|
c0c22a12e7
|
|||
|
95f1dcc828
|
|||
|
ff7638bd70
|
|||
|
1432c59bc4
|
|||
|
bb9a370a98
|
|||
|
c4c460a81f
|
@@ -1,4 +1,4 @@
|
|||||||
# Practical-A1 (Uber)
|
# Practical-1 (Uber)
|
||||||
|
|
||||||
Problem Statement: Predict the price of the Uber ride from a given pickup point to the agreed drop-off location.
|
Problem Statement: Predict the price of the Uber ride from a given pickup point to the agreed drop-off location.
|
||||||
Perform following tasks:
|
Perform following tasks:
|
||||||
@@ -26,7 +26,7 @@ Perform following tasks:
|
|||||||
|
|
||||||
## Code
|
## Code
|
||||||
|
|
||||||
0. Importing Libraries:
|
### 0. Importing Libraries:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
# Import necessary libraries
|
# Import necessary libraries
|
||||||
@@ -41,7 +41,7 @@ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
|
|||||||
from math import radians, cos, sin, asin, sqrt
|
from math import radians, cos, sin, asin, sqrt
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Data Loading & Preprocessing:
|
### 1. Data Loading & Preprocessing:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
# Load the dataset
|
# Load the dataset
|
||||||
@@ -72,7 +72,7 @@ df.drop(['pickup_datetime', 'key'], axis=1, inplace=True, errors='ignore')
|
|||||||
print("\nColumns after feature extraction:\n", df.columns)
|
print("\nColumns after feature extraction:\n", df.columns)
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Outlier Detection & Removal:
|
### 2. Outlier Detection & Removal:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
# Remove entries with unrealistic fares
|
# Remove entries with unrealistic fares
|
||||||
@@ -87,7 +87,7 @@ df = df[(df['dropoff_longitude'] <= 180) & (df['dropoff_longitude'] >= -180)]
|
|||||||
print("Data shape after removing outliers:", df.shape)
|
print("Data shape after removing outliers:", df.shape)
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Feature Engineering - Distance Calculation:
|
### 3. Feature Engineering - Distance Calculation:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
# Define Haversine function to calculate distance between pickup and drop-off
|
# Define Haversine function to calculate distance between pickup and drop-off
|
||||||
@@ -110,7 +110,7 @@ df['distance_km'] = df.apply(lambda x: haversine(x['pickup_latitude'], x['pickup
|
|||||||
df = df[df['distance_km'] > 0]
|
df = df[df['distance_km'] > 0]
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Correlation Analysis:
|
### 4. Correlation Analysis:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
plt.figure(figsize=(10, 6))
|
plt.figure(figsize=(10, 6))
|
||||||
@@ -119,7 +119,7 @@ plt.title("Feature Correlation Heatmap")
|
|||||||
plt.show()
|
plt.show()
|
||||||
```
|
```
|
||||||
|
|
||||||
5. Model Training:
|
### 5. Model Training:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
# Define features and target
|
# Define features and target
|
||||||
@@ -140,7 +140,7 @@ rf_model.fit(X_train, y_train)
|
|||||||
y_pred_rf = rf_model.predict(X_test)
|
y_pred_rf = rf_model.predict(X_test)
|
||||||
```
|
```
|
||||||
|
|
||||||
6. Model Evaluation:
|
### 6. Model Evaluation:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
def evaluate_model(y_true, y_pred, model_name):
|
def evaluate_model(y_true, y_pred, model_name):
|
||||||
@@ -158,7 +158,7 @@ lr_scores = evaluate_model(y_test, y_pred_lr, "Linear Regression")
|
|||||||
rf_scores = evaluate_model(y_test, y_pred_rf, "Random Forest Regressor")
|
rf_scores = evaluate_model(y_test, y_pred_rf, "Random Forest Regressor")
|
||||||
```
|
```
|
||||||
|
|
||||||
7. Comparison:
|
### 7. Comparison:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
results = pd.DataFrame({
|
results = pd.DataFrame({
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
# Practical-A2 (Spam Email Detection)
|
# Practical-2 (Spam Email Detection)
|
||||||
|
|
||||||
Problem Statement: Classify the email using the binary classification method. Email Spam detection has two states: a) Normal State – Not Spam, b) Abnormal State – Spam. Use K-Nearest Neighbors and Support Vector Machine for classification. Analyze their performance.
|
Problem Statement: Classify the email using the binary classification method. Email Spam detection has two states: a) Normal State – Not Spam, b) Abnormal State – Spam. Use K-Nearest Neighbors and Support Vector Machine for classification. Analyze their performance.
|
||||||
|
|
||||||
@@ -20,7 +20,7 @@ Problem Statement: Classify the email using the binary classification method. Em
|
|||||||
|
|
||||||
## Code
|
## Code
|
||||||
|
|
||||||
1. Import libraries:
|
### 1. Import libraries:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -32,7 +32,7 @@ import matplotlib.pyplot as plt
|
|||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Load dataset:
|
### 2. Load dataset:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
df = pd.read_csv("emails.csv", encoding="ISO-8859-1") # Adjust path if needed
|
df = pd.read_csv("emails.csv", encoding="ISO-8859-1") # Adjust path if needed
|
||||||
@@ -53,7 +53,7 @@ print(df.columns)
|
|||||||
print(df.head(5))
|
print(df.head(5))
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Data splitting (training and testing):
|
### 3. Data splitting (training and testing):
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
X_train, X_test, y_train, y_test = train_test_split(
|
X_train, X_test, y_train, y_test = train_test_split(
|
||||||
@@ -61,7 +61,7 @@ X_train, X_test, y_train, y_test = train_test_split(
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
4. KNN:
|
### 4. KNN:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
knn = KNeighborsClassifier(n_neighbors=5)
|
knn = KNeighborsClassifier(n_neighbors=5)
|
||||||
@@ -74,7 +74,7 @@ print("Classification Report:\n", classification_report(y_test, y_pred_knn))
|
|||||||
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
|
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
|
||||||
```
|
```
|
||||||
|
|
||||||
5. SVM:
|
### 5. SVM:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
svm = SVC(kernel='linear', random_state=42) # Linear kernel for binary classification
|
svm = SVC(kernel='linear', random_state=42) # Linear kernel for binary classification
|
||||||
@@ -87,7 +87,7 @@ print("Classification Report:\n", classification_report(y_test, y_pred_svm))
|
|||||||
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
|
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
|
||||||
```
|
```
|
||||||
|
|
||||||
6. Plotting:
|
### 6. Plotting:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
|
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
# Practical-A3 (Gradient Descent Algorithm)
|
# Practical-4 (Gradient Descent Algorithm)
|
||||||
|
|
||||||
Problem Statement: Implement Gradient Descent Algorithm to find the local minima of a function. For example, find the local minima of the function y=(x+3)² starting from the point x=2.
|
Problem Statement: Implement Gradient Descent Algorithm to find the local minima of a function. For example, find the local minima of the function y=(x+3)² starting from the point x=2.
|
||||||
|
|
||||||
@@ -16,14 +16,14 @@ Problem Statement: Implement Gradient Descent Algorithm to find the local minima
|
|||||||
|
|
||||||
## Code
|
## Code
|
||||||
|
|
||||||
0. Import libraries:
|
### 0. Import libraries:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Define the function and its derivative:
|
### 1. Define the function and its derivative:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
def f(x):
|
def f(x):
|
||||||
@@ -33,7 +33,7 @@ def grad_f(x):
|
|||||||
return 2 * (x + 3) # derivative of f(x)
|
return 2 * (x + 3) # derivative of f(x)
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Initialize parameters for Gradient Descent:
|
### 2. Initialize parameters for Gradient Descent:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
x_current = 2 # starting point
|
x_current = 2 # starting point
|
||||||
@@ -43,7 +43,7 @@ max_iterations = 25 # maximum iterations
|
|||||||
history = [x_current] # storing history
|
history = [x_current] # storing history
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Gradient Descent Loop:
|
### 3. Gradient Descent Loop:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
for i in range(max_iterations):
|
for i in range(max_iterations):
|
||||||
@@ -60,14 +60,14 @@ for i in range(max_iterations):
|
|||||||
print(f"Iteration {i+1}: x = {x_current:.4f}, f(x) = {f(x_current):.4f}")
|
print(f"Iteration {i+1}: x = {x_current:.4f}, f(x) = {f(x_current):.4f}")
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Print the result:
|
### 4. Print the result:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
print("Local minima at x =", x_current)
|
print("Local minima at x =", x_current)
|
||||||
print("Function value at local minima y =", f(x_current))
|
print("Function value at local minima y =", f(x_current))
|
||||||
```
|
```
|
||||||
|
|
||||||
5. Plotting:
|
### 5. Plotting:
|
||||||
|
|
||||||
```python3
|
```python3
|
||||||
plt.plot(history, [f(val) for val in history], marker='o')
|
plt.plot(history, [f(val) for val in history], marker='o')
|
||||||
+121
@@ -0,0 +1,121 @@
|
|||||||
|
# Practical-6 (Clustering)
|
||||||
|
|
||||||
|
Problem Statement: Implement K-Means clustering/ hierarchical clustering on `sales_data_sample.csv` dataset. Determine the number of clusters using the elbow method.
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Dataset available in [Datasets](../Datasets/sales_data_sample.csv) directory.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
1. Import libraries
|
||||||
|
2. Load dataset
|
||||||
|
3. Select numerical features for clustering
|
||||||
|
4. Standardize data
|
||||||
|
5. K-Means clustering
|
||||||
|
6. Hierarchical clustering
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Code
|
||||||
|
|
||||||
|
### 1. Import libraries:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
|
||||||
|
import seaborn as sns
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Load dataset:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
df = pd.read_csv("sales_data_sample.csv", encoding='latin1', on_bad_lines='skip')
|
||||||
|
print("Dataset shape:", df.shape)
|
||||||
|
print(df.head())
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Select numerical features for clustering:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
X = df.select_dtypes(include=['int64', 'float64'])
|
||||||
|
print("Features used for clustering:\n", X.head())
|
||||||
|
|
||||||
|
# Select relevant numeric columns
|
||||||
|
# X = df[['SALES', 'QUANTITYORDERED', 'PRICEEACH']]
|
||||||
|
|
||||||
|
# Handle missing values if any
|
||||||
|
# X = X.dropna()
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Standardize data:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
scaler = StandardScaler()
|
||||||
|
X_scaled = scaler.fit_transform(X)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. K-Means clustering:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
# Determine optimal number of clusters using Elbow Method
|
||||||
|
wcss = []
|
||||||
|
for k in range(1, 11):
|
||||||
|
kmeans = KMeans(n_clusters=k, random_state=42)
|
||||||
|
kmeans.fit(X_scaled)
|
||||||
|
wcss.append(kmeans.inertia_)
|
||||||
|
|
||||||
|
# Plot Elbow Method
|
||||||
|
plt.figure(figsize=(6,4))
|
||||||
|
plt.plot(range(1, 11), wcss, marker='o')
|
||||||
|
plt.title('Elbow Method')
|
||||||
|
plt.xlabel('Number of clusters (k)')
|
||||||
|
plt.ylabel('Inertia (WCSS)')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
# Fit KMeans with chosen number of clusters (example: 3 clusters)
|
||||||
|
kmeans = KMeans(n_clusters=3, random_state=42) # Add n_init=10 param in the function to suppress warnings
|
||||||
|
clusters_kmeans = kmeans.fit_predict(X_scaled)
|
||||||
|
df['KMeans_Cluster'] = clusters_kmeans
|
||||||
|
|
||||||
|
# Visualize clusters
|
||||||
|
sns.scatterplot(x='SALES', y='PRICEEACH', hue='KMeans_Cluster', data=df, palette='viridis')
|
||||||
|
plt.title("K-Means Clustering")
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
print("\nK-Means Cluster Centers:\n", kmeans.cluster_centers_)
|
||||||
|
print("\nCluster counts:\n", df['KMeans_Cluster'].value_counts())
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Hierarchical clustering:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
# Create linkage matrix
|
||||||
|
Z = linkage(X_scaled, method='ward')
|
||||||
|
|
||||||
|
# Plot dendrogram
|
||||||
|
plt.figure(figsize=(10,5))
|
||||||
|
dendrogram(Z)
|
||||||
|
plt.title('Hierarchical Clustering Dendrogram')
|
||||||
|
plt.xlabel('Samples')
|
||||||
|
plt.ylabel('Distance')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
# Assign clusters (example: 3 clusters)
|
||||||
|
clusters_hier = fcluster(Z, t=3, criterion='maxclust')
|
||||||
|
df['Hierarchical_Cluster'] = clusters_hier
|
||||||
|
|
||||||
|
print("\nHierarchical Cluster counts:\n", pd.Series(clusters_hier).value_counts())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Miscellaneous
|
||||||
|
|
||||||
|
- [Dataset source](https://www.kaggle.com/datasets/kyanyoga/sample-sales-data)
|
||||||
|
|
||||||
|
---
|
||||||
Executable → Regular
|
Can't render this file because it is too large.
|
File diff suppressed because it is too large
Load Diff
Executable → Regular
|
Can't render this file because it is too large.
|
@@ -5,7 +5,7 @@
|
|||||||
"id": "df16d02a-fc85-4581-a2d5-8ab2d896d918",
|
"id": "df16d02a-fc85-4581-a2d5-8ab2d896d918",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Practical-A1 (Uber)\n",
|
"# Practical-1 (Uber)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"---\n",
|
"---\n",
|
||||||
"\n",
|
"\n",
|
||||||
Regular → Executable
+1
-1
@@ -5,7 +5,7 @@
|
|||||||
"id": "d1b71a9c-e3b2-43d4-85fa-4329d90e47b9",
|
"id": "d1b71a9c-e3b2-43d4-85fa-4329d90e47b9",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Practical-A2 (Email Spam Detection)\n",
|
"# Practical-2 (Email Spam Detection)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"---\n",
|
"---\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
"id": "d2bfa2a8-f2e1-45cc-9b2c-55f4b4dd629e",
|
"id": "d2bfa2a8-f2e1-45cc-9b2c-55f4b4dd629e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Practical-A4 (Gradient Descent Algorithm)\n",
|
"# Practical-4 (Gradient Descent Algorithm)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"---\n",
|
"---\n",
|
||||||
"\n",
|
"\n",
|
||||||
Executable
+362
File diff suppressed because one or more lines are too long
@@ -10,6 +10,24 @@ This repository contains vital resources for the Machine Learning course under t
|
|||||||
|
|
||||||
### Codes
|
### Codes
|
||||||
|
|
||||||
|
1. [Code-1 (Uber)](Codes/Code-1.md)
|
||||||
|
2. [Code-2 (Spam Email Detection)](Codes/Code-2.md)
|
||||||
|
3. [Code-4 (Gradient Descent Algorithm)](Codes/Code-4.md)
|
||||||
|
4. [Code-6 (Clustering)](Codes/Code-6.md)
|
||||||
|
|
||||||
|
### Jupyter Notebooks
|
||||||
|
|
||||||
|
1. [Notebook-1 (Uber)](Notebooks/Notebook-1.ipynb)
|
||||||
|
2. [Notebook-2 (Spam Email Detection)](Notebooks/Notebook-2.ipynb)
|
||||||
|
3. [Notebook-4 (Gradient Descent Algorithm)](Notebooks/Notebook-4.ipynb)
|
||||||
|
4. [Notebook-6 (Clustering)](Notebooks/Notebook-6.ipynb)
|
||||||
|
|
||||||
|
### Datasets
|
||||||
|
|
||||||
|
1. [Dataset for Practical-1](Datasets/uber.csv)
|
||||||
|
2. [Dataset for Practical-2](Datasets/emails.csv)
|
||||||
|
3. [Dataset for Practical-6](Datasets/sales_data_sample.csv)
|
||||||
|
|
||||||
### Assignments
|
### Assignments
|
||||||
|
|
||||||
- Assignment-1:
|
- Assignment-1:
|
||||||
|
|||||||
Reference in New Issue
Block a user