Added codes, datasets and Jupyter notebooks directory.
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
# A1 - Data Wrangling-1
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas` & `numpy`
|
||||
|
||||
```shell
|
||||
pip install pandas numpy
|
||||
```
|
||||
|
||||
- Save the dataset [iris.csv](https://git.kska.io/sppu-te-comp-content/DataScienceAndBigDataAnalytics/src/branch/main/Datasets/iris.csv) in the same directory as this Jupyter notebook.
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
```
|
||||
|
||||
2. Load the dataset from a CSV file into a pandas DataFrame:
|
||||
|
||||
```python3
|
||||
df=pd.read_csv('iris.csv')
|
||||
df.describe() # Print description of DataFrame
|
||||
```
|
||||
|
||||
3. Print first and last 5 values:
|
||||
|
||||
```python3
|
||||
print("First 5 values:\n", df.head())
|
||||
print ("Last 5 values:\n", df.tail())
|
||||
```
|
||||
|
||||
4. Print duplicated values:
|
||||
|
||||
```python3
|
||||
df.duplicated()
|
||||
```
|
||||
|
||||
5. Print null values true/false:
|
||||
|
||||
```python3
|
||||
df.isnull()
|
||||
```
|
||||
|
||||
6. Print summary of DataFrame:
|
||||
|
||||
```python3
|
||||
df.info()
|
||||
```
|
||||
|
||||
7. Print shape, i.e. rows + columns:
|
||||
|
||||
```python3
|
||||
df.shape
|
||||
```
|
||||
|
||||
8. Print null (true/false) values in `sepal.length` column:
|
||||
|
||||
```python3
|
||||
df["sepal.length"].isnull()
|
||||
```
|
||||
|
||||
9. Delete/Drop `petal.length` column:
|
||||
|
||||
```python3
|
||||
y = df.drop(["petal.length"], axis=1) # axis=1 column. For row, axis=0
|
||||
print(y)
|
||||
```
|
||||
|
||||
10. In `variety` column, replace `Setosa` with `0` and `Virginica` with `1`:
|
||||
|
||||
```python3
|
||||
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form operates on a temporary object
# under pandas Copy-on-Write and may leave df unchanged.
df['variety'] = df['variety'].replace(['Setosa', 'Virginica'], [0, 1])
|
||||
print(df)
|
||||
```
|
||||
|
||||
11. Print sum of NULL values in each column:
|
||||
|
||||
```python3
|
||||
df.isnull().sum()
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,111 @@
|
||||
# A10 - Data Visualization-3
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas`, `seaborn` & `matplotlib`
|
||||
|
||||
```shell
|
||||
pip install pandas matplotlib seaborn
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
```
|
||||
|
||||
2. Load dataset into Pandas DataFrame:
|
||||
|
||||
```python3
|
||||
df = pd.read_csv('iris.csv')
|
||||
df.head()
|
||||
```
|
||||
|
||||
3. Features & their datatypes:
|
||||
|
||||
```python3
|
||||
print("Feature and their types:")
|
||||
df.dtypes
|
||||
```
|
||||
|
||||
4. Histogram for each numerical feature:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(12, 6))
|
||||
|
||||
for i, column in enumerate(df.columns[:-1]): # Exclude the last column ('variety', the class label)
|
||||
plt.subplot(2, 2, i + 1)
|
||||
|
||||
ax = plt.hist(df[column], edgecolor="black")
|
||||
plt.gca().bar_label(plt.gca().containers[0], fmt='%d') # Add count labels
|
||||
plt.title(f"Histogram of {column}")
|
||||
plt.xlabel(column)
|
||||
plt.ylabel("Frequency")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
5. Boxplot (for identifying outliers in this case):
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(12, 6))
|
||||
for i, column in enumerate(df.columns[:-1]): # Exclude the last column ('variety', the class label)
|
||||
ax = plt.subplot(2, 2, i + 1)
|
||||
# Create boxplot and store it in a container
|
||||
box_container = sns.boxplot(x=df[column], ax=ax, color='salmon')
|
||||
plt.title(f"Boxplot of {column}")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
6. Detecting outliers:
|
||||
|
||||
```python3
|
||||
for column in df.columns[:-1]: # Exclude the last column ('variety', the class label)
|
||||
Q1 = df[column].quantile(0.25)
|
||||
Q3 = df[column].quantile(0.75)
|
||||
|
||||
IQR = Q3 - Q1
|
||||
|
||||
lower_bound = Q1 - 1.5 * IQR
|
||||
upper_bound = Q3 + 1.5 * IQR
|
||||
|
||||
outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
|
||||
|
||||
print(f"\nFeature: {column}")
|
||||
print(f" Mean: {df[column].mean():.2f}, Median: {df[column].median():.2f}, Std Dev: {df[column].std():.2f}")
|
||||
print(f" Outliers Detected: {'Yes' if not outliers.empty else 'No'}","\n " f"Outlier Values: {outliers.tolist()}" if not outliers.empty else "")
|
||||
print("-" * 40)
|
||||
```
|
||||
|
||||
7. Violin plot:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(12, 8))
|
||||
for i, column in enumerate(df.columns[:-1]): # Exclude the last column ('variety', the class label)
|
||||
plt.subplot(2, 2, i + 1)
|
||||
sns.violinplot(x=df["variety"], y=df[column], palette="Set2", hue=df['variety'])
|
||||
plt.title(f"Violin Plot of {column} by variety")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Dataset source-1](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data) *(not exactly, but yes, kinda)*
|
||||
- [Dataset source-2](https://archive.ics.uci.edu/dataset/53/iris) *(not exactly, but yes, kinda)*
|
||||
|
||||
---
|
||||
@@ -0,0 +1,176 @@
|
||||
# A2 - Data Wrangling-2
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas` & `numpy`
|
||||
|
||||
```shell
|
||||
pip install pandas numpy
|
||||
```
|
||||
|
||||
- Dataset generated, not imported.
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
# import pandas as shriniwas
|
||||
import numpy as np
|
||||
```
|
||||
|
||||
2. Generate random data:
|
||||
|
||||
```python3
|
||||
# Generate data
|
||||
np.random.seed(50) #for consistency
|
||||
|
||||
data = {
|
||||
'Student_id': range(1, 51),
|
||||
'Name': ['Student_' + str(i) for i in range(1, 51)],
|
||||
'Age': np.random.randint(18, 25, size=50),
|
||||
'Gender': np.random.choice(['Male', 'Female'], size=50),
|
||||
'Scores': [np.random.randint(50, 100, size=3).tolist() for _ in range(50)],
|
||||
'Attendance': np.random.randint(20,100,size=50),
|
||||
'Grade': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=50)
|
||||
}
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> If you wish to enter data manually, here's an example of how to do so:
|
||||
|
||||
```python3
|
||||
data = {
|
||||
'Student_id': [1,2,3,4,5,6,7,8,9,10],
|
||||
'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],
|
||||
'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],
|
||||
'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],
|
||||
'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],
|
||||
[58, 96, 61]],
|
||||
'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],
|
||||
'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']
|
||||
}
|
||||
```
|
||||
|
||||
3. Import data into DataFrame:
|
||||
|
||||
```python3
|
||||
df = pd.DataFrame(data)
|
||||
df.head() # Print first 5 rows
|
||||
```
|
||||
|
||||
4. Assign grades:
|
||||
|
||||
```python3
|
||||
def assign_grade(scores):
    """Return a letter grade for the average of a student's scores.

    Args:
        scores: Sequence of numeric scores, or None when the scores
            are missing.

    Returns:
        'A' if the average is above 90, 'B' above 80, 'C' above 70,
        'D' above 60, otherwise 'F'. Missing scores (None) grade as 'F'.
    """
    # np.mean(None) raises TypeError, so grade missing scores explicitly.
    if scores is None:
        return 'F'

    avg_score = np.mean(scores)

    # Thresholds are exclusive lower bounds (an average of exactly 90 is a 'B').
    if avg_score > 90:
        return 'A'
    elif avg_score > 80:
        return 'B'
    elif avg_score > 70:
        return 'C'
    elif avg_score > 60:
        return 'D'
    else:
        return 'F'
|
||||
|
||||
df['Grade'] = df['Scores'].apply(assign_grade)
|
||||
```
|
||||
|
||||
5. Introduce missing + invalid values and inconsistencies:
|
||||
|
||||
```python3
|
||||
df = pd.DataFrame(data)
|
||||
df.loc[8, 'Age'] = np.nan
|
||||
df.loc[29, 'Age'] = np.nan
|
||||
df.loc[35, 'Age'] = np.nan
|
||||
df.loc[11, 'Scores'] = None
|
||||
df.loc[19, 'Scores'] = None
|
||||
df.loc[9, 'Attendance'] = 105 # invalid percentage
|
||||
df.loc[15, 'Grade'] = 'Z' # invalid grade
|
||||
df.head(20) # Print first 20 rows
|
||||
```
|
||||
|
||||
6. Locating & printing missing/invalid values:
|
||||
|
||||
```python3
|
||||
missing_values = df.isnull().sum() #check missing values
|
||||
invalid_attendance = df[(df['Attendance'] < 0) | (df['Attendance'] > 100)]
|
||||
invalid_grades = df[~df['Grade'].isin(['A', 'B', 'C', 'D', 'F'])]
|
||||
|
||||
print("Missing values:\n", missing_values)
|
||||
print("Invalid attendance:\n", invalid_attendance)
|
||||
print("Invalid grades:\n", invalid_grades)
|
||||
```
|
||||
|
||||
7. Handling missing/invalid values:
|
||||
|
||||
```python3
|
||||
df['Age'] = df['Age'].fillna(df['Age'].median()) #fill by median
|
||||
df['Attendance'] = df['Attendance'].apply(lambda x: 100 if x > 100 else (0 if x < 0 else x))
|
||||
|
||||
def handle_invalid_scores(scores):
    """Replace missing scores with zeros and clamp each score to 0-100.

    Args:
        scores: Sequence of numeric scores, or a missing-value marker
            (None or a float NaN).

    Returns:
        [0, 0, 0] when the scores are missing, otherwise a new list with
        every score limited to the range 0..100.
    """
    # A missing cell may surface as None or as a float NaN; NaN is the only
    # float that is not equal to itself, so no extra import is needed.
    is_nan = isinstance(scores, float) and scores != scores
    if scores is None or is_nan:
        return [0, 0, 0]

    return [max(0, min(100, score)) for score in scores]
|
||||
|
||||
df['Scores'] = df['Scores'].apply(handle_invalid_scores)
|
||||
df['Grade'] = df['Scores'].apply(assign_grade)
|
||||
df['Grade'] = df['Grade'].apply(lambda x: x if x in ['A', 'B', 'C', 'D', 'F'] else 'F')
|
||||
df.head(20) # Print first 20 rows
|
||||
```
|
||||
|
||||
8. Adding outliers:
|
||||
|
||||
```python3
|
||||
# Write each outlier to a different row; assigning repeatedly to the same
# cell would keep only the last value and leave a single outlier per column.
df.loc[5, 'Age'] = 35
df.loc[6, 'Age'] = 50
df.loc[7, 'Age'] = 65
df.loc[10, 'Attendance'] = 200
df.loc[12, 'Attendance'] = 175
df.loc[14, 'Attendance'] = 166
|
||||
|
||||
print("DataFrame with Outliers:")
|
||||
print(df.iloc[5:20])
|
||||
```
|
||||
|
||||
9. Handling outliers:
|
||||
|
||||
```python3
|
||||
def handle_outliers_iqr(df, column):
    """Cap the outliers of ``df[column]`` in place using the 1.5 * IQR rule.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values
    above ``Q3 + 1.5 * IQR`` are lowered to that bound; all other values
    are left as they are.

    Args:
        df: DataFrame to modify; the column is overwritten in place.
        column: Name of a numeric column in ``df``.

    Returns:
        None. The result is written back to ``df[column]``.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)

    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # clip() caps both tails in one vectorized call instead of running a
    # Python lambda once per element.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
|
||||
|
||||
handle_outliers_iqr(df, 'Age')
|
||||
handle_outliers_iqr(df, 'Attendance')
|
||||
|
||||
print(df.iloc[5:20])
|
||||
```
|
||||
|
||||
10. Data transformation using min-max scaling:
|
||||
|
||||
```python3
|
||||
df['Scaled_Attendance'] = (df['Attendance'] - df['Attendance'].min()) / (df['Attendance'].max() - df['Attendance'].min())
|
||||
|
||||
print("DataFrame with Min-Max Scaling on 'Attendance':")
|
||||
print(df[['Attendance', 'Scaled_Attendance']].head(20))
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,160 @@
|
||||
# A3 - Descriptive Statistics
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas`
|
||||
|
||||
```shell
|
||||
pip install pandas
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
### Problem Statement - Part 1 (data.csv)
|
||||
|
||||
1. Import library
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
```
|
||||
|
||||
2. Generate data and load into DataFrame:
|
||||
|
||||
```python3
|
||||
# Generate data
|
||||
data = {
|
||||
'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],
|
||||
'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],
|
||||
'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']
|
||||
}
|
||||
|
||||
# Define data in DataFrame
|
||||
df = pd.DataFrame(data)
|
||||
```
|
||||
|
||||
3. Group data by `age_group`, compute statistics for `income` + print:
|
||||
|
||||
```python3
|
||||
# Group the data by age_group and compute summary statistics for 'income'
|
||||
summary_stats = df.groupby('age_group')['income'].describe()
|
||||
|
||||
# Print summary
|
||||
print(summary_stats)
|
||||
```
|
||||
|
||||
4. Group the data by `age_group`; Select `income` column for each of the groups created; Calculate median for `income`:
|
||||
|
||||
```python3
|
||||
# Group the data by age_group; Select income column for each of the groups created; Calculate median for income
|
||||
median_income = df.groupby('age_group')['income'].median()
|
||||
|
||||
# Print the median
|
||||
print("Median Income by Age Group:")
|
||||
print(median_income)
|
||||
```
|
||||
|
||||
5. Print column names:
|
||||
|
||||
```python3
|
||||
print("Column Names:", df.columns)
|
||||
```
|
||||
|
||||
6. Modified dataset with repeated values; define in DataFrame:
|
||||
|
||||
```python3
|
||||
# Modified dataset with repeated values
|
||||
data = {
|
||||
'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],
|
||||
'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],
|
||||
'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']
|
||||
}
|
||||
|
||||
# Define data in DataFrame
|
||||
df = pd.DataFrame(data)
|
||||
```
|
||||
|
||||
7. Calculate mode:
|
||||
|
||||
```python3
|
||||
# Calculate the mode for each column
|
||||
mode_age = df['age'].mode()
|
||||
mode_income = df['income'].mode()
|
||||
print(f"Mode of Age: {mode_age.values}")
|
||||
print(f"Mode of Income: {mode_income.values}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Problem Statement - Part 2 (iris.csv)
|
||||
|
||||
- Save the dataset [iris.csv](https://git.kska.io/sppu-te-comp-content/DataScienceAndBigDataAnalytics/src/branch/main/Datasets/iris.csv) in the same directory as this Jupyter notebook.
|
||||
|
||||
1. Load dataset and print first 5 rows:
|
||||
|
||||
```python3
|
||||
# Load iris.csv in the DataFrame
|
||||
df = pd.read_csv('iris.csv')
|
||||
|
||||
print(df.head()) # Print first 5 rows
|
||||
```
|
||||
|
||||
2. Group data; Compute percentiles; Display:
|
||||
|
||||
```python3
|
||||
# Group the data by species and display summary statistics
|
||||
summary_stats_species = df.groupby('variety').describe()
|
||||
|
||||
# Compute specific percentiles and statistics
|
||||
percentiles = df.groupby('variety').quantile([0.25, 0.5, 0.75])
|
||||
|
||||
# Display summary statistics and percentiles
|
||||
# Print the summary computed above instead of recomputing it silently.
print("Summary Statistics by Species:")
print(summary_stats_species)
|
||||
|
||||
print("\nPercentiles by Species:")
|
||||
print(percentiles)
|
||||
```
|
||||
|
||||
3. Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics:
|
||||
|
||||
```python3
|
||||
# Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics
|
||||
summary_stats_species = df.groupby('variety')['sepal.width'].describe()
|
||||
|
||||
print("\nSummary Statistics by Species for Sepal Width:")
|
||||
print(summary_stats_species)
|
||||
```
|
||||
|
||||
4. Group by variety and compute the median for numeric columns:
|
||||
|
||||
```python3
|
||||
# Group by variety and compute the median for numeric columns
|
||||
median_values = df.groupby('variety').median()
|
||||
|
||||
print("Median Values by Species:")
|
||||
print(median_values)
|
||||
```
|
||||
|
||||
5. Group the data by variety; Select sepal.length column for each of the groups created; Display median:
|
||||
|
||||
```python3
|
||||
# Group the data by variety; Select sepal.length column for each of the groups created; Display median
|
||||
median_sepal_length = df.groupby('variety')['sepal.length'].median()
|
||||
print("Median Sepal Length by Species:")
|
||||
print(median_sepal_length)
|
||||
```
|
||||
|
||||
6. Calculate & print mode for sepal.width:
|
||||
|
||||
```python3
|
||||
# Calculate & print mode for sepal.width
|
||||
mode_width = df['sepal.width'].mode()
|
||||
print(f"Mode of Width: {mode_width.values}")
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,107 @@
|
||||
# A4 - Data Analytics-1
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas`, `numpy`, `seaborn`, `matplotlib` & `scikit-learn`
|
||||
|
||||
```shell
|
||||
pip install pandas numpy seaborn matplotlib
|
||||
pip install -U scikit-learn
|
||||
```
|
||||
|
||||
- Save the dataset [Assignment-A3-BostonHousing.csv](https://git.kska.io/sppu-te-comp-content/DataScienceAndBigDataAnalytics/src/branch/main/Datasets/Assignment-A3-BostonHousing.csv) in the same directory as this Jupyter notebook.
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> Hit `Tab` key while typing library names (or anything else) to activate auto-complete in Jupyter notebook.
|
||||
|
||||
2. Load the dataset from a CSV file into a pandas DataFrame:
|
||||
|
||||
```python3
|
||||
df= pd.read_csv("Assignment-A3-BostonHousing.csv")
|
||||
df.head() # Prints first 5 rows
|
||||
```
|
||||
|
||||
3. Printing information about the DataFrame:
|
||||
|
||||
```python3
|
||||
print("Columns:\n", df.columns)
|
||||
# df.info() prints its report itself and returns None, so call it directly
# rather than passing its return value to print().
print("Info:")
df.info()
|
||||
print("Description:\n", df.describe())
|
||||
```
|
||||
|
||||
4. Check for missing values:
|
||||
|
||||
```python3
|
||||
print(df.isnull().sum())
|
||||
```
|
||||
|
||||
5. Correlation matrix:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(12, 10))
|
||||
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
|
||||
plt.title("Correlation Matrix")
|
||||
plt.show()
|
||||
```
|
||||
|
||||
6. Splitting training and testing data:
|
||||
|
||||
```python3
|
||||
X = df.drop('medv', axis=1) # Deleted/Dropped "medv" (median value) column from dataset
|
||||
y = df['medv'] # Target (Median value of owner-occupied homes)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split data into 80% training and 20% testing
|
||||
# X is independent variable; y is dependent variable
|
||||
```
|
||||
|
||||
7. Linear regression and evaluation:
|
||||
|
||||
```python3
|
||||
lr = LinearRegression() # Create linear regression model object "lr"
|
||||
lr.fit(X_train, y_train) # Train linear regression model using "X_train" and "y_train"
|
||||
y_pred = lr.predict(X_test) # Make predictions on the test set (X_test); predicted values are stored in y_pred
|
||||
|
||||
# Evaluation
|
||||
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
|
||||
print("R-squared (R²):", r2_score(y_test, y_pred))
|
||||
```
|
||||
|
||||
8. Plotting graph:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(6,6))
|
||||
plt.scatter(y_test, y_pred, color='blue')
|
||||
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')
|
||||
plt.xlabel('Actual Prices')
|
||||
plt.ylabel('Predicted Prices')
|
||||
plt.title('Actual vs Predicted Prices')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
1. [Dataset source](https://www.kaggle.com/c/boston-housing)
|
||||
|
||||
---
|
||||
@@ -0,0 +1,126 @@
|
||||
# A5 - Data Analytics-2
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas`, `numpy`, `matplotlib`, `seaborn` & `scikit-learn`
|
||||
|
||||
```shell
|
||||
pip install pandas numpy matplotlib seaborn
|
||||
pip install -U scikit-learn
|
||||
```
|
||||
|
||||
- Save the dataset [Assignment-A5-Social_Network_Ads.csv](https://git.kska.io/sppu-te-comp-content/DataScienceAndBigDataAnalytics/src/branch/main/Datasets/Assignment-A5-Social_Network_Ads.csv) in the same directory as this Jupyter notebook.
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> Hit `Tab` key while typing library names (or anything else) to activate auto-complete in Jupyter notebook.
|
||||
|
||||
2. Load the dataset from a CSV file into a pandas DataFrame:
|
||||
|
||||
```python3
|
||||
df= pd.read_csv("Assignment-A5-Social_Network_Ads.csv")
|
||||
df.head() # Print first 5 rows
|
||||
```
|
||||
|
||||
3. Print column names of the DataFrame:
|
||||
|
||||
```python3
|
||||
df.columns
|
||||
```
|
||||
|
||||
4. Convert `Gender` to numeric; Split data (75% training, 25% testing):
|
||||
|
||||
```python3
|
||||
# Convert Gender to numeric
|
||||
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
|
||||
|
||||
# Features and Target
|
||||
X = df[['Gender', 'Age', 'EstimatedSalary']]
|
||||
y = df['Purchased']
|
||||
|
||||
# Split the data
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
|
||||
```
|
||||
|
||||
5. Feature scaling:
|
||||
|
||||
```python3
|
||||
sc = StandardScaler()
|
||||
X_train = sc.fit_transform(X_train)
|
||||
X_test = sc.transform(X_test)
|
||||
```
|
||||
|
||||
6. Train model and make predictions:
|
||||
|
||||
```python3
|
||||
# Train the model
|
||||
classifier = LogisticRegression()
|
||||
classifier.fit(X_train, y_train)
|
||||
# Make predictions
|
||||
y_pred = classifier.predict(X_test)
|
||||
```
|
||||
|
||||
7. Evaluate the model:
|
||||
|
||||
```python3
|
||||
# Confusion Matrix
|
||||
cm = confusion_matrix(y_test, y_pred)
|
||||
print("Confusion Matrix:\n", cm)
|
||||
|
||||
# Extract values
|
||||
TN, FP, FN, TP = cm.ravel()
|
||||
|
||||
# Metrics
|
||||
accuracy = accuracy_score(y_test, y_pred)
|
||||
error_rate = 1 - accuracy
|
||||
precision = precision_score(y_test, y_pred)
|
||||
recall = recall_score(y_test, y_pred)
|
||||
|
||||
print(f"True Positives (TP): {TP}")
|
||||
print(f"False Positives (FP): {FP}")
|
||||
print(f"True Negatives (TN): {TN}")
|
||||
print(f"False Negatives (FN): {FN}")
|
||||
print(f"Accuracy: {accuracy:.2f}")
|
||||
print(f"Error Rate: {error_rate:.2f}")
|
||||
print(f"Precision: {precision:.2f}")
|
||||
print(f"Recall: {recall:.2f}")
|
||||
```
|
||||
|
||||
8. Visualize:
|
||||
|
||||
```python3
|
||||
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
|
||||
plt.xlabel("Predicted")
|
||||
plt.ylabel("Actual")
|
||||
plt.title("Confusion Matrix")
|
||||
plt.show()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
1. [Jupyter notebook](https://github.com/ganimtron-10/SPPU-2019-TE-DSBDA-Lab/blob/master/Group-A/Q5.ipynb) ❌❌❌ (not referring anymore)
|
||||
2. [Dataset source](https://www.kaggle.com/datasets/akram24/social-network-ads)
|
||||
|
||||
---
|
||||
@@ -0,0 +1,101 @@
|
||||
# A6 - Data Analytics-3
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `pandas`, `numpy`, `matplotlib`, `seaborn` & `scikit-learn`
|
||||
|
||||
```shell
|
||||
pip install pandas numpy matplotlib seaborn
|
||||
pip install -U scikit-learn
|
||||
```
|
||||
|
||||
- Save the dataset [iris.csv](https://git.kska.io/sppu-te-comp-content/DataScienceAndBigDataAnalytics/src/branch/main/Datasets/iris.csv) in the same directory as this Jupyter notebook.
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
|
||||
```
|
||||
|
||||
2. Load the dataset from a CSV file into a Pandas DataFrame:
|
||||
|
||||
```python3
|
||||
df = pd.read_csv("iris.csv")
|
||||
df.head()
|
||||
```
|
||||
|
||||
3. Set independent & dependent variables; Train, test, split:
|
||||
|
||||
```python3
|
||||
# Set independent and dependent variables
|
||||
X = df.drop('variety', axis=1) # Independent variable
|
||||
y = df['variety'] # Dependent variable
|
||||
|
||||
# train test split
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
||||
```
|
||||
|
||||
4. Scale features:
|
||||
|
||||
```python3
|
||||
scaler = StandardScaler()
|
||||
X_train_scaled = scaler.fit_transform(X_train)
|
||||
X_test_scaled = scaler.transform(X_test)
|
||||
```
|
||||
|
||||
5. Train Naive Bayes model:
|
||||
|
||||
```python3
|
||||
# Train Naive Bayes model
|
||||
model = GaussianNB()
|
||||
model.fit(X_train_scaled, y_train)
|
||||
|
||||
# Predict
|
||||
y_pred = model.predict(X_test_scaled)
|
||||
```
|
||||
|
||||
6. Evaluate the model; Plot Confusion Matrix:
|
||||
|
||||
```python3
|
||||
# Evaluate the model
|
||||
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
|
||||
cm_df = pd.DataFrame(cm, index=model.classes_, columns=model.classes_)
|
||||
|
||||
# Plot Confusion Matrix
|
||||
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d')
|
||||
plt.title('Confusion Matrix')
|
||||
plt.xlabel('Predicted')
|
||||
plt.ylabel('Actual')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
7. Set variables for accuracy, precision, recall & error rate + print em:
|
||||
|
||||
```python3
|
||||
accuracy = accuracy_score(y_test, y_pred)
|
||||
precision = precision_score(y_test, y_pred, average='macro')
|
||||
recall = recall_score(y_test, y_pred, average='macro')
|
||||
error_rate = 1 - accuracy
|
||||
|
||||
print(f"Accuracy: {accuracy:.2f}")
|
||||
print(f"Error Rate: {error_rate:.2f}")
|
||||
print(f"Precision (Macro): {precision:.2f}")
|
||||
print(f"Recall (Macro): {recall:.2f}")
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,135 @@
|
||||
# A7 - Text Analytics
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `nltk` & `scikit-learn` (the latter is used for TF-IDF in step 11)
|
||||
|
||||
```shell
|
||||
pip install nltk scikit-learn
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import nltk
|
||||
from nltk.tokenize import *
|
||||
from nltk.corpus import *
|
||||
from nltk.stem import *
|
||||
import re
|
||||
```
|
||||
|
||||
2. Download resources:
|
||||
|
||||
```python3
|
||||
nltk.download('all') # WARNING: ABOUT 2GBs
|
||||
```
|
||||
|
||||
> OR IF YOU'RE FEELING FANCY YOU CAN DOWNLOAD ONLY SPECIFIC RESOURCES:
|
||||
```python3
|
||||
nltk.download('punkt') # For splitting text into sentences or words
|
||||
nltk.download('stopwords') # Common stop words
|
||||
nltk.download('wordnet') # Synonyms
|
||||
nltk.download('averaged_perceptron_tagger') # part-of-speech (POS) tagger
|
||||
nltk.download('punkt_tab') # Punkt tokenizer data in tab-separated format; recent NLTK releases load this instead of 'punkt'
|
||||
```
|
||||
|
||||
3. Write text to perform preprocessing on:
|
||||
|
||||
```python3
|
||||
text = "Hello everyone! I am first name last name. I am a loyal KSKA Git user all the way from Sangamwadi Empire. I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain. For every smart contract, I lose one strand of my hair. In my free time, which by the way, I barely get, I like to swim."
|
||||
```
|
||||
|
||||
4. Sentence tokenization:
|
||||
|
||||
```python3
|
||||
var1 = sent_tokenize(text)
|
||||
print(var1)
|
||||
```
|
||||
|
||||
5. Word tokenization:
|
||||
|
||||
```python3
|
||||
var2 = word_tokenize(text)
|
||||
print(var2)
|
||||
```
|
||||
|
||||
6. Removing punctuation:
|
||||
|
||||
```python3
|
||||
text = re.sub('[^a-zA-Z]',' ',text)
|
||||
print("After removing punctuation from text:\n", text)
|
||||
```
|
||||
|
||||
7. Removing stop words:
|
||||
|
||||
```python3
|
||||
var3 = set(stopwords.words('english'))
|
||||
print("Stop words:\n", var3)
|
||||
print("==============================================================")
|
||||
tokens = word_tokenize(text.lower())
|
||||
filtered_text = []
|
||||
for word in tokens:
|
||||
if word not in var3:
|
||||
filtered_text.append(word)
|
||||
print("Tokenized Sentence:\n", tokens)
|
||||
print("\nFiltered Sentence:\n", filtered_text)
|
||||
```
|
||||
|
||||
8. Stemming:
|
||||
|
||||
```python3
|
||||
var = ["write", "writing", "wrote", "writes","reading","reads"]
|
||||
ps = PorterStemmer() # brings word to its root form
|
||||
for w in var:
|
||||
root_word = ps.stem(w)
|
||||
print(root_word)
|
||||
```
|
||||
|
||||
9. Lemmatization:
|
||||
|
||||
```python3
|
||||
wordnet_lemmatizer = WordNetLemmatizer()
|
||||
text = "studies studying cries cry"
|
||||
tt = nltk.word_tokenize(text)
|
||||
print("Text is:\t", tt)
|
||||
for w in tt:
|
||||
print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))
|
||||
```
|
||||
|
||||
10. POS Tagging:
|
||||
|
||||
```python3
|
||||
text = "Hello everyone this is a sample text! Earth."
|
||||
text = nltk.word_tokenize(text)
|
||||
nltk.pos_tag(text)
|
||||
```
|
||||
|
||||
11. TF-IDF (Term Frequency & Inverse Document Frequency):
|
||||
|
||||
```python3
|
||||
# TF-IDF (Term Frequency & Inverse Document Frequency)
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
new_sentence = "This is an example of term frequency. Meow meow meow meow meow!"
|
||||
|
||||
def calculate_tfIdf(document):
    """Compute the TF-IDF matrix for one or more text documents.

    Args:
        document: Iterable of document strings. A single string is also
            accepted and treated as a one-document corpus.

    Returns:
        A tuple ``(tf_matrix, features_names)``: the value returned by
        ``TfidfVectorizer.fit_transform`` and the matching feature names
        from ``get_feature_names_out``.
    """
    # TfidfVectorizer expects an iterable of documents and rejects a bare
    # string, so wrap a single document in a list.
    if isinstance(document, str):
        document = [document]

    vectorizer = TfidfVectorizer()
    tf_matrix = vectorizer.fit_transform(document)
    features_names = vectorizer.get_feature_names_out()
    return tf_matrix, features_names
|
||||
|
||||
# Wrap the new_sentence in a list
|
||||
document = [new_sentence]
|
||||
tf_matrix, feature_names = calculate_tfIdf(document)
|
||||
|
||||
print('TF-IDF')
|
||||
print(feature_names, tf_matrix.toarray())
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,84 @@
|
||||
# A8 - Data Visualization-1
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `seaborn` & `matplotlib`
|
||||
|
||||
```shell
|
||||
pip install matplotlib seaborn
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import seaborn as sns
|
||||
from matplotlib import pyplot as plt
|
||||
```
|
||||
|
||||
2. Load built-in dataset:
|
||||
|
||||
```python3
|
||||
df=sns.load_dataset('titanic')
|
||||
df.head()
|
||||
```
|
||||
|
||||
3. Dist plot for age:
|
||||
|
||||
```python3
|
||||
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only leaves an empty extra figure behind. Size the
# plot with height/aspect instead (4 in tall, 1.5 * 4 = 6 in wide).
sns.displot(df['age'], height=4, aspect=1.5) # Use sns.distplot(df['age']) for older versions of seaborn library
|
||||
plt.show()
|
||||
```
|
||||
|
||||
4. Box plot:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(5,3))
|
||||
bp = sns.boxplot(x='class',y='age',palette='pastel',data=df)
|
||||
plt.show()
|
||||
df.describe().transpose()
|
||||
```
|
||||
|
||||
5. Violin plot:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(5,4))
|
||||
vp = sns.violinplot(x='class',y='age',palette='rainbow',data=df)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
6. Hist plot:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(5,4))
|
||||
pq = sns.histplot(x='fare',bins=10,data=df,hue='survived',kde=False)
|
||||
for i in pq.containers:
|
||||
pq.bar_label(i)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
7. Scatter plot:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(5,4))
|
||||
st=sns.scatterplot(x='age',y='fare',data=df)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
8. Scatter plot (points colored by `survived`):
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(5,4))
|
||||
kl=sns.scatterplot(x='age',y='fare',data=df,hue='survived')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,95 @@
|
||||
# A9 - Data Visualization-2
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
- Install required libraries: `seaborn` & `matplotlib`
|
||||
|
||||
```shell
|
||||
pip install matplotlib seaborn
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Code blocks
|
||||
|
||||
1. Import libraries:
|
||||
|
||||
```python3
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from collections import Counter
|
||||
```
|
||||
|
||||
2. Load built-in dataset:
|
||||
|
||||
```python3
|
||||
df= sns.load_dataset('titanic')
|
||||
df.head()
|
||||
```
|
||||
|
||||
3. Describe:
|
||||
|
||||
```python3
|
||||
# Describe
|
||||
print(df.describe())
|
||||
# Describe - transposed, i.e. rows and columns swapped
|
||||
print(df.describe().transpose())
|
||||
```
|
||||
|
||||
4. Mean, median, mode: **(NOT SURE IF THIS IS NEEDED)**
|
||||
|
||||
```python3
|
||||
# Mean, median, mode
|
||||
age_data = df['age'].dropna() # Drop missing values in age & store in age_data var
|
||||
|
||||
sorted_age_data = sorted(age_data) # Store sorted age_data
|
||||
n = len(sorted_age_data) # Store length of age_data
|
||||
|
||||
# Calculate mean
|
||||
mean_age = sum(age_data) / len(age_data)
|
||||
|
||||
# Calculate median
|
||||
if n % 2 == 1: # odd
|
||||
median_age = sorted_age_data[n // 2]
|
||||
else: # even
|
||||
median_age = (sorted_age_data[n // 2 - 1] + sorted_age_data[n // 2]) / 2
|
||||
|
||||
# Calculate mode
|
||||
age_counts = Counter(age_data) # Count occurrences of each age
|
||||
mode_age = age_counts.most_common(1)[0][0] # Get the most common value
|
||||
|
||||
# Print
|
||||
print(f"The mean age is: {mean_age}")
|
||||
print(f"The median age is: {median_age}")
|
||||
print(f"The mode age is: {mode_age}")
|
||||
```
|
||||
|
||||
5. Boxplot:
|
||||
|
||||
```python3
|
||||
plt.figure(figsize=(8,4)) # 8 by 4 inches
|
||||
sns.boxplot(x="sex", y="age", hue="survived", data= df, palette="viridis")
|
||||
plt.title("Distribution of age with respect to each gender and survival Status")
|
||||
plt.xlabel("Sex")
|
||||
plt.ylabel("Age")
|
||||
plt.show()
|
||||
```
|
||||
|
||||
6. Violin plot:
|
||||
|
||||
```python3
|
||||
sns.violinplot(x='sex',y='age',data=df, hue= 'survived')
|
||||
```
|
||||
|
||||
7. Catplot:
|
||||
|
||||
```python3
|
||||
sns.catplot(x="sex", hue="survived", data=df, kind="count")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
+198
@@ -0,0 +1,198 @@
|
||||
# Setup for Hadoop
|
||||
|
||||
This file contains instructions to install and set up Hadoop for assignment B1.
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- OpenJDK 17
|
||||
- ssh
|
||||
- curl
|
||||
- gpg (optional)
|
||||
|
||||
---
|
||||
|
||||
1. Installing OpenJDK 17:
|
||||
|
||||
```shell
|
||||
sudo apt update # Update packages
|
||||
sudo apt install -y openjdk-17-jdk gpg curl ssh # Install OpenJDK 17
|
||||
java -version # Check java (JRE) version
|
||||
javac -version # Check javac (JDK) version
|
||||
```
|
||||
|
||||
2. Setup Hadoop user:
|
||||
|
||||
```shell
|
||||
sudo adduser --home /home/hadoop hadoop # Set default password as Pass@123
|
||||
sudo usermod -aG sudo hadoop
|
||||
su - hadoop
|
||||
# Hit enter to skip entering all the information such as Full name, Room Number, etc.
|
||||
```
|
||||
|
||||
3. SSH setup:
|
||||
|
||||
```shell
|
||||
ssh-keygen -t ed25519 # Hit enter to save in default location. You can skip setting a passphrase for simplicity
|
||||
cat ~/.ssh/id_ed25519.pub >> ~/.ssh/authorized_keys # ssh-keygen -t ed25519 writes id_ed25519(.pub), not id_rsa(.pub)
|
||||
chmod 640 ~/.ssh/authorized_keys
|
||||
ssh localhost # You will be asked for confirmation, type "yes"
|
||||
su - hadoop
|
||||
```
|
||||
|
||||
4. Download Hadoop (version 3.4.1 in this case):
|
||||
|
||||
```shell
|
||||
cd ~
|
||||
curl -o hadoop-3.4.1.tar.gz https://dlcdn.apache.org/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz # Download Hadoop
|
||||
curl -o hadoop-3.4.1.tar.gz.sha512 https://downloads.apache.org/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz.sha512 # Download the hash
|
||||
curl -o hadoop-3.4.1.tar.gz.asc https://downloads.apache.org/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz.asc # Download the signature
|
||||
curl -o KEYS https://downloads.apache.org/hadoop/common/KEYS # Download the keys
|
||||
|
||||
```
|
||||
|
||||
5. Verify the hash and extract the tarball:
|
||||
|
||||
```shell
|
||||
gpg --import KEYS # Import the keys
|
||||
gpg --verify hadoop-3.4.1.tar.gz.asc # Verify the GPG signature of the tarball
|
||||
sha512sum -c hadoop-3.4.1.tar.gz.sha512 # Verify hash for file
|
||||
|
||||
tar -xvf hadoop-3.4.1.tar.gz
|
||||
mv hadoop-3.4.1/ hadoop/
|
||||
cd hadoop/
|
||||
mkdir -p ~/hadoopdata/hdfs/{namenode,datanode}
|
||||
```
|
||||
|
||||
6. Modify `~/.bashrc` & `$HADOOP_HOME/etc/hadoop/hadoop-env.sh`:
|
||||
|
||||
```shell
|
||||
cp /etc/bash.bashrc ~/.bashrc # Reset .bashrc file
|
||||
chown $USER ~/.bashrc # Change ownership of .bashrc file
|
||||
|
||||
echo -e "\n#JAVA+HADOOP CONFIG FROM KSKA GIT\nexport JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64\nexport PATH=\$JAVA_HOME/bin:\$PATH" >> ~/.bashrc # Java env var
|
||||
|
||||
echo -e "export HADOOP_HOME=/home/hadoop/hadoop/\nexport HADOOP_INSTALL=\$HADOOP_HOME\nexport HADOOP_MAPRED_HOME=\$HADOOP_HOME\nexport HADOOP_COMMON_HOME=\$HADOOP_HOME\nexport HADOOP_HDFS_HOME=\$HADOOP_HOME\nexport YARN_HOME=\$HADOOP_HOME\nexport HADOOP_COMMON_LIB_NATIVE_DIR=\$HADOOP_HOME/lib/native\nexport PATH=\$PATH:\$HADOOP_HOME/sbin:\$HADOOP_HOME/bin\nexport HADOOP_OPTS=\"-Djava.library.path=\$HADOOP_HOME/lib/native\"" >> ~/.bashrc # Hadoop env var
|
||||
|
||||
echo -e "PATH=\$PATH:\$HADOOP_HOME/sbin" >> ~/.bashrc
|
||||
|
||||
source ~/.bashrc
|
||||
|
||||
sed -i 's|^# export JAVA_HOME=.*|export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64/|' "$HADOOP_HOME/etc/hadoop/hadoop-env.sh"
|
||||
|
||||
```
|
||||
|
||||
7. Modify hadoop config files:
|
||||
|
||||
```shell
|
||||
sed -i '/<configuration>/,/<\/configuration>/d' $HADOOP_HOME/etc/hadoop/core-site.xml
|
||||
echo "<configuration>
|
||||
<property>
|
||||
<name>hadoop.tmp.dir</name>
|
||||
<value>/home/hadoop/tmp</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>fs.defaultFS</name>
|
||||
<value>hdfs://localhost:9000</value>
|
||||
</property>
|
||||
</configuration>" >> $HADOOP_HOME/etc/hadoop/core-site.xml
|
||||
|
||||
sed -i '/<configuration>/,/<\/configuration>/d' $HADOOP_HOME/etc/hadoop/hdfs-site.xml
|
||||
echo "<configuration>
|
||||
<property>
|
||||
<name>dfs.namenode.name.dir</name>
|
||||
<value>file:///home/hadoop/hadoopdata/hdfs/namenode</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>dfs.datanode.data.dir</name>
|
||||
<value>file:///home/hadoop/hadoopdata/hdfs/datanode</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>dfs.replication</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>" >> $HADOOP_HOME/etc/hadoop/hdfs-site.xml
|
||||
|
||||
sed -i '/<configuration>/,/<\/configuration>/d' $HADOOP_HOME/etc/hadoop/mapred-site.xml
|
||||
echo "<configuration>
|
||||
<property>
|
||||
<name>yarn.app.mapreduce.am.env</name>
|
||||
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapreduce.map.env</name>
|
||||
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapreduce.reduce.env</name>
|
||||
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
|
||||
</property>
|
||||
</configuration>" >> $HADOOP_HOME/etc/hadoop/mapred-site.xml
|
||||
|
||||
sed -i '/<configuration>/,/<\/configuration>/d' $HADOOP_HOME/etc/hadoop/yarn-site.xml
|
||||
echo "<configuration>
|
||||
<property>
|
||||
<name>yarn.nodemanager.aux-services</name>
|
||||
<value>mapreduce_shuffle</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>yarn.resourcemanager.hostname</name>
|
||||
<value>localhost</value>
|
||||
</property>
|
||||
</configuration>" >> $HADOOP_HOME/etc/hadoop/yarn-site.xml
|
||||
|
||||
```
|
||||
|
||||
8. Format HDFS namenode:
|
||||
|
||||
```shell
|
||||
hdfs namenode -format # Format hdfs namenode
|
||||
```
|
||||
|
||||
9. Start hadoop cluster:
|
||||
|
||||
```shell
|
||||
start-all.sh
|
||||
jps
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> Visit `localhost:9870` on your browser!
|
||||
|
||||
---
|
||||
|
||||
## Manually start/stop services
|
||||
|
||||
### Start all services
|
||||
|
||||
```shell
|
||||
# start
|
||||
hdfs --daemon start namenode
|
||||
hdfs --daemon start datanode
|
||||
yarn --daemon start nodemanager
|
||||
yarn --daemon start resourcemanager
|
||||
hdfs --daemon start secondarynamenode
|
||||
hdfs dfsadmin -report
|
||||
yarn node -list
|
||||
jps # Check status
|
||||
```
|
||||
|
||||
### Stop all services
|
||||
|
||||
```shell
|
||||
# stop
|
||||
hdfs --daemon stop namenode
|
||||
hdfs --daemon stop datanode
|
||||
yarn --daemon stop resourcemanager
|
||||
yarn --daemon stop nodemanager
|
||||
hdfs --daemon stop secondarynamenode
|
||||
jps # Check status
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
1. https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
|
||||
2. https://medium.com/@abhikdey06/apache-hadoop-3-3-6-installation-on-ubuntu-22-04-14516bceec85
|
||||
@@ -0,0 +1,130 @@
|
||||
# B1 - Hadoop Word Count
|
||||
|
||||
> [!NOTE]
|
||||
> These are generic instructions, need to refine them.
|
||||
|
||||
---
|
||||
|
||||
1. Copy and paste the following code in `WordCount.java` file:
|
||||
|
||||
```java
|
||||
import java.io.IOException;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
import org.apache.hadoop.mapreduce.Reducer;
|
||||
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
|
||||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
||||
|
||||
public class WordCount {
|
||||
|
||||
// Mapper Class
|
||||
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
|
||||
private final static IntWritable one = new IntWritable(1);
|
||||
private Text word = new Text();
|
||||
|
||||
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
|
||||
StringTokenizer itr = new StringTokenizer(value.toString());
|
||||
while (itr.hasMoreTokens()) {
|
||||
word.set(itr.nextToken().toLowerCase().replaceAll("[^a-zA-Z0-9]", ""));
|
||||
if (!word.toString().isEmpty()) {
|
||||
context.write(word, one);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reducer Class
|
||||
public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
|
||||
private IntWritable result = new IntWritable();
|
||||
|
||||
public void reduce(Text key, Iterable<IntWritable> values, Context context)
|
||||
throws IOException, InterruptedException {
|
||||
int sum = 0;
|
||||
for (IntWritable val : values) {
|
||||
sum += val.get();
|
||||
}
|
||||
result.set(sum);
|
||||
context.write(key, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if (args.length != 2) {
|
||||
System.err.println("Usage: WordCount <input path> <output path>");
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
Job job = Job.getInstance(conf, "word count");
|
||||
|
||||
job.setJarByClass(WordCount.class);
|
||||
job.setMapperClass(TokenizerMapper.class);
|
||||
job.setCombinerClass(IntSumReducer.class); // optional
|
||||
job.setReducerClass(IntSumReducer.class);
|
||||
|
||||
job.setOutputKeyClass(Text.class);
|
||||
job.setOutputValueClass(IntWritable.class);
|
||||
|
||||
FileInputFormat.addInputPath(job, new Path(args[0]));
|
||||
FileOutputFormat.setOutputPath(job, new Path(args[1]));
|
||||
|
||||
System.exit(job.waitForCompletion(true) ? 0 : 1);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
2. Create an `input.txt` file in the same directory as the above code:
|
||||
|
||||
```text
|
||||
This is a sample code.
|
||||
All the way from KSKA Git.
|
||||
Hello world
|
||||
Meow meow meow meow
|
||||
```
|
||||
|
||||
3. In the same directory, open a `Terminal` window and compile the Java code:
|
||||
|
||||
```shell
|
||||
javac -classpath `hadoop classpath` -d . WordCount.java
|
||||
jar cvf WordCount.jar *.class
|
||||
jar tf WordCount.jar
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> Compiled code will be saved in the current working directory.
|
||||
|
||||
4. Create an input directory in Hadoop FS (do NOT create the output directory; the job creates it and fails if it already exists):
|
||||
|
||||
```shell
|
||||
hadoop fs -mkdir -p /user/hadoop/input # -p also creates the missing parent /user/hadoop
|
||||
hadoop fs -rm -r -f /user/hadoop/output # Remove any stale output directory left by a previous run
|
||||
```
|
||||
|
||||
5. Upload the `input.txt` file to Hadoop FS:
|
||||
|
||||
```shell
|
||||
hadoop fs -put input.txt /user/hadoop/input/
|
||||
```
|
||||
|
||||
6. Run the WordCount job:
|
||||
|
||||
```shell
|
||||
hadoop jar WordCount.jar WordCount /user/hadoop/input/input.txt /user/hadoop/output
|
||||
```
|
||||
|
||||
7. View the output:
|
||||
|
||||
```shell
|
||||
hadoop fs -cat /user/hadoop/output/part-r-00000
|
||||
```
|
||||
|
||||
---
|
||||
BIN
Binary file not shown.
@@ -0,0 +1,72 @@
|
||||
import java.io.IOException;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
import org.apache.hadoop.mapreduce.Reducer;
|
||||
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
|
||||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
||||
|
||||
public class WordCount {
|
||||
|
||||
// Mapper Class
|
||||
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
|
||||
private final static IntWritable one = new IntWritable(1);
|
||||
private Text word = new Text();
|
||||
|
||||
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
|
||||
StringTokenizer itr = new StringTokenizer(value.toString());
|
||||
while (itr.hasMoreTokens()) {
|
||||
word.set(itr.nextToken().toLowerCase().replaceAll("[^a-zA-Z0-9]", ""));
|
||||
if (!word.toString().isEmpty()) {
|
||||
context.write(word, one);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reducer Class
|
||||
public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
|
||||
private IntWritable result = new IntWritable();
|
||||
|
||||
public void reduce(Text key, Iterable<IntWritable> values, Context context)
|
||||
throws IOException, InterruptedException {
|
||||
int sum = 0;
|
||||
for (IntWritable val : values) {
|
||||
sum += val.get();
|
||||
}
|
||||
result.set(sum);
|
||||
context.write(key, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if (args.length != 2) {
|
||||
System.err.println("Usage: WordCount <input path> <output path>");
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
Job job = Job.getInstance(conf, "word count");
|
||||
|
||||
job.setJarByClass(WordCount.class);
|
||||
job.setMapperClass(TokenizerMapper.class);
|
||||
job.setCombinerClass(IntSumReducer.class); // optional
|
||||
job.setReducerClass(IntSumReducer.class);
|
||||
|
||||
job.setOutputKeyClass(Text.class);
|
||||
job.setOutputValueClass(IntWritable.class);
|
||||
|
||||
FileInputFormat.addInputPath(job, new Path(args[0]));
|
||||
FileOutputFormat.setOutputPath(job, new Path(args[1]));
|
||||
|
||||
System.exit(job.waitForCompletion(true) ? 0 : 1);
|
||||
}
|
||||
}
|
||||
Binary file not shown.
@@ -0,0 +1,117 @@
|
||||
# Additional Codes for Apache Scala (Code-B4)
|
||||
|
||||
---
|
||||
|
||||
## 1. Even-Odd Check
|
||||
|
||||
```scala
|
||||
object EvenOddCheck {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val number = 15
|
||||
if (number % 2 == 0) {
|
||||
println(s"$number is even")
|
||||
} else {
|
||||
println(s"$number is odd")
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 2. Factorial
|
||||
|
||||
```scala
|
||||
object Factorial {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val num = 5
|
||||
var factorial = 1
|
||||
for (i <- 1 to num) {
|
||||
factorial *= i
|
||||
}
|
||||
println(s"The factorial of $num is $factorial")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 3. String reversal
|
||||
|
||||
```scala
|
||||
object ReverseString {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val str = "Scala"
|
||||
val reversed = str.reverse
|
||||
println(s"The reverse of '$str' is '$reversed'")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 4. Find largest element in an array
|
||||
|
||||
```scala
|
||||
object FindLargest {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val numbers = Array(10, 20, 30, 40, 50)
|
||||
val largest = numbers.max
|
||||
println(s"The largest number in the array is $largest")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 5. Sum of two numbers
|
||||
|
||||
```scala
|
||||
object SumOfTwoNumbers {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val num1 = 10
|
||||
val num2 = 20
|
||||
val sum = num1 + num2
|
||||
println(s"The sum of $num1 and $num2 is $sum")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 6. Sum of two numbers (with user input)
|
||||
|
||||
```scala
|
||||
import scala.io.StdIn
|
||||
|
||||
object AddTwoNumbers {
|
||||
def main(args: Array[String]): Unit = {
|
||||
println("Enter the first number:")
|
||||
val num1 = StdIn.readInt()
|
||||
println("Enter the second number:")
|
||||
val num2 = StdIn.readInt()
|
||||
|
||||
val sum = num1 + num2
|
||||
println(s"The sum of $num1 and $num2 is $sum")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 7. Simple Calculator
|
||||
|
||||
```scala
|
||||
import scala.io.StdIn
|
||||
|
||||
object SimpleCalculator {
|
||||
def main(args: Array[String]): Unit = {
|
||||
println("Enter the first number:")
|
||||
val num1 = StdIn.readDouble()
|
||||
println("Enter an operator (+, -, *, /):")
|
||||
val operator = StdIn.readChar()
|
||||
println("Enter the second number:")
|
||||
val num2 = StdIn.readDouble()
|
||||
|
||||
val result = operator match {
|
||||
case '+' => num1 + num2
|
||||
case '-' => num1 - num2
|
||||
case '*' => num1 * num2
|
||||
case '/' => if (num2 != 0) num1 / num2 else "undefined (division by zero)"
|
||||
case _ => "Invalid operator"
|
||||
}
|
||||
|
||||
println(s"The result is: $result")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
@@ -0,0 +1,72 @@
|
||||
# B4 - Apache Scala
|
||||
|
||||
✅ Tested and working as intended.
|
||||
|
||||
---
|
||||
|
||||
1. Open your `Terminal` and run the following commands:
|
||||
|
||||
```shell
|
||||
source ~/.bashrc # Not really needed but still
|
||||
start-master.sh # Considering the shell script is defined in PATH
|
||||
spark-shell # This will start scala CLI
|
||||
```
|
||||
|
||||
2. Enter paste mode:
|
||||
|
||||
```scala
|
||||
:paste
|
||||
```
|
||||
|
||||
3. Paste the following code:
|
||||
|
||||
```scala
|
||||
object EvenOddCheck {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val number = 15
|
||||
if (number % 2 == 0) {
|
||||
println(s"$number is even")
|
||||
} else {
|
||||
println(s"$number is odd")
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> If user input is required, use this code instead:
|
||||
|
||||
```scala
|
||||
import scala.io.StdIn
|
||||
|
||||
object AddTwoNumbers {
|
||||
def main(args: Array[String]): Unit = {
|
||||
println("Enter the first number:")
|
||||
val num1 = StdIn.readInt()
|
||||
println("Enter the second number:")
|
||||
val num2 = StdIn.readInt()
|
||||
val sum = num1 + num2
|
||||
println(s"The sum of $num1 and $num2 is $sum")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
4. Press `Ctrl + D` to exit paste mode
|
||||
|
||||
5. Execute:
|
||||
|
||||
```scala
|
||||
EvenOddCheck.main(Array.empty[String])
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> To execute the user-input version of the code, run:
|
||||
|
||||
```scala
|
||||
AddTwoNumbers.main(Array.empty[String])
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> Additional Scala codes are available [here](https://git.kska.io/sppu-te-comp-content/DataScienceAndBigDataAnalytics/src/branch/main/Codes/Code-B4%20%28Apache%20Scala%29%20-%20Alternatives.md).
|
||||
|
||||
---
|
||||
Reference in New Issue
Block a user