Files
DataScienceAndBigDataAnalytics/Codes/Code-A10 (Data Visualization-3).md
T

2.5 KiB

A10 - Data Visualization-3

Tested and working as intended.


Pre-requisites

  • Install required libraries: pandas, seaborn & matplotlib
pip install pandas matplotlib seaborn

  1. Import libraries:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
  1. Load dataset into Pandas DataFrame:
# Read the iris data from 'iris.csv'; the relative path is resolved against the current working directory.
df = pd.read_csv('iris.csv')
# Preview the first rows. As a bare expression this is displayed only in a notebook/REPL cell.
df.head()
  1. Features & their datatypes:
print("Feature and their types:")
# Bare expression: a notebook/REPL cell echoes the dtype of every column; a plain script would need print(df.dtypes).
df.dtypes
  1. Histogram for each numerical feature:
# Draw one histogram per measurement column on a 2x2 grid (room for up to four features).
plt.figure(figsize=(12, 6))

# Every column except the last, which is assumed to hold the class label rather than a measurement.
for i, column in enumerate(df.columns[:-1]):
    ax = plt.subplot(2, 2, i + 1)

    # hist() returns (counts, bin_edges, bars); the bars container is exactly what bar_label() annotates,
    # so there is no need to look it up again through the current axes.
    _, _, bars = ax.hist(df[column], edgecolor="black")
    ax.bar_label(bars, fmt='%d')  # Write each bin's count above its bar
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")

# Prevent titles and axis labels of neighbouring subplots from overlapping.
plt.tight_layout()
plt.show()
  1. Boxplot (for identifying outliers in this case):
# Draw one horizontal boxplot per measurement column on a 2x2 grid (room for up to four features).
plt.figure(figsize=(12, 6))
# Every column except the last, which is assumed to hold the class label rather than a measurement.
for i, column in enumerate(df.columns[:-1]):
    ax = plt.subplot(2, 2, i + 1)
    # sns.boxplot draws onto `ax` and returns that same Axes, so the return value is not kept.
    # Points plotted beyond the whiskers are the candidate outliers.
    sns.boxplot(x=df[column], ax=ax, color='salmon')
    ax.set_title(f"Boxplot of {column}")

# Prevent titles and axis labels of neighbouring subplots from overlapping.
plt.tight_layout()
plt.show()
  1. Detecting outliers:
# Print summary statistics and the IQR-rule outliers for every measurement column.
# Every column except the last, which is assumed to hold the class label rather than a measurement.
for column in df.columns[:-1]:
    values = df[column]  # Select the column once and reuse it below

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)

    IQR = Q3 - Q1

    # Anything more than 1.5 * IQR outside the quartiles is reported as an outlier.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = values[(values < lower_bound) | (values > upper_bound)]

    print(f"\nFeature: {column}")
    print(f"  Mean: {values.mean():.2f}, Median: {values.median():.2f}, Std Dev: {values.std():.2f}")
    print(f"  Outliers Detected: {'Yes' if not outliers.empty else 'No'}")
    # List the offending values on their own line, only when there are any.
    if not outliers.empty:
        print(f"  Outlier Values: {outliers.tolist()}")
    print("-" * 40)
  1. Violin plot:
# One violin plot per measurement column, split by the 'variety' label, on a 2x2 grid.
plt.figure(figsize=(12, 8))
variety = df["variety"]
# Every column except the last is plotted; the last one is skipped.
for position, feature in enumerate(df.columns[:-1], start=1):
    axis = plt.subplot(2, 2, position)
    # 'variety' is passed as both x and hue so each group gets its own colour from the palette.
    sns.violinplot(x=variety, y=df[feature], hue=variety, palette="Set2", ax=axis)
    axis.set_title(f"Violin Plot of {feature} by variety")

plt.tight_layout()
plt.show()

References