DataScienceAndBigDataAnalytics/Notebooks/Notebook-A3 (Descriptive Statistics).ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a46c141c-0657-4870-81d2-38d814c05877",
   "metadata": {},
   "source": [
    "# Notebook-A3 (Descriptive Statistics)\n",
    "\n",
    "- Measures of central tendency and variability\n",
    "- Part 1 uses a small dataset generated in this notebook; Part 2 loads `iris.csv`"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f7cf5345-dd87-41be-adb6-6fcec33f9255",
   "metadata": {},
   "source": [
    "## Problem Statement - Part 1 (data.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "997254c8-a0c4-4cc6-a360-9a9534e89f4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import library\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3326aae8-b9c4-448a-83f6-6060f84c7749",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate data\n",
    "data = {\n",
    "    'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],\n",
    "    'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],\n",
    "    'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']\n",
    "}\n",
    "\n",
    "# Define data in DataFrame\n",
    "df = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "31afc8b3-a7ef-492e-bebc-4a079cc6b402",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           count          mean          std      min      25%      50%  \\\n",
      "age_group                                                                \n",
      "20-30        3.0  52333.333333  8736.894948  45000.0  47500.0  50000.0   \n",
      "30-40        2.0  62500.000000  3535.533906  60000.0  61250.0  62500.0   \n",
      "40-50        2.0  72500.000000  3535.533906  70000.0  71250.0  72500.0   \n",
      "50-60        3.0  85000.000000  5000.000000  80000.0  82500.0  85000.0   \n",
      "\n",
      "               75%      max  \n",
      "age_group                    \n",
      "20-30      56000.0  62000.0  \n",
      "30-40      63750.0  65000.0  \n",
      "40-50      73750.0  75000.0  \n",
      "50-60      87500.0  90000.0  \n"
     ]
    }
   ],
   "source": [
    "# Per-age-group summary of income: describe() reports count, mean, std, min, quartiles and max\n",
    "income_by_age_group = df.groupby('age_group')['income']\n",
    "summary_stats = income_by_age_group.describe()\n",
    "\n",
    "# Show the summary table\n",
    "print(summary_stats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7bf97efc-1450-4289-a31f-15a7a9629743",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Median Income by Age Group:\n",
      "age_group\n",
      "20-30    50000.0\n",
      "30-40    62500.0\n",
      "40-50    72500.0\n",
      "50-60    85000.0\n",
      "Name: income, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Median income within each age group\n",
    "income_by_age_group = df.groupby('age_group')['income']\n",
    "median_income = income_by_age_group.median()\n",
    "\n",
    "# Print the heading, then the per-group medians on the following lines\n",
    "print(\"Median Income by Age Group:\", median_income, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3e69e1ef-9bdb-4ca9-8408-aa3cd09bcb64",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column Names: Index(['age', 'income', 'age_group'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Show the column labels of the DataFrame (printed as a pandas Index)\n",
    "column_names = df.columns\n",
    "print(\"Column Names:\", column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "86d9558e-7e50-4982-914a-9f409b67fa19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Modified dataset with repeated values\n",
    "data = {\n",
    "    'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],\n",
    "    'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],\n",
    "    'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']\n",
    "}\n",
    "\n",
    "# Define data in DataFrame\n",
    "df = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "87006581-7ccd-4c4e-9db0-49448f651aea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mode of Age: [25]\n",
      "Mode of Income: [50000]\n"
     ]
    }
   ],
   "source": [
    "# Most frequent value(s) of the two numeric columns.\n",
    "# mode() returns a Series rather than a scalar, so tied values would all be listed.\n",
    "age_column, income_column = df['age'], df['income']\n",
    "mode_age = age_column.mode()\n",
    "mode_income = income_column.mode()\n",
    "\n",
    "# Print the modes as plain arrays\n",
    "print(f\"Mode of Age: {mode_age.values}\")\n",
    "print(f\"Mode of Income: {mode_income.values}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43a96895-7a33-40d9-bc8e-e0a3c7f140f5",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7db076ac-079b-478a-b41c-c972ba2ca0b4",
   "metadata": {},
   "source": [
    "## Problem Statement - Part 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5cd29330-9fcc-4023-a7c0-d791a19172eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   sepal.length  sepal.width  petal.length  petal.width variety\n",
      "0           5.1          3.5           1.4          0.2  Setosa\n",
      "1           4.9          3.0           1.4          0.2  Setosa\n",
      "2           4.7          3.2           1.3          0.2  Setosa\n",
      "3           4.6          3.1           1.5          0.2  Setosa\n",
      "4           5.0          3.6           1.4          0.2  Setosa\n"
     ]
    }
   ],
   "source": [
    "# Read iris.csv (relative path) into df.\n",
    "# Note: this rebinds df, replacing the income DataFrame used in Part 1.\n",
    "iris_csv = 'iris.csv'\n",
    "df = pd.read_csv(iris_csv)\n",
    "\n",
    "# Preview the first 5 rows\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e6f74058-9c12-4f09-a376-29583480ac91",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Percentiles by Species:\n",
      "                 sepal.length  sepal.width  petal.length  petal.width\n",
      "variety                                                              \n",
      "Setosa     0.25         4.800        3.200         1.400          0.2\n",
      "           0.50         5.000        3.400         1.500          0.2\n",
      "           0.75         5.200        3.675         1.575          0.3\n",
      "Versicolor 0.25         5.600        2.525         4.000          1.2\n",
      "           0.50         5.900        2.800         4.350          1.3\n",
      "           0.75         6.300        3.000         4.600          1.5\n",
      "Virginica  0.25         6.225        2.800         5.100          1.8\n",
      "           0.50         6.500        3.000         5.550          2.0\n",
      "           0.75         6.900        3.175         5.875          2.3\n"
     ]
    }
   ],
   "source": [
    "# Group the rows by variety once and reuse the grouping for both computations\n",
    "by_variety = df.groupby('variety')\n",
    "\n",
    "# Full summary statistics per variety (computed once; kept for inspection, not printed in this cell)\n",
    "summary_stats_species = by_variety.describe()\n",
    "\n",
    "# 25th, 50th and 75th percentiles of every measurement column within each variety\n",
    "percentiles = by_variety.quantile([0.25, 0.5, 0.75])\n",
    "\n",
    "# Display the percentiles (heading is preceded by a blank line)\n",
    "print(\"\\nPercentiles by Species:\")\n",
    "print(percentiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2116e46c-a150-4df9-82b9-a4684007ba59",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Summary Statistics by Species for Sepal Width:\n",
      "            count   mean       std  min    25%  50%    75%  max\n",
      "variety                                                        \n",
      "Setosa       50.0  3.428  0.379064  2.3  3.200  3.4  3.675  4.4\n",
      "Versicolor   50.0  2.770  0.313798  2.0  2.525  2.8  3.000  3.4\n",
      "Virginica    50.0  2.974  0.322497  2.2  2.800  3.0  3.175  3.8\n"
     ]
    }
   ],
   "source": [
    "# Summary statistics of sepal.width within each variety\n",
    "sepal_width_by_variety = df.groupby('variety')['sepal.width']\n",
    "summary_stats_species = sepal_width_by_variety.describe()\n",
    "\n",
    "# Heading (preceded by a blank line), then the table on the following lines\n",
    "print(\"\\nSummary Statistics by Species for Sepal Width:\", summary_stats_species, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "47148444-1790-46a9-93ce-dba152e58894",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Median Values by Species:\n",
      "            sepal.length  sepal.width  petal.length  petal.width\n",
      "variety                                                         \n",
      "Setosa               5.0          3.4          1.50          0.2\n",
      "Versicolor           5.9          2.8          4.35          1.3\n",
      "Virginica            6.5          3.0          5.55          2.0\n"
     ]
    }
   ],
   "source": [
    "# Median of every measurement column within each variety\n",
    "by_variety = df.groupby('variety')\n",
    "median_values = by_variety.median()\n",
    "\n",
    "# Heading, then the table of medians on the following lines\n",
    "print(\"Median Values by Species:\", median_values, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3adfcaa8-d3e2-483d-9715-4d1a95cd2da9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Median Sepal Length by Species:\n",
      "variety\n",
      "Setosa        5.0\n",
      "Versicolor    5.9\n",
      "Virginica     6.5\n",
      "Name: sepal.length, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Median of the sepal.length column within each variety\n",
    "sepal_length_by_variety = df.groupby('variety')['sepal.length']\n",
    "median_sepal_length = sepal_length_by_variety.median()\n",
    "\n",
    "# Heading, then the per-variety medians on the following lines\n",
    "print(\"Median Sepal Length by Species:\", median_sepal_length, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "4a27f1b8-91d0-43a6-8593-983ddf5f9c58",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mode of Width: [3.]\n"
     ]
    }
   ],
   "source": [
    "# Most frequent sepal.width value(s) over all rows (no grouping by variety).\n",
    "# mode() returns a Series, so tied values would all be listed.\n",
    "sepal_width = df['sepal.width']\n",
    "mode_width = sepal_width.mode()\n",
    "print(f\"Mode of Width: {mode_width.values}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}