Added codes, datasets and Jupyter notebooks directory.

2025-06-11 13:48:53 +05:30
parent b3a22e9b79
commit 76dc1de8db
32 changed files with 8930 additions and 0 deletions
@@ -0,0 +1,379 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a46c141c-0657-4870-81d2-38d814c05877",
+   "metadata": {},
+   "source": [
+    "# Notebook-A3 (Descriptive Statistics)\n",
+    "\n",
+    "- Measures of Central Tendency and variability\n",
+    "- Datasets: Part 1 data is generated in this notebook; Part 2 loads iris.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7cf5345-dd87-41be-adb6-6fcec33f9255",
+   "metadata": {},
+   "source": [
+    "## Problem Statement - Part 1 (data.csv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "997254c8-a0c4-4cc6-a360-9a9534e89f4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import library\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3326aae8-b9c4-448a-83f6-6060f84c7749",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate data\n",
+    "data = {\n",
+    "    'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],\n",
+    "    'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],\n",
+    "    'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']\n",
+    "}\n",
+    "\n",
+    "# Define data in DataFrame\n",
+    "df = pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "31afc8b3-a7ef-492e-bebc-4a079cc6b402",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "           count          mean          std      min      25%      50%  \\\n",
+      "age_group                                                                \n",
+      "20-30        3.0  52333.333333  8736.894948  45000.0  47500.0  50000.0   \n",
+      "30-40        2.0  62500.000000  3535.533906  60000.0  61250.0  62500.0   \n",
+      "40-50        2.0  72500.000000  3535.533906  70000.0  71250.0  72500.0   \n",
+      "50-60        3.0  85000.000000  5000.000000  80000.0  82500.0  85000.0   \n",
+      "\n",
+      "               75%      max  \n",
+      "age_group                    \n",
+      "20-30      56000.0  62000.0  \n",
+      "30-40      63750.0  65000.0  \n",
+      "40-50      73750.0  75000.0  \n",
+      "50-60      87500.0  90000.0  \n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group the data by age_group and compute summary statistics for 'income'\n",
+    "summary_stats = df.groupby('age_group')['income'].describe()\n",
+    "\n",
+    "# Print summary\n",
+    "print(summary_stats)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7bf97efc-1450-4289-a31f-15a7a9629743",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Median Income by Age Group:\n",
+      "age_group\n",
+      "20-30    50000.0\n",
+      "30-40    62500.0\n",
+      "40-50    72500.0\n",
+      "50-60    85000.0\n",
+      "Name: income, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group the data by age_group; Select income column for each of the groups created; Calculate median for income\n",
+    "median_income = df.groupby('age_group')['income'].median()\n",
+    "\n",
+    "# Print the median\n",
+    "print(\"Median Income by Age Group:\")\n",
+    "print(median_income)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "3e69e1ef-9bdb-4ca9-8408-aa3cd09bcb64",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Column Names: Index(['age', 'income', 'age_group'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print column names\n",
+    "print(\"Column Names:\", df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "86d9558e-7e50-4982-914a-9f409b67fa19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Modified dataset with repeated values\n",
+    "data = {\n",
+    "    'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],\n",
+    "    'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],\n",
+    "    'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']\n",
+    "}\n",
+    "\n",
+    "# Define data in DataFrame\n",
+    "df = pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "87006581-7ccd-4c4e-9db0-49448f651aea",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mode of Age: [25]\n",
+      "Mode of Income: [50000]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate the mode for each column\n",
+    "mode_age = df['age'].mode()\n",
+    "mode_income = df['income'].mode()\n",
+    "print(f\"Mode of Age: {mode_age.values}\")\n",
+    "print(f\"Mode of Income: {mode_income.values}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43a96895-7a33-40d9-bc8e-e0a3c7f140f5",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7db076ac-079b-478a-b41c-c972ba2ca0b4",
+   "metadata": {},
+   "source": [
+    "## Problem Statement - Part 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "5cd29330-9fcc-4023-a7c0-d791a19172eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   sepal.length  sepal.width  petal.length  petal.width variety\n",
+      "0           5.1          3.5           1.4          0.2  Setosa\n",
+      "1           4.9          3.0           1.4          0.2  Setosa\n",
+      "2           4.7          3.2           1.3          0.2  Setosa\n",
+      "3           4.6          3.1           1.5          0.2  Setosa\n",
+      "4           5.0          3.6           1.4          0.2  Setosa\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load iris.csv in the DataFrame\n",
+    "df = pd.read_csv('iris.csv')\n",
+    "\n",
+    "print(df.head()) # Print first 5 rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "e6f74058-9c12-4f09-a376-29583480ac91",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Percentiles by Species:\n",
+      "                 sepal.length  sepal.width  petal.length  petal.width\n",
+      "variety                                                              \n",
+      "Setosa     0.25         4.800        3.200         1.400          0.2\n",
+      "           0.50         5.000        3.400         1.500          0.2\n",
+      "           0.75         5.200        3.675         1.575          0.3\n",
+      "Versicolor 0.25         5.600        2.525         4.000          1.2\n",
+      "           0.50         5.900        2.800         4.350          1.3\n",
+      "           0.75         6.300        3.000         4.600          1.5\n",
+      "Virginica  0.25         6.225        2.800         5.100          1.8\n",
+      "           0.50         6.500        3.000         5.550          2.0\n",
+      "           0.75         6.900        3.175         5.875          2.3\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group the data by species and display summary statistics\n",
+    "summary_stats_species = df.groupby('variety').describe()\n",
+    "\n",
+    "# Compute specific percentiles and statistics\n",
+    "percentiles = df.groupby('variety').quantile([0.25, 0.5, 0.75])\n",
+    "\n",
+    "# Summary statistics are recomputed here but not printed; only the percentiles are displayed below\n",
+    "summary_stats_species = df.groupby('variety').describe()\n",
+    "\n",
+    "print(\"\\nPercentiles by Species:\")\n",
+    "print(percentiles)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "2116e46c-a150-4df9-82b9-a4684007ba59",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Summary Statistics by Species for Sepal Width:\n",
+      "            count   mean       std  min    25%  50%    75%  max\n",
+      "variety                                                        \n",
+      "Setosa       50.0  3.428  0.379064  2.3  3.200  3.4  3.675  4.4\n",
+      "Versicolor   50.0  2.770  0.313798  2.0  2.525  2.8  3.000  3.4\n",
+      "Virginica    50.0  2.974  0.322497  2.2  2.800  3.0  3.175  3.8\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics\n",
+    "summary_stats_species = df.groupby('variety')['sepal.width'].describe()\n",
+    "\n",
+    "print(\"\\nSummary Statistics by Species for Sepal Width:\")\n",
+    "print(summary_stats_species)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "47148444-1790-46a9-93ce-dba152e58894",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Median Values by Species:\n",
+      "            sepal.length  sepal.width  petal.length  petal.width\n",
+      "variety                                                         \n",
+      "Setosa               5.0          3.4          1.50          0.2\n",
+      "Versicolor           5.9          2.8          4.35          1.3\n",
+      "Virginica            6.5          3.0          5.55          2.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group by variety and compute the median for numeric columns\n",
+    "median_values = df.groupby('variety').median()\n",
+    "\n",
+    "print(\"Median Values by Species:\")\n",
+    "print(median_values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "3adfcaa8-d3e2-483d-9715-4d1a95cd2da9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Median Sepal Length by Species:\n",
+      "variety\n",
+      "Setosa        5.0\n",
+      "Versicolor    5.9\n",
+      "Virginica     6.5\n",
+      "Name: sepal.length, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group the data by variety; Select sepal.length column for each of the groups created; Display median\n",
+    "median_sepal_length = df.groupby('variety')['sepal.length'].median()\n",
+    "print(\"Median Sepal Length by Species:\")\n",
+    "print(median_sepal_length)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "4a27f1b8-91d0-43a6-8593-983ddf5f9c58",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mode of Width: [3.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate & print mode for sepal.width\n",
+    "mode_width = df['sepal.width'].mode()\n",
+    "print(f\"Mode of Width: {mode_width.values}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}