Files
DataScienceAndBigDataAnalytics/Notebooks/Notebook-A3 (Descriptive Statistics).ipynb

380 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "a46c141c-0657-4870-81d2-38d814c05877",
"metadata": {},
"source": [
"# Notebook-A3 (Descriptive Statistics)\n",
"\n",
"- Measures of central tendency and variability\n",
"- Part 1 uses a small dataset generated in the notebook; Part 2 loads `iris.csv`"
]
},
{
"cell_type": "markdown",
"id": "f7cf5345-dd87-41be-adb6-6fcec33f9255",
"metadata": {},
"source": [
"## Problem Statement - Part 1 (data.csv)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "997254c8-a0c4-4cc6-a360-9a9534e89f4c",
"metadata": {},
"outputs": [],
"source": [
"# Import library\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3326aae8-b9c4-448a-83f6-6060f84c7749",
"metadata": {},
"outputs": [],
"source": [
"# Generate data\n",
"data = {\n",
" 'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],\n",
" 'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],\n",
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']\n",
"}\n",
"\n",
"# Define data in DataFrame\n",
"df = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "31afc8b3-a7ef-492e-bebc-4a079cc6b402",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" count mean std min 25% 50% \\\n",
"age_group \n",
"20-30 3.0 52333.333333 8736.894948 45000.0 47500.0 50000.0 \n",
"30-40 2.0 62500.000000 3535.533906 60000.0 61250.0 62500.0 \n",
"40-50 2.0 72500.000000 3535.533906 70000.0 71250.0 72500.0 \n",
"50-60 3.0 85000.000000 5000.000000 80000.0 82500.0 85000.0 \n",
"\n",
" 75% max \n",
"age_group \n",
"20-30 56000.0 62000.0 \n",
"30-40 63750.0 65000.0 \n",
"40-50 73750.0 75000.0 \n",
"50-60 87500.0 90000.0 \n"
]
}
],
"source": [
"# Group the data by age_group and compute summary statistics for 'income'\n",
"summary_stats = df.groupby('age_group')['income'].describe()\n",
"\n",
"# Print summary\n",
"print(summary_stats)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7bf97efc-1450-4289-a31f-15a7a9629743",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median Income by Age Group:\n",
"age_group\n",
"20-30 50000.0\n",
"30-40 62500.0\n",
"40-50 72500.0\n",
"50-60 85000.0\n",
"Name: income, dtype: float64\n"
]
}
],
"source": [
"# Group the data by age_group; Select income column for each of the groups created; Calculate median for income\n",
"median_income = df.groupby('age_group')['income'].median()\n",
"\n",
"# Print dat median\n",
"print(\"Median Income by Age Group:\")\n",
"print(median_income)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3e69e1ef-9bdb-4ca9-8408-aa3cd09bcb64",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column Names: Index(['age', 'income', 'age_group'], dtype='object')\n"
]
}
],
"source": [
"# Print column names\n",
"print(\"Column Names:\", df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "86d9558e-7e50-4982-914a-9f409b67fa19",
"metadata": {},
"outputs": [],
"source": [
"# Modified dataset with repeated values\n",
"data = {\n",
" 'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],\n",
" 'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],\n",
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']\n",
"}\n",
"\n",
"# Define data in DataFrame\n",
"df = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "87006581-7ccd-4c4e-9db0-49448f651aea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mode of Age: [25]\n",
"Mode of Income: [50000]\n"
]
}
],
"source": [
"# Calculate the mode for each column\n",
"mode_age = df['age'].mode()\n",
"mode_income = df['income'].mode()\n",
"print(f\"Mode of Age: {mode_age.values}\")\n",
"print(f\"Mode of Income: {mode_income.values}\")"
]
},
{
"cell_type": "markdown",
"id": "43a96895-7a33-40d9-bc8e-e0a3c7f140f5",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"id": "7db076ac-079b-478a-b41c-c972ba2ca0b4",
"metadata": {},
"source": [
"## Problem Statement - Part 2"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5cd29330-9fcc-4023-a7c0-d791a19172eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sepal.length sepal.width petal.length petal.width variety\n",
"0 5.1 3.5 1.4 0.2 Setosa\n",
"1 4.9 3.0 1.4 0.2 Setosa\n",
"2 4.7 3.2 1.3 0.2 Setosa\n",
"3 4.6 3.1 1.5 0.2 Setosa\n",
"4 5.0 3.6 1.4 0.2 Setosa\n"
]
}
],
"source": [
"# Load iris.csv in the DataFrame\n",
"df = pd.read_csv('iris.csv')\n",
"\n",
"print(df.head()) # Print first 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e6f74058-9c12-4f09-a376-29583480ac91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Percentiles by Species:\n",
" sepal.length sepal.width petal.length petal.width\n",
"variety \n",
"Setosa 0.25 4.800 3.200 1.400 0.2\n",
" 0.50 5.000 3.400 1.500 0.2\n",
" 0.75 5.200 3.675 1.575 0.3\n",
"Versicolor 0.25 5.600 2.525 4.000 1.2\n",
" 0.50 5.900 2.800 4.350 1.3\n",
" 0.75 6.300 3.000 4.600 1.5\n",
"Virginica 0.25 6.225 2.800 5.100 1.8\n",
" 0.50 6.500 3.000 5.550 2.0\n",
" 0.75 6.900 3.175 5.875 2.3\n"
]
}
],
"source": [
"# Group the data by species and display summary statistics\n",
"summary_stats_species = df.groupby('variety').describe()\n",
"\n",
"# Compute specific percentiles and statistics\n",
"percentiles = df.groupby('variety').quantile([0.25, 0.5, 0.75])\n",
"\n",
"# Display summary statistics and percentiles\n",
"summary_stats_species = df.groupby('variety').describe()\n",
"\n",
"print(\"\\nPercentiles by Species:\")\n",
"print(percentiles)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2116e46c-a150-4df9-82b9-a4684007ba59",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Summary Statistics by Species for Sepal Width:\n",
" count mean std min 25% 50% 75% max\n",
"variety \n",
"Setosa 50.0 3.428 0.379064 2.3 3.200 3.4 3.675 4.4\n",
"Versicolor 50.0 2.770 0.313798 2.0 2.525 2.8 3.000 3.4\n",
"Virginica 50.0 2.974 0.322497 2.2 2.800 3.0 3.175 3.8\n"
]
}
],
"source": [
"# Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics\n",
"summary_stats_species = df.groupby('variety')['sepal.width'].describe()\n",
"\n",
"print(\"\\nSummary Statistics by Species for Sepal Width:\")\n",
"print(summary_stats_species)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "47148444-1790-46a9-93ce-dba152e58894",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median Values by Species:\n",
" sepal.length sepal.width petal.length petal.width\n",
"variety \n",
"Setosa 5.0 3.4 1.50 0.2\n",
"Versicolor 5.9 2.8 4.35 1.3\n",
"Virginica 6.5 3.0 5.55 2.0\n"
]
}
],
"source": [
"# Group by variety and compute the median for numeric columns\n",
"median_values = df.groupby('variety').median()\n",
"\n",
"print(\"Median Values by Species:\")\n",
"print(median_values)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3adfcaa8-d3e2-483d-9715-4d1a95cd2da9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median Sepal Length by Species:\n",
"variety\n",
"Setosa 5.0\n",
"Versicolor 5.9\n",
"Virginica 6.5\n",
"Name: sepal.length, dtype: float64\n"
]
}
],
"source": [
"# Group the data by variety; Select sepal.width column for each of the groups created; Display median\n",
"median_sepal_length = df.groupby('variety')['sepal.length'].median()\n",
"print(\"Median Sepal Length by Species:\")\n",
"print(median_sepal_length)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4a27f1b8-91d0-43a6-8593-983ddf5f9c58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mode of Width: [3.]\n"
]
}
],
"source": [
"# Calculate & print mode for sepal.width\n",
"mode_width = df['sepal.width'].mode()\n",
"print(f\"Mode of Width: {mode_width.values}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}