Added codes, datasets and Jupyter notebooks directory.
This commit is contained in:
@@ -0,0 +1,379 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a46c141c-0657-4870-81d2-38d814c05877",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook-A3 (Descriptive Statistics)\n",
|
||||
"\n",
|
||||
"- Measures of Central Tendency and variability\n",
|
||||
"- Part 1 uses a dataset generated in this code; Part 2 loads iris.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f7cf5345-dd87-41be-adb6-6fcec33f9255",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Problem Statement - Part 1 (data.csv)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "997254c8-a0c4-4cc6-a360-9a9534e89f4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import library\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "3326aae8-b9c4-448a-83f6-6060f84c7749",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate data\n",
|
||||
"data = {\n",
|
||||
" 'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],\n",
|
||||
" 'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],\n",
|
||||
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Define data in DataFrame\n",
|
||||
"df = pd.DataFrame(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "31afc8b3-a7ef-492e-bebc-4a079cc6b402",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" count mean std min 25% 50% \\\n",
|
||||
"age_group \n",
|
||||
"20-30 3.0 52333.333333 8736.894948 45000.0 47500.0 50000.0 \n",
|
||||
"30-40 2.0 62500.000000 3535.533906 60000.0 61250.0 62500.0 \n",
|
||||
"40-50 2.0 72500.000000 3535.533906 70000.0 71250.0 72500.0 \n",
|
||||
"50-60 3.0 85000.000000 5000.000000 80000.0 82500.0 85000.0 \n",
|
||||
"\n",
|
||||
" 75% max \n",
|
||||
"age_group \n",
|
||||
"20-30 56000.0 62000.0 \n",
|
||||
"30-40 63750.0 65000.0 \n",
|
||||
"40-50 73750.0 75000.0 \n",
|
||||
"50-60 87500.0 90000.0 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by age_group and compute summary statistics for 'income'\n",
|
||||
"summary_stats = df.groupby('age_group')['income'].describe()\n",
|
||||
"\n",
|
||||
"# Print summary\n",
|
||||
"print(summary_stats)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "7bf97efc-1450-4289-a31f-15a7a9629743",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Median Income by Age Group:\n",
|
||||
"age_group\n",
|
||||
"20-30 50000.0\n",
|
||||
"30-40 62500.0\n",
|
||||
"40-50 72500.0\n",
|
||||
"50-60 85000.0\n",
|
||||
"Name: income, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by age_group; Select income column for each of the groups created; Calculate median for income\n",
|
||||
"median_income = df.groupby('age_group')['income'].median()\n",
|
||||
"\n",
|
||||
"# Print the median\n",
|
||||
"print(\"Median Income by Age Group:\")\n",
|
||||
"print(median_income)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "3e69e1ef-9bdb-4ca9-8408-aa3cd09bcb64",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Column Names: Index(['age', 'income', 'age_group'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print column names\n",
|
||||
"print(\"Column Names:\", df.columns)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "86d9558e-7e50-4982-914a-9f409b67fa19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Modified dataset with repeated values\n",
|
||||
"data = {\n",
|
||||
" 'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],\n",
|
||||
" 'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],\n",
|
||||
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Define data in DataFrame\n",
|
||||
"df = pd.DataFrame(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "87006581-7ccd-4c4e-9db0-49448f651aea",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Mode of Age: [25]\n",
|
||||
"Mode of Income: [50000]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Calculate the mode for each column\n",
|
||||
"mode_age = df['age'].mode()\n",
|
||||
"mode_income = df['income'].mode()\n",
|
||||
"print(f\"Mode of Age: {mode_age.values}\")\n",
|
||||
"print(f\"Mode of Income: {mode_income.values}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43a96895-7a33-40d9-bc8e-e0a3c7f140f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7db076ac-079b-478a-b41c-c972ba2ca0b4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Problem Statement - Part 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "5cd29330-9fcc-4023-a7c0-d791a19172eb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||||
"0 5.1 3.5 1.4 0.2 Setosa\n",
|
||||
"1 4.9 3.0 1.4 0.2 Setosa\n",
|
||||
"2 4.7 3.2 1.3 0.2 Setosa\n",
|
||||
"3 4.6 3.1 1.5 0.2 Setosa\n",
|
||||
"4 5.0 3.6 1.4 0.2 Setosa\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load iris.csv in the DataFrame\n",
|
||||
"df = pd.read_csv('iris.csv')\n",
|
||||
"\n",
|
||||
"print(df.head()) # Print first 5 rows"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "e6f74058-9c12-4f09-a376-29583480ac91",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Percentiles by Species:\n",
|
||||
" sepal.length sepal.width petal.length petal.width\n",
|
||||
"variety \n",
|
||||
"Setosa 0.25 4.800 3.200 1.400 0.2\n",
|
||||
" 0.50 5.000 3.400 1.500 0.2\n",
|
||||
" 0.75 5.200 3.675 1.575 0.3\n",
|
||||
"Versicolor 0.25 5.600 2.525 4.000 1.2\n",
|
||||
" 0.50 5.900 2.800 4.350 1.3\n",
|
||||
" 0.75 6.300 3.000 4.600 1.5\n",
|
||||
"Virginica 0.25 6.225 2.800 5.100 1.8\n",
|
||||
" 0.50 6.500 3.000 5.550 2.0\n",
|
||||
" 0.75 6.900 3.175 5.875 2.3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by species and display summary statistics\n",
|
||||
"summary_stats_species = df.groupby('variety').describe()\n",
|
||||
"\n",
|
||||
"# Compute specific percentiles and statistics\n",
|
||||
"percentiles = df.groupby('variety').quantile([0.25, 0.5, 0.75])\n",
|
||||
"\n",
|
||||
"# Display summary statistics and percentiles\n",
|
||||
"summary_stats_species = df.groupby('variety').describe()\n",
|
||||
"\n",
|
||||
"print(\"\\nPercentiles by Species:\")\n",
|
||||
"print(percentiles)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "2116e46c-a150-4df9-82b9-a4684007ba59",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Summary Statistics by Species for Sepal Width:\n",
|
||||
" count mean std min 25% 50% 75% max\n",
|
||||
"variety \n",
|
||||
"Setosa 50.0 3.428 0.379064 2.3 3.200 3.4 3.675 4.4\n",
|
||||
"Versicolor 50.0 2.770 0.313798 2.0 2.525 2.8 3.000 3.4\n",
|
||||
"Virginica 50.0 2.974 0.322497 2.2 2.800 3.0 3.175 3.8\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics\n",
|
||||
"summary_stats_species = df.groupby('variety')['sepal.width'].describe()\n",
|
||||
"\n",
|
||||
"print(\"\\nSummary Statistics by Species for Sepal Width:\")\n",
|
||||
"print(summary_stats_species)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "47148444-1790-46a9-93ce-dba152e58894",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Median Values by Species:\n",
|
||||
" sepal.length sepal.width petal.length petal.width\n",
|
||||
"variety \n",
|
||||
"Setosa 5.0 3.4 1.50 0.2\n",
|
||||
"Versicolor 5.9 2.8 4.35 1.3\n",
|
||||
"Virginica 6.5 3.0 5.55 2.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group by variety and compute the median for numeric columns\n",
|
||||
"median_values = df.groupby('variety').median()\n",
|
||||
"\n",
|
||||
"print(\"Median Values by Species:\")\n",
|
||||
"print(median_values)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "3adfcaa8-d3e2-483d-9715-4d1a95cd2da9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Median Sepal Length by Species:\n",
|
||||
"variety\n",
|
||||
"Setosa 5.0\n",
|
||||
"Versicolor 5.9\n",
|
||||
"Virginica 6.5\n",
|
||||
"Name: sepal.length, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by variety; Select sepal.length column for each of the groups created; Display median\n",
|
||||
"median_sepal_length = df.groupby('variety')['sepal.length'].median()\n",
|
||||
"print(\"Median Sepal Length by Species:\")\n",
|
||||
"print(median_sepal_length)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "4a27f1b8-91d0-43a6-8593-983ddf5f9c58",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Mode of Width: [3.]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Calculate & print mode for sepal.width\n",
|
||||
"mode_width = df['sepal.width'].mode()\n",
|
||||
"print(f\"Mode of Width: {mode_width.values}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.20"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user