Added codes, datasets and Jupyter notebooks directory.
This commit is contained in:
@@ -0,0 +1,571 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5ed97617-c927-48b4-857c-c4d75b98fe2c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook-A1 (Data Wrangling-1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a93305d3-03c9-479e-bde3-fb0d06ea8d39",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import libraries\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8bf62160-0384-4a3f-b25e-11e28c8b8df4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>sepal.length</th>\n",
|
||||
" <th>sepal.width</th>\n",
|
||||
" <th>petal.length</th>\n",
|
||||
" <th>petal.width</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>150.000000</td>\n",
|
||||
" <td>150.000000</td>\n",
|
||||
" <td>150.000000</td>\n",
|
||||
" <td>150.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>5.843333</td>\n",
|
||||
" <td>3.057333</td>\n",
|
||||
" <td>3.758000</td>\n",
|
||||
" <td>1.199333</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>0.828066</td>\n",
|
||||
" <td>0.435866</td>\n",
|
||||
" <td>1.765298</td>\n",
|
||||
" <td>0.762238</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td>4.300000</td>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>0.100000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>5.100000</td>\n",
|
||||
" <td>2.800000</td>\n",
|
||||
" <td>1.600000</td>\n",
|
||||
" <td>0.300000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>5.800000</td>\n",
|
||||
" <td>3.000000</td>\n",
|
||||
" <td>4.350000</td>\n",
|
||||
" <td>1.300000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>6.400000</td>\n",
|
||||
" <td>3.300000</td>\n",
|
||||
" <td>5.100000</td>\n",
|
||||
" <td>1.800000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td>7.900000</td>\n",
|
||||
" <td>4.400000</td>\n",
|
||||
" <td>6.900000</td>\n",
|
||||
" <td>2.500000</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" sepal.length sepal.width petal.length petal.width\n",
|
||||
"count 150.000000 150.000000 150.000000 150.000000\n",
|
||||
"mean 5.843333 3.057333 3.758000 1.199333\n",
|
||||
"std 0.828066 0.435866 1.765298 0.762238\n",
|
||||
"min 4.300000 2.000000 1.000000 0.100000\n",
|
||||
"25% 5.100000 2.800000 1.600000 0.300000\n",
|
||||
"50% 5.800000 3.000000 4.350000 1.300000\n",
|
||||
"75% 6.400000 3.300000 5.100000 1.800000\n",
|
||||
"max 7.900000 4.400000 6.900000 2.500000"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load the dataset into DataFrame\n",
|
||||
"df=pd.read_csv('iris.csv')\n",
|
||||
"df.describe() # Print description of DataFrame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "7c813f91-5100-463a-a848-2ef9f46344bc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"First 5 values:\n",
|
||||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||||
"0 5.1 3.5 1.4 0.2 Setosa\n",
|
||||
"1 4.9 3.0 1.4 0.2 Setosa\n",
|
||||
"2 4.7 3.2 1.3 0.2 Setosa\n",
|
||||
"3 4.6 3.1 1.5 0.2 Setosa\n",
|
||||
"4 5.0 3.6 1.4 0.2 Setosa\n",
|
||||
"Last 5 values:\n",
|
||||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||||
"145 6.7 3.0 5.2 2.3 Virginica\n",
|
||||
"146 6.3 2.5 5.0 1.9 Virginica\n",
|
||||
"147 6.5 3.0 5.2 2.0 Virginica\n",
|
||||
"148 6.2 3.4 5.4 2.3 Virginica\n",
|
||||
"149 5.9 3.0 5.1 1.8 Virginica\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print first and last 5 values\n",
|
||||
"print(\"First 5 values:\\n\", df.head())\n",
|
||||
"print (\"Last 5 values:\\n\", df.tail())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "78fa6d44-7c39-4306-a5d7-0fd5ff94243a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0 False\n",
|
||||
"1 False\n",
|
||||
"2 False\n",
|
||||
"3 False\n",
|
||||
"4 False\n",
|
||||
" ... \n",
|
||||
"145 False\n",
|
||||
"146 False\n",
|
||||
"147 False\n",
|
||||
"148 False\n",
|
||||
"149 False\n",
|
||||
"Length: 150, dtype: bool"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Flag duplicated rows (True where a row repeats an earlier row)\n",
|
||||
"df.duplicated()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "5da58b1d-c458-4eb5-b23c-053c02934efd",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>sepal.length</th>\n",
|
||||
" <th>sepal.width</th>\n",
|
||||
" <th>petal.length</th>\n",
|
||||
" <th>petal.width</th>\n",
|
||||
" <th>variety</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>145</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>146</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>147</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>148</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>149</th>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>150 rows × 5 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||||
"0 False False False False False\n",
|
||||
"1 False False False False False\n",
|
||||
"2 False False False False False\n",
|
||||
"3 False False False False False\n",
|
||||
"4 False False False False False\n",
|
||||
".. ... ... ... ... ...\n",
|
||||
"145 False False False False False\n",
|
||||
"146 False False False False False\n",
|
||||
"147 False False False False False\n",
|
||||
"148 False False False False False\n",
|
||||
"149 False False False False False\n",
|
||||
"\n",
|
||||
"[150 rows x 5 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Show a True/False mask of null values for every cell\n",
|
||||
"df.isnull()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3b6face6-d366-4a05-9fd4-baff133e24f6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 150 entries, 0 to 149\n",
|
||||
"Data columns (total 5 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 sepal.length 150 non-null float64\n",
|
||||
" 1 sepal.width 150 non-null float64\n",
|
||||
" 2 petal.length 150 non-null float64\n",
|
||||
" 3 petal.width 150 non-null float64\n",
|
||||
" 4 variety 150 non-null object \n",
|
||||
"dtypes: float64(4), object(1)\n",
|
||||
"memory usage: 6.0+ KB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print summary of DataFrame\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f08a00a4-c82a-49a9-bbaa-427d5cc4db96",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(150, 5)"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print shape as (number of rows, number of columns)\n",
|
||||
"df.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "186866fe-7614-4a4d-b929-31bd37f80027",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0 False\n",
|
||||
"1 False\n",
|
||||
"2 False\n",
|
||||
"3 False\n",
|
||||
"4 False\n",
|
||||
" ... \n",
|
||||
"145 False\n",
|
||||
"146 False\n",
|
||||
"147 False\n",
|
||||
"148 False\n",
|
||||
"149 False\n",
|
||||
"Name: sepal.length, Length: 150, dtype: bool"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print null (true/false) values in sepal.length column\n",
|
||||
"df[\"sepal.length\"].isnull()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "5bc1e082-9818-450d-b76d-5dd3a2103e00",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" sepal.length sepal.width petal.width variety\n",
|
||||
"0 5.1 3.5 0.2 Setosa\n",
|
||||
"1 4.9 3.0 0.2 Setosa\n",
|
||||
"2 4.7 3.2 0.2 Setosa\n",
|
||||
"3 4.6 3.1 0.2 Setosa\n",
|
||||
"4 5.0 3.6 0.2 Setosa\n",
|
||||
".. ... ... ... ...\n",
|
||||
"145 6.7 3.0 2.3 Virginica\n",
|
||||
"146 6.3 2.5 1.9 Virginica\n",
|
||||
"147 6.5 3.0 2.0 Virginica\n",
|
||||
"148 6.2 3.4 2.3 Virginica\n",
|
||||
"149 5.9 3.0 1.8 Virginica\n",
|
||||
"\n",
|
||||
"[150 rows x 4 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Delete/Drop petal.length column\n",
|
||||
"y = df.drop([\"petal.length\"], axis=1) # axis=1 column. For row, axis=0\n",
|
||||
"print(y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "dcccff24-c97e-4832-9cc1-cd701a3a9a34",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||||
"0 5.1 3.5 1.4 0.2 0\n",
|
||||
"1 4.9 3.0 1.4 0.2 0\n",
|
||||
"2 4.7 3.2 1.3 0.2 0\n",
|
||||
"3 4.6 3.1 1.5 0.2 0\n",
|
||||
"4 5.0 3.6 1.4 0.2 0\n",
|
||||
".. ... ... ... ... ...\n",
|
||||
"145 6.7 3.0 5.2 2.3 1\n",
|
||||
"146 6.3 2.5 5.0 1.9 1\n",
|
||||
"147 6.5 3.0 5.2 2.0 1\n",
|
||||
"148 6.2 3.4 5.4 2.3 1\n",
|
||||
"149 5.9 3.0 5.1 1.8 1\n",
|
||||
"\n",
|
||||
"[150 rows x 5 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# In variety column, replace Setosa with 0 and Virginica with 1 (any other label is left unchanged)\n",
|
||||
"df['variety'] = df['variety'].replace(['Setosa', 'Virginica'], [0, 1])  # assign back: inplace=True on a selected column does not update df under pandas Copy-on-Write\n",
|
||||
"print(df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "97d22793-657e-4df7-a7f1-01d70886ef57",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"sepal.length 0\n",
|
||||
"sepal.width 0\n",
|
||||
"petal.length 0\n",
|
||||
"petal.width 0\n",
|
||||
"variety 0\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print sum of NULL values in each column\n",
|
||||
"df.isnull().sum()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e3f0c5c4-a930-4dbc-8300-99fb7cb7c991",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.20"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,983 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6ce3d040-a50f-459c-b494-e172f2897780",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook-A2 (Data Wrangling-2)\n",
|
||||
"\n",
|
||||
"- Uses the “Academic performance” dataset\n",
|
||||
"- Dataset generated here, not imported"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "ce0d3ca6-fec0-4d3f-82b5-8ef92256525a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import libraries\n",
|
||||
"import pandas as pd\n",
|
||||
"# import pandas as shriniwas\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "24b47cb9-c955-4325-ad62-4b215d73398c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"\\nIF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\\n\\ndata = {\\n 'Student_id': [1,2,3,4,5,6,7,8,9,10],\\n 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\\n 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\\n 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\\n 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\\n [58, 96, 61]],\\n 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\\n 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\\n}\\n\""
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Generate data\n",
|
||||
"np.random.seed(50) #for consistency\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" 'Student_id': range(1, 51),\n",
|
||||
" 'Name': ['Student_' + str(i) for i in range(1, 51)],\n",
|
||||
" 'Age': np.random.randint(18, 25, size=50),\n",
|
||||
" 'Gender': np.random.choice(['Male', 'Female'], size=50),\n",
|
||||
" 'Scores': [np.random.randint(50, 100, size=3).tolist() for _ in range(50)],\n",
|
||||
" 'Attendance': np.random.randint(20,100,size=50),\n",
|
||||
" 'Grade': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=50)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"IF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" 'Student_id': [1,2,3,4,5,6,7,8,9,10],\n",
|
||||
" 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\n",
|
||||
" 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\n",
|
||||
" 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\n",
|
||||
" 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\n",
|
||||
" [58, 96, 61]],\n",
|
||||
" 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\n",
|
||||
" 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\n",
|
||||
"}\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "ba84d792-8cff-4a94-b936-6c6ae0bd8527",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Student_id</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>Gender</th>\n",
|
||||
" <th>Scores</th>\n",
|
||||
" <th>Attendance</th>\n",
|
||||
" <th>Grade</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Student_1</td>\n",
|
||||
" <td>18</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[64, 54, 72]</td>\n",
|
||||
" <td>55</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>Student_2</td>\n",
|
||||
" <td>18</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[93, 69, 82]</td>\n",
|
||||
" <td>23</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Student_3</td>\n",
|
||||
" <td>21</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[87, 90, 80]</td>\n",
|
||||
" <td>84</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>Student_4</td>\n",
|
||||
" <td>23</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[94, 93, 85]</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>Student_5</td>\n",
|
||||
" <td>19</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[88, 77, 78]</td>\n",
|
||||
" <td>32</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"0 1 Student_1 18 Female [64, 54, 72] 55 B\n",
|
||||
"1 2 Student_2 18 Male [93, 69, 82] 23 C\n",
|
||||
"2 3 Student_3 21 Female [87, 90, 80] 84 F\n",
|
||||
"3 4 Student_4 23 Female [94, 93, 85] 66 C\n",
|
||||
"4 5 Student_5 19 Male [88, 77, 78] 32 F"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load the generated data into a DataFrame\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"df.head() # Print first 5 rows"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "8d9d0a15-5a81-4b3d-90de-0efd631b0ec8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Assign grades\n",
|
||||
"def assign_grade(scores):\n",
|
||||
" avg_score = np.mean(scores)\n",
|
||||
"\n",
|
||||
" if avg_score > 90:\n",
|
||||
" return 'A'\n",
|
||||
" elif avg_score > 80:\n",
|
||||
" return 'B'\n",
|
||||
" elif avg_score > 70:\n",
|
||||
" return 'C'\n",
|
||||
" elif avg_score > 60:\n",
|
||||
" return 'D'\n",
|
||||
" else:\n",
|
||||
" return 'F'\n",
|
||||
"\n",
|
||||
"df['Grade'] = df['Scores'].apply(assign_grade)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "b2480d57-75c9-48d3-94ec-5b3f0a459007",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Student_id</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>Gender</th>\n",
|
||||
" <th>Scores</th>\n",
|
||||
" <th>Attendance</th>\n",
|
||||
" <th>Grade</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Student_1</td>\n",
|
||||
" <td>18.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[64, 54, 72]</td>\n",
|
||||
" <td>55</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>Student_2</td>\n",
|
||||
" <td>18.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[93, 69, 82]</td>\n",
|
||||
" <td>23</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Student_3</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[87, 90, 80]</td>\n",
|
||||
" <td>84</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>Student_4</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[94, 93, 85]</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>Student_5</td>\n",
|
||||
" <td>19.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[88, 77, 78]</td>\n",
|
||||
" <td>32</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>Student_6</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[81, 90, 65]</td>\n",
|
||||
" <td>96</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>7</td>\n",
|
||||
" <td>Student_7</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[55, 97, 54]</td>\n",
|
||||
" <td>73</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>Student_8</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[54, 68, 97]</td>\n",
|
||||
" <td>41</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>9</td>\n",
|
||||
" <td>Student_9</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[92, 67, 76]</td>\n",
|
||||
" <td>98</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>Student_10</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[58, 96, 61]</td>\n",
|
||||
" <td>105</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>11</td>\n",
|
||||
" <td>Student_11</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[77, 77, 57]</td>\n",
|
||||
" <td>65</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>12</td>\n",
|
||||
" <td>Student_12</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>53</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>Student_13</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[85, 53, 71]</td>\n",
|
||||
" <td>74</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>14</td>\n",
|
||||
" <td>Student_14</td>\n",
|
||||
" <td>20.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[92, 53, 56]</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>15</td>\n",
|
||||
" <td>Student_15</td>\n",
|
||||
" <td>20.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[65, 81, 72]</td>\n",
|
||||
" <td>63</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>16</td>\n",
|
||||
" <td>Student_16</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[50, 61, 80]</td>\n",
|
||||
" <td>52</td>\n",
|
||||
" <td>Z</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>Student_17</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[83, 99, 64]</td>\n",
|
||||
" <td>88</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>18</td>\n",
|
||||
" <td>Student_18</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[76, 72, 96]</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>19</td>\n",
|
||||
" <td>Student_19</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[87, 56, 80]</td>\n",
|
||||
" <td>79</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>20</td>\n",
|
||||
" <td>Student_20</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>61</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"0 1 Student_1 18.0 Female [64, 54, 72] 55 B\n",
|
||||
"1 2 Student_2 18.0 Male [93, 69, 82] 23 C\n",
|
||||
"2 3 Student_3 21.0 Female [87, 90, 80] 84 F\n",
|
||||
"3 4 Student_4 23.0 Female [94, 93, 85] 66 C\n",
|
||||
"4 5 Student_5 19.0 Male [88, 77, 78] 32 F\n",
|
||||
"5 6 Student_6 24.0 Male [81, 90, 65] 96 D\n",
|
||||
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
|
||||
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
|
||||
"8 9 Student_9 NaN Male [92, 67, 76] 98 C\n",
|
||||
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
|
||||
"10 11 Student_11 24.0 Female [77, 77, 57] 65 D\n",
|
||||
"11 12 Student_12 23.0 Male None 53 A\n",
|
||||
"12 13 Student_13 23.0 Male [85, 53, 71] 74 C\n",
|
||||
"13 14 Student_14 20.0 Female [92, 53, 56] 70 A\n",
|
||||
"14 15 Student_15 20.0 Male [65, 81, 72] 63 D\n",
|
||||
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n",
|
||||
"16 17 Student_17 24.0 Female [83, 99, 64] 88 C\n",
|
||||
"17 18 Student_18 21.0 Female [76, 72, 96] 70 D\n",
|
||||
"18 19 Student_19 22.0 Male [87, 56, 80] 79 B\n",
|
||||
"19 20 Student_20 21.0 Male None 61 C"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Introduce missing + invalid values and inconsistencies\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"df.loc[8, 'Age'] = np.nan\n",
|
||||
"df.loc[29, 'Age'] = np.nan\n",
|
||||
"df.loc[35, 'Age'] = np.nan\n",
|
||||
"df.loc[11, 'Scores'] = None\n",
|
||||
"df.loc[19, 'Scores'] = None\n",
|
||||
"df.loc[9, 'Attendance'] = 105 # invalid percentage\n",
|
||||
"df.loc[15, 'Grade'] = 'Z' # invalid grade\n",
|
||||
"df.head(20) # Print first 20 rows"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "629f27c9-bca3-404f-8c8f-fae7a3882db8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Missing values:\n",
|
||||
" Student_id 0\n",
|
||||
"Name 0\n",
|
||||
"Age 3\n",
|
||||
"Gender 0\n",
|
||||
"Scores 2\n",
|
||||
"Attendance 0\n",
|
||||
"Grade 0\n",
|
||||
"dtype: int64\n",
|
||||
"Invalid attendance:\n",
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
|
||||
"Invalid grades:\n",
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Locating & printing missing/invalid values\n",
|
||||
"missing_values = df.isnull().sum() #check missing values\n",
|
||||
"invalid_attendance = df[(df['Attendance'] < 0) | (df['Attendance'] > 100)]\n",
|
||||
"invalid_grades = df[~df['Grade'].isin(['A', 'B', 'C', 'D', 'F'])]\n",
|
||||
"\n",
|
||||
"print(\"Missing values:\\n\", missing_values)\n",
|
||||
"print(\"Invalid attendance:\\n\", invalid_attendance)\n",
|
||||
"print(\"Invalid grades:\\n\", invalid_grades)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "4e46dfbe-693b-4f3b-a9a3-a0d243cd5214",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Student_id</th>\n",
|
||||
" <th>Name</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>Gender</th>\n",
|
||||
" <th>Scores</th>\n",
|
||||
" <th>Attendance</th>\n",
|
||||
" <th>Grade</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Student_1</td>\n",
|
||||
" <td>18.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[64, 54, 72]</td>\n",
|
||||
" <td>55</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>Student_2</td>\n",
|
||||
" <td>18.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[93, 69, 82]</td>\n",
|
||||
" <td>23</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Student_3</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[87, 90, 80]</td>\n",
|
||||
" <td>84</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>Student_4</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[94, 93, 85]</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>Student_5</td>\n",
|
||||
" <td>19.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[88, 77, 78]</td>\n",
|
||||
" <td>32</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>Student_6</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[81, 90, 65]</td>\n",
|
||||
" <td>96</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>7</td>\n",
|
||||
" <td>Student_7</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[55, 97, 54]</td>\n",
|
||||
" <td>73</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>Student_8</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[54, 68, 97]</td>\n",
|
||||
" <td>41</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>9</td>\n",
|
||||
" <td>Student_9</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[92, 67, 76]</td>\n",
|
||||
" <td>98</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>Student_10</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[58, 96, 61]</td>\n",
|
||||
" <td>100</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>11</td>\n",
|
||||
" <td>Student_11</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[77, 77, 57]</td>\n",
|
||||
" <td>65</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>12</td>\n",
|
||||
" <td>Student_12</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[0, 0, 0]</td>\n",
|
||||
" <td>53</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>Student_13</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[85, 53, 71]</td>\n",
|
||||
" <td>74</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>14</td>\n",
|
||||
" <td>Student_14</td>\n",
|
||||
" <td>20.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[92, 53, 56]</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>15</td>\n",
|
||||
" <td>Student_15</td>\n",
|
||||
" <td>20.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[65, 81, 72]</td>\n",
|
||||
" <td>63</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>16</td>\n",
|
||||
" <td>Student_16</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[50, 61, 80]</td>\n",
|
||||
" <td>52</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>Student_17</td>\n",
|
||||
" <td>24.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[83, 99, 64]</td>\n",
|
||||
" <td>88</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>18</td>\n",
|
||||
" <td>Student_18</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>[76, 72, 96]</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>19</td>\n",
|
||||
" <td>Student_19</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[87, 56, 80]</td>\n",
|
||||
" <td>79</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>20</td>\n",
|
||||
" <td>Student_20</td>\n",
|
||||
" <td>21.0</td>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>[0, 0, 0]</td>\n",
|
||||
" <td>61</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"0 1 Student_1 18.0 Female [64, 54, 72] 55 D\n",
|
||||
"1 2 Student_2 18.0 Male [93, 69, 82] 23 B\n",
|
||||
"2 3 Student_3 21.0 Female [87, 90, 80] 84 B\n",
|
||||
"3 4 Student_4 23.0 Female [94, 93, 85] 66 A\n",
|
||||
"4 5 Student_5 19.0 Male [88, 77, 78] 32 B\n",
|
||||
"5 6 Student_6 24.0 Male [81, 90, 65] 96 C\n",
|
||||
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
|
||||
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
|
||||
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
|
||||
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
|
||||
"10 11 Student_11 24.0 Female [77, 77, 57] 65 C\n",
|
||||
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
|
||||
"12 13 Student_13 23.0 Male [85, 53, 71] 74 D\n",
|
||||
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
|
||||
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
|
||||
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
|
||||
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
|
||||
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
|
||||
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
|
||||
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Handling missing/invalid values\n",
|
||||
"df['Age'] = df['Age'].fillna(df['Age'].median()) #fill by median\n",
|
||||
"df['Attendance'] = df['Attendance'].apply(lambda x: 100 if x > 100 else (0 if x < 0 else x))\n",
|
||||
"\n",
|
||||
"def handle_invalid_scores(scores):\n",
|
||||
" if scores is None:\n",
|
||||
" return [0, 0, 0]\n",
|
||||
"\n",
|
||||
" return [max(0, min(100, score)) for score in scores]\n",
|
||||
"\n",
|
||||
"df['Scores'] = df['Scores'].apply(handle_invalid_scores)\n",
|
||||
"df['Grade'] = df['Scores'].apply(assign_grade)\n",
|
||||
"df['Grade'] = df['Grade'].apply(lambda x: x if x in ['A', 'B', 'C', 'D', 'F'] else 'F')\n",
|
||||
"df.head(20) # Print first 20 rows"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "f7cfc3bb-91c4-4fa8-b723-ccd786cd8626",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"DataFrame with Outliers:\n",
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"5 6 Student_6 65.0 Male [81, 90, 65] 96 C\n",
|
||||
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
|
||||
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
|
||||
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
|
||||
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
|
||||
"10 11 Student_11 24.0 Female [77, 77, 57] 200 C\n",
|
||||
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
|
||||
"12 13 Student_13 23.0 Male [85, 53, 71] 166 D\n",
|
||||
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
|
||||
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
|
||||
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
|
||||
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
|
||||
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
|
||||
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
|
||||
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Adding outliers\n",
|
||||
"df.loc[5, 'Age'] = 35\n",
|
||||
"df.loc[5, 'Age'] = 50\n",
|
||||
"df.loc[5, 'Age'] = 65\n",
|
||||
"df.loc[10, 'Attendance'] = 200\n",
|
||||
"df.loc[12, 'Attendance'] = 175\n",
|
||||
"df.loc[12, 'Attendance'] = 166\n",
|
||||
"\n",
|
||||
"print(\"DataFrame with Outliers:\")\n",
|
||||
"print(df.iloc[5:20])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "92e0593b-99df-4e08-a8da-ee923936cb91",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Student_id Name Age Gender Scores Attendance Grade\n",
|
||||
"5 6 Student_6 26.875 Male [81, 90, 65] 96.00 C\n",
|
||||
"6 7 Student_7 22.000 Female [55, 97, 54] 73.00 D\n",
|
||||
"7 8 Student_8 24.000 Male [54, 68, 97] 41.00 C\n",
|
||||
"8 9 Student_9 21.000 Male [92, 67, 76] 98.00 C\n",
|
||||
"9 10 Student_10 24.000 Female [58, 96, 61] 100.00 C\n",
|
||||
"10 11 Student_11 24.000 Female [77, 77, 57] 142.25 C\n",
|
||||
"11 12 Student_12 23.000 Male [0, 0, 0] 53.00 F\n",
|
||||
"12 13 Student_13 23.000 Male [85, 53, 71] 142.25 D\n",
|
||||
"13 14 Student_14 20.000 Female [92, 53, 56] 70.00 D\n",
|
||||
"14 15 Student_15 20.000 Male [65, 81, 72] 63.00 C\n",
|
||||
"15 16 Student_16 22.000 Male [50, 61, 80] 52.00 D\n",
|
||||
"16 17 Student_17 24.000 Female [83, 99, 64] 88.00 B\n",
|
||||
"17 18 Student_18 21.000 Female [76, 72, 96] 70.00 B\n",
|
||||
"18 19 Student_19 22.000 Male [87, 56, 80] 79.00 C\n",
|
||||
"19 20 Student_20 21.000 Male [0, 0, 0] 61.00 F\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Handling outliers\n",
|
||||
"def handle_outliers_iqr(df, column):\n",
|
||||
" Q1 = df[column].quantile(0.25)\n",
|
||||
" Q3 = df[column].quantile(0.75)\n",
|
||||
"\n",
|
||||
" IQR = Q3 - Q1\n",
|
||||
"\n",
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
"\n",
|
||||
" df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))\n",
|
||||
"\n",
|
||||
"handle_outliers_iqr(df, 'Age')\n",
|
||||
"handle_outliers_iqr(df, 'Attendance')\n",
|
||||
"\n",
|
||||
"print(df.iloc[5:20])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "9866733a-c095-402d-b6b0-0fd88ef31169",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"DataFrame with Min-Max Scaling on 'Attendance':\n",
|
||||
" Attendance Scaled_Attendance\n",
|
||||
"0 55.00 0.286299\n",
|
||||
"1 23.00 0.024540\n",
|
||||
"2 84.00 0.523517\n",
|
||||
"3 66.00 0.376278\n",
|
||||
"4 32.00 0.098160\n",
|
||||
"5 96.00 0.621677\n",
|
||||
"6 73.00 0.433538\n",
|
||||
"7 41.00 0.171779\n",
|
||||
"8 98.00 0.638037\n",
|
||||
"9 100.00 0.654397\n",
|
||||
"10 142.25 1.000000\n",
|
||||
"11 53.00 0.269939\n",
|
||||
"12 142.25 1.000000\n",
|
||||
"13 70.00 0.408998\n",
|
||||
"14 63.00 0.351738\n",
|
||||
"15 52.00 0.261759\n",
|
||||
"16 88.00 0.556237\n",
|
||||
"17 70.00 0.408998\n",
|
||||
"18 79.00 0.482618\n",
|
||||
"19 61.00 0.335378\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Data transformation using min-max scaling\n",
|
||||
"df['Scaled_Attendance'] = (df['Attendance'] - df['Attendance'].min()) / (df['Attendance'].max() - df['Attendance'].min())\n",
|
||||
"\n",
|
||||
"print(\"DataFrame with Min-Max Scaling on 'Attendance':\")\n",
|
||||
"print(df[['Attendance', 'Scaled_Attendance']].head(20))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a4b032f-18de-48a0-82cd-3afd9f4426e8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.20"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,379 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a46c141c-0657-4870-81d2-38d814c05877",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook-A3 (Descriptive Statistics)\n",
|
||||
"\n",
|
||||
"- Measures of Central Tendency and variability\n",
|
||||
"- Dataset generated in this code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f7cf5345-dd87-41be-adb6-6fcec33f9255",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Problem Statement - Part 1 (data.csv)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "997254c8-a0c4-4cc6-a360-9a9534e89f4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import library\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "3326aae8-b9c4-448a-83f6-6060f84c7749",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate data\n",
|
||||
"data = {\n",
|
||||
" 'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],\n",
|
||||
" 'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],\n",
|
||||
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Define data in DataFrame\n",
|
||||
"df = pd.DataFrame(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "31afc8b3-a7ef-492e-bebc-4a079cc6b402",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" count mean std min 25% 50% \\\n",
|
||||
"age_group \n",
|
||||
"20-30 3.0 52333.333333 8736.894948 45000.0 47500.0 50000.0 \n",
|
||||
"30-40 2.0 62500.000000 3535.533906 60000.0 61250.0 62500.0 \n",
|
||||
"40-50 2.0 72500.000000 3535.533906 70000.0 71250.0 72500.0 \n",
|
||||
"50-60 3.0 85000.000000 5000.000000 80000.0 82500.0 85000.0 \n",
|
||||
"\n",
|
||||
" 75% max \n",
|
||||
"age_group \n",
|
||||
"20-30 56000.0 62000.0 \n",
|
||||
"30-40 63750.0 65000.0 \n",
|
||||
"40-50 73750.0 75000.0 \n",
|
||||
"50-60 87500.0 90000.0 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by age_group and compute summary statistics for 'income'\n",
|
||||
"summary_stats = df.groupby('age_group')['income'].describe()\n",
|
||||
"\n",
|
||||
"# Print summary\n",
|
||||
"print(summary_stats)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "7bf97efc-1450-4289-a31f-15a7a9629743",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Median Income by Age Group:\n",
|
||||
"age_group\n",
|
||||
"20-30 50000.0\n",
|
||||
"30-40 62500.0\n",
|
||||
"40-50 72500.0\n",
|
||||
"50-60 85000.0\n",
|
||||
"Name: income, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by age_group; Select income column for each of the groups created; Calculate median for income\n",
|
||||
"median_income = df.groupby('age_group')['income'].median()\n",
|
||||
"\n",
|
||||
"# Print the median\n",
|
||||
"print(\"Median Income by Age Group:\")\n",
|
||||
"print(median_income)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "3e69e1ef-9bdb-4ca9-8408-aa3cd09bcb64",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Column Names: Index(['age', 'income', 'age_group'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print column names\n",
|
||||
"print(\"Column Names:\", df.columns)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "86d9558e-7e50-4982-914a-9f409b67fa19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Modified dataset with repeated values\n",
|
||||
"data = {\n",
|
||||
" 'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],\n",
|
||||
" 'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],\n",
|
||||
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Define data in DataFrame\n",
|
||||
"df = pd.DataFrame(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "87006581-7ccd-4c4e-9db0-49448f651aea",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Mode of Age: [25]\n",
|
||||
"Mode of Income: [50000]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Calculate the mode for each column\n",
|
||||
"mode_age = df['age'].mode()\n",
|
||||
"mode_income = df['income'].mode()\n",
|
||||
"print(f\"Mode of Age: {mode_age.values}\")\n",
|
||||
"print(f\"Mode of Income: {mode_income.values}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43a96895-7a33-40d9-bc8e-e0a3c7f140f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7db076ac-079b-478a-b41c-c972ba2ca0b4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Problem Statement - Part 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "5cd29330-9fcc-4023-a7c0-d791a19172eb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||||
"0 5.1 3.5 1.4 0.2 Setosa\n",
|
||||
"1 4.9 3.0 1.4 0.2 Setosa\n",
|
||||
"2 4.7 3.2 1.3 0.2 Setosa\n",
|
||||
"3 4.6 3.1 1.5 0.2 Setosa\n",
|
||||
"4 5.0 3.6 1.4 0.2 Setosa\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load iris.csv in the DataFrame\n",
|
||||
"df = pd.read_csv('iris.csv')\n",
|
||||
"\n",
|
||||
"print(df.head()) # Print first 5 rows"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "e6f74058-9c12-4f09-a376-29583480ac91",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Percentiles by Species:\n",
|
||||
" sepal.length sepal.width petal.length petal.width\n",
|
||||
"variety \n",
|
||||
"Setosa 0.25 4.800 3.200 1.400 0.2\n",
|
||||
" 0.50 5.000 3.400 1.500 0.2\n",
|
||||
" 0.75 5.200 3.675 1.575 0.3\n",
|
||||
"Versicolor 0.25 5.600 2.525 4.000 1.2\n",
|
||||
" 0.50 5.900 2.800 4.350 1.3\n",
|
||||
" 0.75 6.300 3.000 4.600 1.5\n",
|
||||
"Virginica 0.25 6.225 2.800 5.100 1.8\n",
|
||||
" 0.50 6.500 3.000 5.550 2.0\n",
|
||||
" 0.75 6.900 3.175 5.875 2.3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by species and display summary statistics\n",
|
||||
"summary_stats_species = df.groupby('variety').describe()\n",
|
||||
"\n",
|
||||
"# Compute specific percentiles and statistics\n",
|
||||
"percentiles = df.groupby('variety').quantile([0.25, 0.5, 0.75])\n",
|
||||
"\n",
|
||||
"# Display summary statistics and percentiles\n",
|
||||
"summary_stats_species = df.groupby('variety').describe()\n",
|
||||
"\n",
|
||||
"print(\"\\nPercentiles by Species:\")\n",
|
||||
"print(percentiles)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "2116e46c-a150-4df9-82b9-a4684007ba59",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Summary Statistics by Species for Sepal Width:\n",
|
||||
" count mean std min 25% 50% 75% max\n",
|
||||
"variety \n",
|
||||
"Setosa 50.0 3.428 0.379064 2.3 3.200 3.4 3.675 4.4\n",
|
||||
"Versicolor 50.0 2.770 0.313798 2.0 2.525 2.8 3.000 3.4\n",
|
||||
"Virginica 50.0 2.974 0.322497 2.2 2.800 3.0 3.175 3.8\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics\n",
|
||||
"summary_stats_species = df.groupby('variety')['sepal.width'].describe()\n",
|
||||
"\n",
|
||||
"print(\"\\nSummary Statistics by Species for Sepal Width:\")\n",
|
||||
"print(summary_stats_species)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "47148444-1790-46a9-93ce-dba152e58894",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Median Values by Species:\n",
|
||||
" sepal.length sepal.width petal.length petal.width\n",
|
||||
"variety \n",
|
||||
"Setosa 5.0 3.4 1.50 0.2\n",
|
||||
"Versicolor 5.9 2.8 4.35 1.3\n",
|
||||
"Virginica 6.5 3.0 5.55 2.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group by variety and compute the median for numeric columns\n",
|
||||
"median_values = df.groupby('variety').median()\n",
|
||||
"\n",
|
||||
"print(\"Median Values by Species:\")\n",
|
||||
"print(median_values)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "3adfcaa8-d3e2-483d-9715-4d1a95cd2da9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Median Sepal Length by Species:\n",
|
||||
"variety\n",
|
||||
"Setosa 5.0\n",
|
||||
"Versicolor 5.9\n",
|
||||
"Virginica 6.5\n",
|
||||
"Name: sepal.length, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Group the data by variety; Select sepal.length column for each of the groups created; Display median\n",
|
||||
"median_sepal_length = df.groupby('variety')['sepal.length'].median()\n",
|
||||
"print(\"Median Sepal Length by Species:\")\n",
|
||||
"print(median_sepal_length)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "4a27f1b8-91d0-43a6-8593-983ddf5f9c58",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Mode of Width: [3.]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Calculate & print mode for sepal.width\n",
|
||||
"mode_width = df['sepal.width'].mode()\n",
|
||||
"print(f\"Mode of Width: {mode_width.values}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.20"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
+300
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,308 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6ed65106-23d2-4261-bf81-2a5b4b5ec60e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook-A7 (Text Analytics)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "127a35f5-2434-4f6a-8904-edb4fb4f6f29",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import libraries\n",
|
||||
"import nltk\n",
|
||||
"from nltk.tokenize import *\n",
|
||||
"from nltk.corpus import *\n",
|
||||
"from nltk.stem import *\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "555c6d33-75bb-4033-a9fa-60e145527464",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download resources\n",
|
||||
"nltk.download('all') # WARNING: ABOUT 2GBs\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"OR YOU COULD DOWNLOAD ONLY SPECIFIC RESOURCES\n",
|
||||
"nltk.download('punkt') # For splitting text into sentences or words\n",
|
||||
"nltk.download('stopwords') # Common stop words\n",
|
||||
"nltk.download('wordnet') # Synonyms\n",
|
||||
"nltk.download('averaged_perceptron_tagger') # part-of-speech (POS) tagger\n",
|
||||
"nltk.download('punkt_tab') # For tokenizing text that is formatted in tabular form\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "ba67d90c-5711-496a-bf81-a5aef68a01bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Write text to perform preprocessing on\n",
|
||||
"text = \"Hello everyone! I am first name last name. I am a loyal KSKA Git user all the way from Sangamwadi Empire. I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain. For every smart contract, I lose one strand of my hair. In my free time, which by the way, I barely get, I like to swim.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "fa8d4d18-ba91-4ced-9522-849be18aba6a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Hello everyone!', 'I am first name last name.', 'I am a loyal KSKA Git user all the way from Sangamwadi Empire.', 'I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain.', 'For every smart contract, I lose one strand of my hair.', 'In my free time, which by the way, I barely get, I like to swim.']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Sentence tokenization\n",
|
||||
"var1 = sent_tokenize(text)\n",
|
||||
"print(var1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "a53cc954-e60f-41b8-8e15-09fdc5b80328",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Hello', 'everyone', '!', 'I', 'am', 'first', 'name', 'last', 'name', '.', 'I', 'am', 'a', 'loyal', 'KSKA', 'Git', 'user', 'all', 'the', 'way', 'from', 'Sangamwadi', 'Empire', '.', 'I', 'have', 'considerable', 'knowledge', 'about', 'life', ',', 'Python', ',', 'C++', ',', 'Java', ',', 'Rust', ',', 'Golang', 'and', 'Blockchain', '.', 'For', 'every', 'smart', 'contract', ',', 'I', 'lose', 'one', 'strand', 'of', 'my', 'hair', '.', 'In', 'my', 'free', 'time', ',', 'which', 'by', 'the', 'way', ',', 'I', 'barely', 'get', ',', 'I', 'like', 'to', 'swim', '.']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Word tokenization\n",
|
||||
"var2 = word_tokenize(text)\n",
|
||||
"print(var2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "f259c7c3-9e94-42cb-bb94-a81176dc3126",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"After removing punctuation from text:\n",
|
||||
" Hello everyone I am first name last name I am a loyal KSKA Git user all the way from Sangamwadi Empire I have considerable knowledge about life Python C Java Rust Golang and Blockchain For every smart contract I lose one strand of my hair In my free time which by the way I barely get I like to swim \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Removing punctuation\n",
|
||||
"text = re.sub('[^a-zA-Z]',' ',text)\n",
|
||||
"print(\"After removing punctuation from text:\\n\", text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "092b3e63-3161-4b23-be8d-ff83a829205f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Stop words:\n",
|
||||
" {'do', 'most', 'more', 'am', 'aren', 'other', \"shouldn't\", 's', 'now', 'again', 'here', 'off', \"we're\", 'during', 'haven', 'above', \"we've\", 'our', \"he'll\", 'whom', 'ain', 'is', \"she'll\", 'once', \"that'll\", \"needn't\", 'shan', 'weren', 'been', 'doing', 'wasn', 'needn', 'any', 'not', \"aren't\", \"won't\", 'myself', 'couldn', 'by', 'were', 'no', \"he's\", \"shan't\", 'very', \"i'd\", 'y', 'm', 'your', 'against', 'are', 'she', \"hasn't\", \"she'd\", \"you'll\", 'because', 'mightn', 'their', \"they'd\", 'nor', 'having', 'into', 'so', \"it's\", 'don', 'who', \"haven't\", 'his', 'what', 'why', 'we', 'i', \"i'm\", 'hadn', 'over', 'and', 'her', 'to', 'ma', 'a', 'it', \"isn't\", 'under', 'o', 'until', 'an', 'same', 'them', 'did', \"they're\", 'ourselves', 'as', 'its', \"wasn't\", 'doesn', 'just', 'yourselves', 'll', 'down', 'itself', \"i've\", 'should', 'shouldn', \"mightn't\", 'on', 'these', 'or', 'only', 'd', 'hasn', 'about', 'wouldn', \"couldn't\", 're', 'mustn', 'with', \"you'd\", 'few', 'in', 'the', 'out', \"don't\", 'him', \"wouldn't\", 'can', 'through', 'from', 'those', 'for', 'didn', 'you', 'below', 'up', 'themselves', \"didn't\", 'too', 'being', 'of', 'further', 'some', \"we'd\", \"i'll\", \"it'll\", 'while', \"doesn't\", \"mustn't\", 'that', 've', 'if', 'be', 'yourself', 'he', \"hadn't\", 'how', 'than', 'was', 'will', 'before', 'my', 't', 'theirs', 'at', \"weren't\", \"should've\", 'won', \"you're\", 'own', 'isn', \"you've\", 'such', 'himself', \"she's\", 'all', 'me', 'but', \"they'll\", \"he'd\", 'after', \"they've\", 'then', 'this', 'both', 'hers', 'herself', 'ours', \"it'd\", 'which', 'where', \"we'll\", 'each', 'between', 'there', 'yours', 'had', 'have', 'has', 'when', 'does', 'they'}\n",
|
||||
"==============================================================\n",
|
||||
"Tokenized Sentence:\n",
|
||||
" ['hello', 'everyone', 'i', 'am', 'first', 'name', 'last', 'name', 'i', 'am', 'a', 'loyal', 'kska', 'git', 'user', 'all', 'the', 'way', 'from', 'sangamwadi', 'empire', 'i', 'have', 'considerable', 'knowledge', 'about', 'life', 'python', 'c', 'java', 'rust', 'golang', 'and', 'blockchain', 'for', 'every', 'smart', 'contract', 'i', 'lose', 'one', 'strand', 'of', 'my', 'hair', 'in', 'my', 'free', 'time', 'which', 'by', 'the', 'way', 'i', 'barely', 'get', 'i', 'like', 'to', 'swim']\n",
|
||||
"\n",
|
||||
"Filtered Sentence:\n",
|
||||
" ['hello', 'everyone', 'first', 'name', 'last', 'name', 'loyal', 'kska', 'git', 'user', 'way', 'sangamwadi', 'empire', 'considerable', 'knowledge', 'life', 'python', 'c', 'java', 'rust', 'golang', 'blockchain', 'every', 'smart', 'contract', 'lose', 'one', 'strand', 'hair', 'free', 'time', 'way', 'barely', 'get', 'like', 'swim']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Removing stop words\n",
|
||||
"var3 = set(stopwords.words('english'))\n",
|
||||
"print(\"Stop words:\\n\", var3)\n",
|
||||
"print(\"==============================================================\")\n",
|
||||
"tokens = word_tokenize(text.lower())\n",
|
||||
"filtered_text = []\n",
|
||||
"for word in tokens:\n",
|
||||
" if word not in var3:\n",
|
||||
" filtered_text.append(word)\n",
|
||||
"print(\"Tokenized Sentence:\\n\", tokens)\n",
|
||||
"print(\"\\nFiltered Sentence:\\n\", filtered_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "1b6d55c9-5724-4abb-bcb2-4fc5d27cbe12",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"write\n",
|
||||
"write\n",
|
||||
"wrote\n",
|
||||
"write\n",
|
||||
"read\n",
|
||||
"read\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Stemmatization\n",
|
||||
"var = [\"write\", \"writing\", \"wrote\", \"writes\",\"reading\",\"reads\"]\n",
|
||||
"ps = PorterStemmer() # brings word to its root form\n",
|
||||
"for w in var:\n",
|
||||
" root_word = ps.stem(w)\n",
|
||||
" print(root_word)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "6e8f62c1-d4ae-48a8-8d3e-a86366ed7972",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Text is:\t ['studies', 'studying', 'cries', 'cry']\n",
|
||||
"Lemma for studies is study\n",
|
||||
"Lemma for studying is studying\n",
|
||||
"Lemma for cries is cry\n",
|
||||
"Lemma for cry is cry\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Lemmatization\n",
|
||||
"wordnet_lemmatizer = WordNetLemmatizer()\n",
|
||||
"text = \"studies studying cries cry\"\n",
|
||||
"tt = nltk.word_tokenize(text)\n",
|
||||
"print(\"Text is:\\t\", tt)\n",
|
||||
"for w in tt:\n",
|
||||
" print(\"Lemma for {} is {}\".format(w, wordnet_lemmatizer.lemmatize(w)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "c2ff8017-d03a-412f-b1f8-e2fb0b70bfca",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('Hello', 'NNP'),\n",
|
||||
" ('everyone', 'NN'),\n",
|
||||
" ('this', 'DT'),\n",
|
||||
" ('is', 'VBZ'),\n",
|
||||
" ('a', 'DT'),\n",
|
||||
" ('sample', 'JJ'),\n",
|
||||
" ('text', 'NN'),\n",
|
||||
" ('!', '.'),\n",
|
||||
" ('Earth', 'NN'),\n",
|
||||
" ('.', '.')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# POS Tagging\n",
|
||||
"text = \"Hello everyone this is a sample text! Earth.\"\n",
|
||||
"text = nltk.word_tokenize(text)\n",
|
||||
"nltk.pos_tag(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "1d71007a-a8cb-45ab-af27-ec69b6826ddd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"TF-IDF\n",
|
||||
"['an' 'example' 'frequency' 'is' 'meow' 'of' 'term' 'this'] [[0.1767767 0.1767767 0.1767767 0.1767767 0.88388348 0.1767767\n",
|
||||
" 0.1767767 0.1767767 ]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# TF-IDF (Term Frequency & Inverse Document Frequency)\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"new_sentence = \"This is an example of term frequency. Meow meow meow meow meow!\"\n",
|
||||
"\n",
|
||||
"def calculate_tfIdf(document):\n",
|
||||
" tokenizer = TfidfVectorizer()\n",
|
||||
" tf_matrix = tokenizer.fit_transform(document)\n",
|
||||
" features_names = tokenizer.get_feature_names_out()\n",
|
||||
" return tf_matrix, features_names\n",
|
||||
"\n",
|
||||
"# Wrap the new_sentence in a list\n",
|
||||
"document = [new_sentence]\n",
|
||||
"tf_matrix, feature_names = calculate_tfIdf(document)\n",
|
||||
"\n",
|
||||
"print('TF-IDF')\n",
|
||||
"print(feature_names, tf_matrix.toarray())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b34b21ba-46d6-4bad-b001-aa9962cc17b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.20"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user