Added codes, datasets and Jupyter notebooks directory.

This commit is contained in:
K
2025-06-11 13:48:53 +05:30
parent b3a22e9b79
commit 76dc1de8db
32 changed files with 8930 additions and 0 deletions
@@ -0,0 +1,571 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5ed97617-c927-48b4-857c-c4d75b98fe2c",
"metadata": {},
"source": [
"# Notebook-A1 (Data Wrangling-1)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a93305d3-03c9-479e-bde3-fb0d06ea8d39",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8bf62160-0384-4a3f-b25e-11e28c8b8df4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal.length</th>\n",
" <th>sepal.width</th>\n",
" <th>petal.length</th>\n",
" <th>petal.width</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>150.000000</td>\n",
" <td>150.000000</td>\n",
" <td>150.000000</td>\n",
" <td>150.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>5.843333</td>\n",
" <td>3.057333</td>\n",
" <td>3.758000</td>\n",
" <td>1.199333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.828066</td>\n",
" <td>0.435866</td>\n",
" <td>1.765298</td>\n",
" <td>0.762238</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>4.300000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.100000</td>\n",
" <td>2.800000</td>\n",
" <td>1.600000</td>\n",
" <td>0.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>5.800000</td>\n",
" <td>3.000000</td>\n",
" <td>4.350000</td>\n",
" <td>1.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>6.400000</td>\n",
" <td>3.300000</td>\n",
" <td>5.100000</td>\n",
" <td>1.800000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>7.900000</td>\n",
" <td>4.400000</td>\n",
" <td>6.900000</td>\n",
" <td>2.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sepal.length sepal.width petal.length petal.width\n",
"count 150.000000 150.000000 150.000000 150.000000\n",
"mean 5.843333 3.057333 3.758000 1.199333\n",
"std 0.828066 0.435866 1.765298 0.762238\n",
"min 4.300000 2.000000 1.000000 0.100000\n",
"25% 5.100000 2.800000 1.600000 0.300000\n",
"50% 5.800000 3.000000 4.350000 1.300000\n",
"75% 6.400000 3.300000 5.100000 1.800000\n",
"max 7.900000 4.400000 6.900000 2.500000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the dataset into DataFrame\n",
"df=pd.read_csv('iris.csv')\n",
"df.describe() # Print description of DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7c813f91-5100-463a-a848-2ef9f46344bc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 values:\n",
" sepal.length sepal.width petal.length petal.width variety\n",
"0 5.1 3.5 1.4 0.2 Setosa\n",
"1 4.9 3.0 1.4 0.2 Setosa\n",
"2 4.7 3.2 1.3 0.2 Setosa\n",
"3 4.6 3.1 1.5 0.2 Setosa\n",
"4 5.0 3.6 1.4 0.2 Setosa\n",
"Last 5 values:\n",
" sepal.length sepal.width petal.length petal.width variety\n",
"145 6.7 3.0 5.2 2.3 Virginica\n",
"146 6.3 2.5 5.0 1.9 Virginica\n",
"147 6.5 3.0 5.2 2.0 Virginica\n",
"148 6.2 3.4 5.4 2.3 Virginica\n",
"149 5.9 3.0 5.1 1.8 Virginica\n"
]
}
],
"source": [
"# Print the first and last 5 rows\n",
"print(\"First 5 values:\\n\", df.head())\n",
"print (\"Last 5 values:\\n\", df.tail())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "78fa6d44-7c39-4306-a5d7-0fd5ff94243a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"145 False\n",
"146 False\n",
"147 False\n",
"148 False\n",
"149 False\n",
"Length: 150, dtype: bool"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Flag duplicate rows (True where a row repeats an earlier row)\n",
"df.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5da58b1d-c458-4eb5-b23c-053c02934efd",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal.length</th>\n",
" <th>sepal.width</th>\n",
" <th>petal.length</th>\n",
" <th>petal.width</th>\n",
" <th>variety</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>150 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" sepal.length sepal.width petal.length petal.width variety\n",
"0 False False False False False\n",
"1 False False False False False\n",
"2 False False False False False\n",
"3 False False False False False\n",
"4 False False False False False\n",
".. ... ... ... ... ...\n",
"145 False False False False False\n",
"146 False False False False False\n",
"147 False False False False False\n",
"148 False False False False False\n",
"149 False False False False False\n",
"\n",
"[150 rows x 5 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show a boolean mask of null values (True where a value is missing)\n",
"df.isnull()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3b6face6-d366-4a05-9fd4-baff133e24f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 150 entries, 0 to 149\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sepal.length 150 non-null float64\n",
" 1 sepal.width 150 non-null float64\n",
" 2 petal.length 150 non-null float64\n",
" 3 petal.width 150 non-null float64\n",
" 4 variety 150 non-null object \n",
"dtypes: float64(4), object(1)\n",
"memory usage: 6.0+ KB\n"
]
}
],
"source": [
"# Print summary of DataFrame\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f08a00a4-c82a-49a9-bbaa-427d5cc4db96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150, 5)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the shape as (rows, columns)\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "186866fe-7614-4a4d-b929-31bd37f80027",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"145 False\n",
"146 False\n",
"147 False\n",
"148 False\n",
"149 False\n",
"Name: sepal.length, Length: 150, dtype: bool"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print null (true/false) values in sepal.length column\n",
"df[\"sepal.length\"].isnull()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5bc1e082-9818-450d-b76d-5dd3a2103e00",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sepal.length sepal.width petal.width variety\n",
"0 5.1 3.5 0.2 Setosa\n",
"1 4.9 3.0 0.2 Setosa\n",
"2 4.7 3.2 0.2 Setosa\n",
"3 4.6 3.1 0.2 Setosa\n",
"4 5.0 3.6 0.2 Setosa\n",
".. ... ... ... ...\n",
"145 6.7 3.0 2.3 Virginica\n",
"146 6.3 2.5 1.9 Virginica\n",
"147 6.5 3.0 2.0 Virginica\n",
"148 6.2 3.4 2.3 Virginica\n",
"149 5.9 3.0 1.8 Virginica\n",
"\n",
"[150 rows x 4 columns]\n"
]
}
],
"source": [
"# Delete/Drop petal.length column\n",
"y = df.drop([\"petal.length\"], axis=1) # axis=1 column. For row, axis=0\n",
"print(y)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "dcccff24-c97e-4832-9cc1-cd701a3a9a34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sepal.length sepal.width petal.length petal.width variety\n",
"0 5.1 3.5 1.4 0.2 0\n",
"1 4.9 3.0 1.4 0.2 0\n",
"2 4.7 3.2 1.3 0.2 0\n",
"3 4.6 3.1 1.5 0.2 0\n",
"4 5.0 3.6 1.4 0.2 0\n",
".. ... ... ... ... ...\n",
"145 6.7 3.0 5.2 2.3 1\n",
"146 6.3 2.5 5.0 1.9 1\n",
"147 6.5 3.0 5.2 2.0 1\n",
"148 6.2 3.4 5.4 2.3 1\n",
"149 5.9 3.0 5.1 1.8 1\n",
"\n",
"[150 rows x 5 columns]\n"
]
}
],
"source": [
"# In the variety column, replace Setosa with 0 and Virginica with 1.\n",
"# The result is assigned back to the column: calling replace(..., inplace=True)\n",
"# on a selected column acts on an intermediate object and is not guaranteed to\n",
"# update df under pandas Copy-on-Write.\n",
"# Any label other than these two is left unchanged.\n",
"df['variety'] = df['variety'].replace({'Setosa': 0, 'Virginica': 1})\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "97d22793-657e-4df7-a7f1-01d70886ef57",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sepal.length 0\n",
"sepal.width 0\n",
"petal.length 0\n",
"petal.width 0\n",
"variety 0\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print sum of NULL values in each column\n",
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"id": "e3f0c5c4-a930-4dbc-8300-99fb7cb7c991",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
@@ -0,0 +1,983 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6ce3d040-a50f-459c-b494-e172f2897780",
"metadata": {},
"source": [
"# Notebook-A2 (Data Wrangling-2)\n",
"\n",
"- “Academic performance” dataset\n",
"- Dataset generated here, not imported"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ce0d3ca6-fec0-4d3f-82b5-8ef92256525a",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import pandas as pd\n",
"# import pandas as shriniwas\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "24b47cb9-c955-4325-ad62-4b215d73398c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nIF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\\n\\ndata = {\\n 'Student_id': [1,2,3,4,5,6,7,8,9,10],\\n 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\\n 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\\n 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\\n 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\\n [58, 96, 61]],\\n 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\\n 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\\n}\\n\""
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generate data\n",
"np.random.seed(50) #for consistency\n",
"\n",
"data = {\n",
" 'Student_id': range(1, 51),\n",
" 'Name': ['Student_' + str(i) for i in range(1, 51)],\n",
" 'Age': np.random.randint(18, 25, size=50),\n",
" 'Gender': np.random.choice(['Male', 'Female'], size=50),\n",
" 'Scores': [np.random.randint(50, 100, size=3).tolist() for _ in range(50)],\n",
" 'Attendance': np.random.randint(20,100,size=50),\n",
" 'Grade': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=50)\n",
"}\n",
"\n",
"\"\"\"\n",
"IF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\n",
"\n",
"data = {\n",
" 'Student_id': [1,2,3,4,5,6,7,8,9,10],\n",
" 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\n",
" 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\n",
" 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\n",
" 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\n",
" [58, 96, 61]],\n",
" 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\n",
" 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "ba84d792-8cff-4a94-b936-6c6ae0bd8527",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Student_id</th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>Gender</th>\n",
" <th>Scores</th>\n",
" <th>Attendance</th>\n",
" <th>Grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Student_1</td>\n",
" <td>18</td>\n",
" <td>Female</td>\n",
" <td>[64, 54, 72]</td>\n",
" <td>55</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Student_2</td>\n",
" <td>18</td>\n",
" <td>Male</td>\n",
" <td>[93, 69, 82]</td>\n",
" <td>23</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Student_3</td>\n",
" <td>21</td>\n",
" <td>Female</td>\n",
" <td>[87, 90, 80]</td>\n",
" <td>84</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Student_4</td>\n",
" <td>23</td>\n",
" <td>Female</td>\n",
" <td>[94, 93, 85]</td>\n",
" <td>66</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Student_5</td>\n",
" <td>19</td>\n",
" <td>Male</td>\n",
" <td>[88, 77, 78]</td>\n",
" <td>32</td>\n",
" <td>F</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18 Female [64, 54, 72] 55 B\n",
"1 2 Student_2 18 Male [93, 69, 82] 23 C\n",
"2 3 Student_3 21 Female [87, 90, 80] 84 F\n",
"3 4 Student_4 23 Female [94, 93, 85] 66 C\n",
"4 5 Student_5 19 Male [88, 77, 78] 32 F"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a DataFrame from the generated data\n",
"df = pd.DataFrame(data)\n",
"df.head() # Print first 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "8d9d0a15-5a81-4b3d-90de-0efd631b0ec8",
"metadata": {},
"outputs": [],
"source": [
"# Assign grades\n",
"def assign_grade(scores):\n",
"    \"\"\"Return the letter grade ('A'-'D' or 'F') for the mean of `scores`.\"\"\"\n",
"    mean_score = np.mean(scores)\n",
"\n",
"    # Cutoffs are exclusive lower bounds, tested from highest to lowest\n",
"    for cutoff, letter in ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D')):\n",
"        if mean_score > cutoff:\n",
"            return letter\n",
"\n",
"    return 'F'\n",
"\n",
"df['Grade'] = df['Scores'].apply(assign_grade)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "b2480d57-75c9-48d3-94ec-5b3f0a459007",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Student_id</th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>Gender</th>\n",
" <th>Scores</th>\n",
" <th>Attendance</th>\n",
" <th>Grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Student_1</td>\n",
" <td>18.0</td>\n",
" <td>Female</td>\n",
" <td>[64, 54, 72]</td>\n",
" <td>55</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Student_2</td>\n",
" <td>18.0</td>\n",
" <td>Male</td>\n",
" <td>[93, 69, 82]</td>\n",
" <td>23</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Student_3</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[87, 90, 80]</td>\n",
" <td>84</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Student_4</td>\n",
" <td>23.0</td>\n",
" <td>Female</td>\n",
" <td>[94, 93, 85]</td>\n",
" <td>66</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Student_5</td>\n",
" <td>19.0</td>\n",
" <td>Male</td>\n",
" <td>[88, 77, 78]</td>\n",
" <td>32</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Student_6</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[81, 90, 65]</td>\n",
" <td>96</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Student_7</td>\n",
" <td>22.0</td>\n",
" <td>Female</td>\n",
" <td>[55, 97, 54]</td>\n",
" <td>73</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Student_8</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[54, 68, 97]</td>\n",
" <td>41</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Student_9</td>\n",
" <td>NaN</td>\n",
" <td>Male</td>\n",
" <td>[92, 67, 76]</td>\n",
" <td>98</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>Student_10</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[58, 96, 61]</td>\n",
" <td>105</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>Student_11</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[77, 77, 57]</td>\n",
" <td>65</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>Student_12</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>None</td>\n",
" <td>53</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13</td>\n",
" <td>Student_13</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>[85, 53, 71]</td>\n",
" <td>74</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>14</td>\n",
" <td>Student_14</td>\n",
" <td>20.0</td>\n",
" <td>Female</td>\n",
" <td>[92, 53, 56]</td>\n",
" <td>70</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>15</td>\n",
" <td>Student_15</td>\n",
" <td>20.0</td>\n",
" <td>Male</td>\n",
" <td>[65, 81, 72]</td>\n",
" <td>63</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>16</td>\n",
" <td>Student_16</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[50, 61, 80]</td>\n",
" <td>52</td>\n",
" <td>Z</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>Student_17</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[83, 99, 64]</td>\n",
" <td>88</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>18</td>\n",
" <td>Student_18</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[76, 72, 96]</td>\n",
" <td>70</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>19</td>\n",
" <td>Student_19</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[87, 56, 80]</td>\n",
" <td>79</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>20</td>\n",
" <td>Student_20</td>\n",
" <td>21.0</td>\n",
" <td>Male</td>\n",
" <td>None</td>\n",
" <td>61</td>\n",
" <td>C</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18.0 Female [64, 54, 72] 55 B\n",
"1 2 Student_2 18.0 Male [93, 69, 82] 23 C\n",
"2 3 Student_3 21.0 Female [87, 90, 80] 84 F\n",
"3 4 Student_4 23.0 Female [94, 93, 85] 66 C\n",
"4 5 Student_5 19.0 Male [88, 77, 78] 32 F\n",
"5 6 Student_6 24.0 Male [81, 90, 65] 96 D\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 NaN Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 65 D\n",
"11 12 Student_12 23.0 Male None 53 A\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 74 C\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 A\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 D\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 C\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 D\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 B\n",
"19 20 Student_20 21.0 Male None 61 C"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Introduce missing + invalid values and inconsistencies\n",
"df = pd.DataFrame(data)\n",
"df.loc[8, 'Age'] = np.nan\n",
"df.loc[29, 'Age'] = np.nan\n",
"df.loc[35, 'Age'] = np.nan\n",
"df.loc[11, 'Scores'] = None\n",
"df.loc[19, 'Scores'] = None\n",
"df.loc[9, 'Attendance'] = 105 # invalid percentage\n",
"df.loc[15, 'Grade'] = 'Z' # invalid grade\n",
"df.head(20) # Print first 20 rows"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "629f27c9-bca3-404f-8c8f-fae7a3882db8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing values:\n",
" Student_id 0\n",
"Name 0\n",
"Age 3\n",
"Gender 0\n",
"Scores 2\n",
"Attendance 0\n",
"Grade 0\n",
"dtype: int64\n",
"Invalid attendance:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
"Invalid grades:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n"
]
}
],
"source": [
"# Locate and print missing/invalid values\n",
"missing_values = df.isna().sum()  # number of missing entries per column\n",
"\n",
"# Attendance is checked against the 0-100 range; anything outside is invalid\n",
"attendance = df['Attendance']\n",
"invalid_attendance = df.loc[attendance.lt(0) | attendance.gt(100)]\n",
"\n",
"# Rows whose grade is not one of the five letter grades\n",
"has_valid_grade = df['Grade'].isin(['A', 'B', 'C', 'D', 'F'])\n",
"invalid_grades = df.loc[~has_valid_grade]\n",
"\n",
"print(\"Missing values:\\n\", missing_values)\n",
"print(\"Invalid attendance:\\n\", invalid_attendance)\n",
"print(\"Invalid grades:\\n\", invalid_grades)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "4e46dfbe-693b-4f3b-a9a3-a0d243cd5214",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Student_id</th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>Gender</th>\n",
" <th>Scores</th>\n",
" <th>Attendance</th>\n",
" <th>Grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Student_1</td>\n",
" <td>18.0</td>\n",
" <td>Female</td>\n",
" <td>[64, 54, 72]</td>\n",
" <td>55</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Student_2</td>\n",
" <td>18.0</td>\n",
" <td>Male</td>\n",
" <td>[93, 69, 82]</td>\n",
" <td>23</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Student_3</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[87, 90, 80]</td>\n",
" <td>84</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Student_4</td>\n",
" <td>23.0</td>\n",
" <td>Female</td>\n",
" <td>[94, 93, 85]</td>\n",
" <td>66</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Student_5</td>\n",
" <td>19.0</td>\n",
" <td>Male</td>\n",
" <td>[88, 77, 78]</td>\n",
" <td>32</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Student_6</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[81, 90, 65]</td>\n",
" <td>96</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Student_7</td>\n",
" <td>22.0</td>\n",
" <td>Female</td>\n",
" <td>[55, 97, 54]</td>\n",
" <td>73</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Student_8</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[54, 68, 97]</td>\n",
" <td>41</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Student_9</td>\n",
" <td>21.0</td>\n",
" <td>Male</td>\n",
" <td>[92, 67, 76]</td>\n",
" <td>98</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>Student_10</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[58, 96, 61]</td>\n",
" <td>100</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>Student_11</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[77, 77, 57]</td>\n",
" <td>65</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>Student_12</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>[0, 0, 0]</td>\n",
" <td>53</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13</td>\n",
" <td>Student_13</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>[85, 53, 71]</td>\n",
" <td>74</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>14</td>\n",
" <td>Student_14</td>\n",
" <td>20.0</td>\n",
" <td>Female</td>\n",
" <td>[92, 53, 56]</td>\n",
" <td>70</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>15</td>\n",
" <td>Student_15</td>\n",
" <td>20.0</td>\n",
" <td>Male</td>\n",
" <td>[65, 81, 72]</td>\n",
" <td>63</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>16</td>\n",
" <td>Student_16</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[50, 61, 80]</td>\n",
" <td>52</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>Student_17</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[83, 99, 64]</td>\n",
" <td>88</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>18</td>\n",
" <td>Student_18</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[76, 72, 96]</td>\n",
" <td>70</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>19</td>\n",
" <td>Student_19</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[87, 56, 80]</td>\n",
" <td>79</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>20</td>\n",
" <td>Student_20</td>\n",
" <td>21.0</td>\n",
" <td>Male</td>\n",
" <td>[0, 0, 0]</td>\n",
" <td>61</td>\n",
" <td>F</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18.0 Female [64, 54, 72] 55 D\n",
"1 2 Student_2 18.0 Male [93, 69, 82] 23 B\n",
"2 3 Student_3 21.0 Female [87, 90, 80] 84 B\n",
"3 4 Student_4 23.0 Female [94, 93, 85] 66 A\n",
"4 5 Student_5 19.0 Male [88, 77, 78] 32 B\n",
"5 6 Student_6 24.0 Male [81, 90, 65] 96 C\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 65 C\n",
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 74 D\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Handling missing/invalid values\n",
"df['Age'] = df['Age'].fillna(df['Age'].median())  # fill missing ages with the median\n",
"df['Attendance'] = df['Attendance'].clip(lower=0, upper=100)  # keep within 0-100\n",
"\n",
"def handle_invalid_scores(scores):\n",
"    \"\"\"Return `scores` with every value clamped to the range 0-100.\n",
"\n",
"    A missing entry (None or a float NaN) is replaced by [0, 0, 0].\n",
"    \"\"\"\n",
"    # NaN is a float and cannot be iterated, so treat it as missing too\n",
"    if scores is None or (isinstance(scores, float) and np.isnan(scores)):\n",
"        return [0, 0, 0]\n",
"\n",
"    return [max(0, min(100, score)) for score in scores]\n",
"\n",
"df['Scores'] = df['Scores'].apply(handle_invalid_scores)\n",
"df['Grade'] = df['Scores'].apply(assign_grade)  # recompute grades from the cleaned scores\n",
"# Safety net: any grade outside A-F becomes 'F'\n",
"df['Grade'] = df['Grade'].where(df['Grade'].isin(['A', 'B', 'C', 'D', 'F']), 'F')\n",
"df.head(20) # Print first 20 rows"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "f7cfc3bb-91c4-4fa8-b723-ccd786cd8626",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame with Outliers:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"5 6 Student_6 65.0 Male [81, 90, 65] 96 C\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 200 C\n",
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 166 D\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F\n"
]
}
],
"source": [
"# Adding outliers\n",
"# One outlier in Age (row 5) and two in Attendance (rows 10 and 12).\n",
"# Each cell is written exactly once, with the value that ends up in df.\n",
"df.loc[5, 'Age'] = 65\n",
"df.loc[10, 'Attendance'] = 200\n",
"df.loc[12, 'Attendance'] = 166\n",
"\n",
"print(\"DataFrame with Outliers:\")\n",
"print(df.iloc[5:20])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "92e0593b-99df-4e08-a8da-ee923936cb91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"5 6 Student_6 26.875 Male [81, 90, 65] 96.00 C\n",
"6 7 Student_7 22.000 Female [55, 97, 54] 73.00 D\n",
"7 8 Student_8 24.000 Male [54, 68, 97] 41.00 C\n",
"8 9 Student_9 21.000 Male [92, 67, 76] 98.00 C\n",
"9 10 Student_10 24.000 Female [58, 96, 61] 100.00 C\n",
"10 11 Student_11 24.000 Female [77, 77, 57] 142.25 C\n",
"11 12 Student_12 23.000 Male [0, 0, 0] 53.00 F\n",
"12 13 Student_13 23.000 Male [85, 53, 71] 142.25 D\n",
"13 14 Student_14 20.000 Female [92, 53, 56] 70.00 D\n",
"14 15 Student_15 20.000 Male [65, 81, 72] 63.00 C\n",
"15 16 Student_16 22.000 Male [50, 61, 80] 52.00 D\n",
"16 17 Student_17 24.000 Female [83, 99, 64] 88.00 B\n",
"17 18 Student_18 21.000 Female [76, 72, 96] 70.00 B\n",
"18 19 Student_19 22.000 Male [87, 56, 80] 79.00 C\n",
"19 20 Student_20 21.000 Male [0, 0, 0] 61.00 F\n"
]
}
],
"source": [
"# Handling outliers\n",
"def handle_outliers_iqr(df, column):\n",
"    \"\"\"Cap the values of `column` at the IQR fences, modifying `df` in place.\n",
"\n",
"    Values below Q1 - 1.5 * IQR are raised to that bound, values above\n",
"    Q3 + 1.5 * IQR are lowered to that bound, and all others are kept.\n",
"    Returns None.\n",
"    \"\"\"\n",
"    q1 = df[column].quantile(0.25)\n",
"    q3 = df[column].quantile(0.75)\n",
"    iqr = q3 - q1\n",
"\n",
"    lower_bound = q1 - 1.5 * iqr\n",
"    upper_bound = q3 + 1.5 * iqr\n",
"\n",
"    # Vectorised capping: one call instead of a Python-level test per value\n",
"    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)\n",
"\n",
"handle_outliers_iqr(df, 'Age')\n",
"# NOTE(review): the upper fence for Attendance can exceed 100 (142.25 in the\n",
"# recorded output), so capped values may still be invalid percentages.\n",
"handle_outliers_iqr(df, 'Attendance')\n",
"\n",
"print(df.iloc[5:20])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9866733a-c095-402d-b6b0-0fd88ef31169",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame with Min-Max Scaling on 'Attendance':\n",
" Attendance Scaled_Attendance\n",
"0 55.00 0.286299\n",
"1 23.00 0.024540\n",
"2 84.00 0.523517\n",
"3 66.00 0.376278\n",
"4 32.00 0.098160\n",
"5 96.00 0.621677\n",
"6 73.00 0.433538\n",
"7 41.00 0.171779\n",
"8 98.00 0.638037\n",
"9 100.00 0.654397\n",
"10 142.25 1.000000\n",
"11 53.00 0.269939\n",
"12 142.25 1.000000\n",
"13 70.00 0.408998\n",
"14 63.00 0.351738\n",
"15 52.00 0.261759\n",
"16 88.00 0.556237\n",
"17 70.00 0.408998\n",
"18 79.00 0.482618\n",
"19 61.00 0.335378\n"
]
}
],
"source": [
"# Data transformation using min-max scaling\n",
"# scaled = (value - min) / (max - min), with min and range computed once\n",
"attendance_min = df['Attendance'].min()\n",
"attendance_range = df['Attendance'].max() - attendance_min\n",
"df['Scaled_Attendance'] = (df['Attendance'] - attendance_min) / attendance_range\n",
"\n",
"print(\"DataFrame with Min-Max Scaling on 'Attendance':\")\n",
"print(df[['Attendance', 'Scaled_Attendance']].head(20))"
]
},
{
"cell_type": "markdown",
"id": "8a4b032f-18de-48a0-82cd-3afd9f4426e8",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -0,0 +1,379 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a46c141c-0657-4870-81d2-38d814c05877",
"metadata": {},
"source": [
"# Notebook-A3 (Descriptive Statistics)\n",
"\n",
"- Measures of Central Tendency and variability\n",
"- Dataset generated in this code"
]
},
{
"cell_type": "markdown",
"id": "f7cf5345-dd87-41be-adb6-6fcec33f9255",
"metadata": {},
"source": [
"## Problem Statement - Part 1 (data.csv)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "997254c8-a0c4-4cc6-a360-9a9534e89f4c",
"metadata": {},
"outputs": [],
"source": [
"# Import library\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3326aae8-b9c4-448a-83f6-6060f84c7749",
"metadata": {},
"outputs": [],
"source": [
"# Generate data\n",
"data = {\n",
" 'age': [25, 30, 22, 40, 55, 60, 33, 28, 45, 50],\n",
" 'income': [50000, 60000, 45000, 70000, 80000, 90000, 65000, 62000, 75000, 85000],\n",
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60']\n",
"}\n",
"\n",
"# Define data in DataFrame\n",
"df = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "31afc8b3-a7ef-492e-bebc-4a079cc6b402",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" count mean std min 25% 50% \\\n",
"age_group \n",
"20-30 3.0 52333.333333 8736.894948 45000.0 47500.0 50000.0 \n",
"30-40 2.0 62500.000000 3535.533906 60000.0 61250.0 62500.0 \n",
"40-50 2.0 72500.000000 3535.533906 70000.0 71250.0 72500.0 \n",
"50-60 3.0 85000.000000 5000.000000 80000.0 82500.0 85000.0 \n",
"\n",
" 75% max \n",
"age_group \n",
"20-30 56000.0 62000.0 \n",
"30-40 63750.0 65000.0 \n",
"40-50 73750.0 75000.0 \n",
"50-60 87500.0 90000.0 \n"
]
}
],
"source": [
"# Group the data by age_group and compute summary statistics for 'income'\n",
"summary_stats = df.groupby('age_group')['income'].describe()\n",
"\n",
"# Print summary\n",
"print(summary_stats)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7bf97efc-1450-4289-a31f-15a7a9629743",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median Income by Age Group:\n",
"age_group\n",
"20-30 50000.0\n",
"30-40 62500.0\n",
"40-50 72500.0\n",
"50-60 85000.0\n",
"Name: income, dtype: float64\n"
]
}
],
"source": [
"# Group the data by age_group; Select income column for each of the groups created; Calculate median for income\n",
"median_income = df.groupby('age_group')['income'].median()\n",
"\n",
"# Print dat median\n",
"print(\"Median Income by Age Group:\")\n",
"print(median_income)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3e69e1ef-9bdb-4ca9-8408-aa3cd09bcb64",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column Names: Index(['age', 'income', 'age_group'], dtype='object')\n"
]
}
],
"source": [
"# Print column names\n",
"print(\"Column Names:\", df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "86d9558e-7e50-4982-914a-9f409b67fa19",
"metadata": {},
"outputs": [],
"source": [
"# Modified dataset with repeated values\n",
"data = {\n",
" 'age': [25, 30, 25, 40, 55, 60, 33, 28, 45, 50, 25, 30, 28, 30, 25],\n",
" 'income': [50000, 60000, 50000, 70000, 80000, 90000, 65000, 62000, 75000, 85000, 50000, 60000, 62000, 70000, 75000],\n",
" 'age_group': ['20-30', '30-40', '20-30', '40-50', '50-60', '50-60', '30-40', '20-30', '40-50', '50-60', '20-30', '30-40', '20-30', '30-40', '20-30']\n",
"}\n",
"\n",
"# Define data in DataFrame\n",
"df = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "87006581-7ccd-4c4e-9db0-49448f651aea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mode of Age: [25]\n",
"Mode of Income: [50000]\n"
]
}
],
"source": [
"# Calculate the mode for each column\n",
"mode_age = df['age'].mode()\n",
"mode_income = df['income'].mode()\n",
"print(f\"Mode of Age: {mode_age.values}\")\n",
"print(f\"Mode of Income: {mode_income.values}\")"
]
},
{
"cell_type": "markdown",
"id": "43a96895-7a33-40d9-bc8e-e0a3c7f140f5",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"id": "7db076ac-079b-478a-b41c-c972ba2ca0b4",
"metadata": {},
"source": [
"## Problem Statment - Part 2"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5cd29330-9fcc-4023-a7c0-d791a19172eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sepal.length sepal.width petal.length petal.width variety\n",
"0 5.1 3.5 1.4 0.2 Setosa\n",
"1 4.9 3.0 1.4 0.2 Setosa\n",
"2 4.7 3.2 1.3 0.2 Setosa\n",
"3 4.6 3.1 1.5 0.2 Setosa\n",
"4 5.0 3.6 1.4 0.2 Setosa\n"
]
}
],
"source": [
"# Load iris.csv in the DataFrame\n",
"df = pd.read_csv('iris.csv')\n",
"\n",
"print(df.head()) # Print first 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e6f74058-9c12-4f09-a376-29583480ac91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Percentiles by Species:\n",
" sepal.length sepal.width petal.length petal.width\n",
"variety \n",
"Setosa 0.25 4.800 3.200 1.400 0.2\n",
" 0.50 5.000 3.400 1.500 0.2\n",
" 0.75 5.200 3.675 1.575 0.3\n",
"Versicolor 0.25 5.600 2.525 4.000 1.2\n",
" 0.50 5.900 2.800 4.350 1.3\n",
" 0.75 6.300 3.000 4.600 1.5\n",
"Virginica 0.25 6.225 2.800 5.100 1.8\n",
" 0.50 6.500 3.000 5.550 2.0\n",
" 0.75 6.900 3.175 5.875 2.3\n"
]
}
],
"source": [
"# Group the data by species and display summary statistics\n",
"summary_stats_species = df.groupby('variety').describe()\n",
"\n",
"# Compute specific percentiles and statistics\n",
"percentiles = df.groupby('variety').quantile([0.25, 0.5, 0.75])\n",
"\n",
"# Display summary statistics and percentiles\n",
"summary_stats_species = df.groupby('variety').describe()\n",
"\n",
"print(\"\\nPercentiles by Species:\")\n",
"print(percentiles)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2116e46c-a150-4df9-82b9-a4684007ba59",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Summary Statistics by Species for Sepal Width:\n",
" count mean std min 25% 50% 75% max\n",
"variety \n",
"Setosa 50.0 3.428 0.379064 2.3 3.200 3.4 3.675 4.4\n",
"Versicolor 50.0 2.770 0.313798 2.0 2.525 2.8 3.000 3.4\n",
"Virginica 50.0 2.974 0.322497 2.2 2.800 3.0 3.175 3.8\n"
]
}
],
"source": [
"# Group the data by variety; Select sepal.width column for each of the groups created; Display summary statistics\n",
"summary_stats_species = df.groupby('variety')['sepal.width'].describe()\n",
"\n",
"print(\"\\nSummary Statistics by Species for Sepal Width:\")\n",
"print(summary_stats_species)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "47148444-1790-46a9-93ce-dba152e58894",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median Values by Species:\n",
" sepal.length sepal.width petal.length petal.width\n",
"variety \n",
"Setosa 5.0 3.4 1.50 0.2\n",
"Versicolor 5.9 2.8 4.35 1.3\n",
"Virginica 6.5 3.0 5.55 2.0\n"
]
}
],
"source": [
"# Group by variety and compute the median for numeric columns\n",
"median_values = df.groupby('variety').median()\n",
"\n",
"print(\"Median Values by Species:\")\n",
"print(median_values)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3adfcaa8-d3e2-483d-9715-4d1a95cd2da9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Median Sepal Length by Species:\n",
"variety\n",
"Setosa 5.0\n",
"Versicolor 5.9\n",
"Virginica 6.5\n",
"Name: sepal.length, dtype: float64\n"
]
}
],
"source": [
"# Group the data by variety; Select sepal.width column for each of the groups created; Display median\n",
"median_sepal_length = df.groupby('variety')['sepal.length'].median()\n",
"print(\"Median Sepal Length by Species:\")\n",
"print(median_sepal_length)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4a27f1b8-91d0-43a6-8593-983ddf5f9c58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mode of Width: [3.]\n"
]
}
],
"source": [
"# Calculate & print mode for sepal.width\n",
"mode_width = df['sepal.width'].mode()\n",
"print(f\"Mode of Width: {mode_width.values}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,308 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6ed65106-23d2-4261-bf81-2a5b4b5ec60e",
"metadata": {},
"source": [
"# Notebook-A7 (Text Analytics)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "127a35f5-2434-4f6a-8904-edb4fb4f6f29",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import nltk\n",
"from nltk.tokenize import *\n",
"from nltk.corpus import *\n",
"from nltk.stem import *\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "555c6d33-75bb-4033-a9fa-60e145527464",
"metadata": {},
"outputs": [],
"source": [
"# Download resources\n",
"nltk.download('all') # WARNING: ABOUT 2GBs\n",
"\n",
"\"\"\"\n",
"OR YOU COULD DOWNLOAD ONLY SPECIFIC RESOURCES\n",
"nltk.download('punkt') # For splitting text into sentences or words\n",
"nltk.download('stopwords') # Common stop words\n",
"nltk.download('wordnet') # Synonyms\n",
"nltk.download('averaged_perceptron_tagger') # part-of-speech (POS) tagger\n",
"nltk.download('punkt_tab') # For tokenizing text that is formatted in tabular form\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ba67d90c-5711-496a-bf81-a5aef68a01bd",
"metadata": {},
"outputs": [],
"source": [
"# Write text to perform preprocessing on\n",
"text = \"Hello everyone! I am first name last name. I am a loyal KSKA Git user all the way from Sangamwadi Empire. I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain. For every smart contract, I lose one strand of my hair. In my free time, which by the way, I barely get, I like to swim.\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "fa8d4d18-ba91-4ced-9522-849be18aba6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Hello everyone!', 'I am first name last name.', 'I am a loyal KSKA Git user all the way from Sangamwadi Empire.', 'I have considerable knowledge about life, Python, C++, Java, Rust, Golang and Blockchain.', 'For every smart contract, I lose one strand of my hair.', 'In my free time, which by the way, I barely get, I like to swim.']\n"
]
}
],
"source": [
"# Sentence tokenization\n",
"var1 = sent_tokenize(text)\n",
"print(var1)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a53cc954-e60f-41b8-8e15-09fdc5b80328",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Hello', 'everyone', '!', 'I', 'am', 'first', 'name', 'last', 'name', '.', 'I', 'am', 'a', 'loyal', 'KSKA', 'Git', 'user', 'all', 'the', 'way', 'from', 'Sangamwadi', 'Empire', '.', 'I', 'have', 'considerable', 'knowledge', 'about', 'life', ',', 'Python', ',', 'C++', ',', 'Java', ',', 'Rust', ',', 'Golang', 'and', 'Blockchain', '.', 'For', 'every', 'smart', 'contract', ',', 'I', 'lose', 'one', 'strand', 'of', 'my', 'hair', '.', 'In', 'my', 'free', 'time', ',', 'which', 'by', 'the', 'way', ',', 'I', 'barely', 'get', ',', 'I', 'like', 'to', 'swim', '.']\n"
]
}
],
"source": [
"# Word tokenization\n",
"var2 = word_tokenize(text)\n",
"print(var2)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f259c7c3-9e94-42cb-bb94-a81176dc3126",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"After removing punctuation from text:\n",
" Hello everyone I am first name last name I am a loyal KSKA Git user all the way from Sangamwadi Empire I have considerable knowledge about life Python C Java Rust Golang and Blockchain For every smart contract I lose one strand of my hair In my free time which by the way I barely get I like to swim \n"
]
}
],
"source": [
"# Removing punctuation\n",
"text = re.sub('[^a-zA-Z]',' ',text)\n",
"print(\"After removing punctuation from text:\\n\", text)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "092b3e63-3161-4b23-be8d-ff83a829205f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Stop words:\n",
" {'do', 'most', 'more', 'am', 'aren', 'other', \"shouldn't\", 's', 'now', 'again', 'here', 'off', \"we're\", 'during', 'haven', 'above', \"we've\", 'our', \"he'll\", 'whom', 'ain', 'is', \"she'll\", 'once', \"that'll\", \"needn't\", 'shan', 'weren', 'been', 'doing', 'wasn', 'needn', 'any', 'not', \"aren't\", \"won't\", 'myself', 'couldn', 'by', 'were', 'no', \"he's\", \"shan't\", 'very', \"i'd\", 'y', 'm', 'your', 'against', 'are', 'she', \"hasn't\", \"she'd\", \"you'll\", 'because', 'mightn', 'their', \"they'd\", 'nor', 'having', 'into', 'so', \"it's\", 'don', 'who', \"haven't\", 'his', 'what', 'why', 'we', 'i', \"i'm\", 'hadn', 'over', 'and', 'her', 'to', 'ma', 'a', 'it', \"isn't\", 'under', 'o', 'until', 'an', 'same', 'them', 'did', \"they're\", 'ourselves', 'as', 'its', \"wasn't\", 'doesn', 'just', 'yourselves', 'll', 'down', 'itself', \"i've\", 'should', 'shouldn', \"mightn't\", 'on', 'these', 'or', 'only', 'd', 'hasn', 'about', 'wouldn', \"couldn't\", 're', 'mustn', 'with', \"you'd\", 'few', 'in', 'the', 'out', \"don't\", 'him', \"wouldn't\", 'can', 'through', 'from', 'those', 'for', 'didn', 'you', 'below', 'up', 'themselves', \"didn't\", 'too', 'being', 'of', 'further', 'some', \"we'd\", \"i'll\", \"it'll\", 'while', \"doesn't\", \"mustn't\", 'that', 've', 'if', 'be', 'yourself', 'he', \"hadn't\", 'how', 'than', 'was', 'will', 'before', 'my', 't', 'theirs', 'at', \"weren't\", \"should've\", 'won', \"you're\", 'own', 'isn', \"you've\", 'such', 'himself', \"she's\", 'all', 'me', 'but', \"they'll\", \"he'd\", 'after', \"they've\", 'then', 'this', 'both', 'hers', 'herself', 'ours', \"it'd\", 'which', 'where', \"we'll\", 'each', 'between', 'there', 'yours', 'had', 'have', 'has', 'when', 'does', 'they'}\n",
"==============================================================\n",
"Tokenized Sentence:\n",
" ['hello', 'everyone', 'i', 'am', 'first', 'name', 'last', 'name', 'i', 'am', 'a', 'loyal', 'kska', 'git', 'user', 'all', 'the', 'way', 'from', 'sangamwadi', 'empire', 'i', 'have', 'considerable', 'knowledge', 'about', 'life', 'python', 'c', 'java', 'rust', 'golang', 'and', 'blockchain', 'for', 'every', 'smart', 'contract', 'i', 'lose', 'one', 'strand', 'of', 'my', 'hair', 'in', 'my', 'free', 'time', 'which', 'by', 'the', 'way', 'i', 'barely', 'get', 'i', 'like', 'to', 'swim']\n",
"\n",
"Filtered Sentence:\n",
" ['hello', 'everyone', 'first', 'name', 'last', 'name', 'loyal', 'kska', 'git', 'user', 'way', 'sangamwadi', 'empire', 'considerable', 'knowledge', 'life', 'python', 'c', 'java', 'rust', 'golang', 'blockchain', 'every', 'smart', 'contract', 'lose', 'one', 'strand', 'hair', 'free', 'time', 'way', 'barely', 'get', 'like', 'swim']\n"
]
}
],
"source": [
"# Removing stop words\n",
"var3 = set(stopwords.words('english'))\n",
"print(\"Stop words:\\n\", var3)\n",
"print(\"==============================================================\")\n",
"tokens = word_tokenize(text.lower())\n",
"filtered_text = []\n",
"for word in tokens:\n",
" if word not in var3:\n",
" filtered_text.append(word)\n",
"print(\"Tokenized Sentence:\\n\", tokens)\n",
"print(\"\\nFiltered Sentence:\\n\", filtered_text)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "1b6d55c9-5724-4abb-bcb2-4fc5d27cbe12",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"write\n",
"write\n",
"wrote\n",
"write\n",
"read\n",
"read\n"
]
}
],
"source": [
"# Stemmatization\n",
"var = [\"write\", \"writing\", \"wrote\", \"writes\",\"reading\",\"reads\"]\n",
"ps = PorterStemmer() # brings word to its root form\n",
"for w in var:\n",
" root_word = ps.stem(w)\n",
" print(root_word)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6e8f62c1-d4ae-48a8-8d3e-a86366ed7972",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Text is:\t ['studies', 'studying', 'cries', 'cry']\n",
"Lemma for studies is study\n",
"Lemma for studying is studying\n",
"Lemma for cries is cry\n",
"Lemma for cry is cry\n"
]
}
],
"source": [
"# Lemmatization\n",
"wordnet_lemmatizer = WordNetLemmatizer()\n",
"text = \"studies studying cries cry\"\n",
"tt = nltk.word_tokenize(text)\n",
"print(\"Text is:\\t\", tt)\n",
"for w in tt:\n",
" print(\"Lemma for {} is {}\".format(w, wordnet_lemmatizer.lemmatize(w)))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c2ff8017-d03a-412f-b1f8-e2fb0b70bfca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Hello', 'NNP'),\n",
" ('everyone', 'NN'),\n",
" ('this', 'DT'),\n",
" ('is', 'VBZ'),\n",
" ('a', 'DT'),\n",
" ('sample', 'JJ'),\n",
" ('text', 'NN'),\n",
" ('!', '.'),\n",
" ('Earth', 'NN'),\n",
" ('.', '.')]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# POS Tagging\n",
"text = \"Hello everyone this is a sample text! Earth.\"\n",
"text = nltk.word_tokenize(text)\n",
"nltk.pos_tag(text)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "1d71007a-a8cb-45ab-af27-ec69b6826ddd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TF-IDF\n",
"['an' 'example' 'frequency' 'is' 'meow' 'of' 'term' 'this'] [[0.1767767 0.1767767 0.1767767 0.1767767 0.88388348 0.1767767\n",
" 0.1767767 0.1767767 ]]\n"
]
}
],
"source": [
"# TF-IDF (Term Frequency & Inverse Document Frequency)\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"new_sentence = \"This is an example of term frequency. Meow meow meow meow meow!\"\n",
"\n",
"def calculate_tfIdf(document):\n",
" tokenizer = TfidfVectorizer()\n",
" tf_matrix = tokenizer.fit_transform(document)\n",
" features_names = tokenizer.get_feature_names_out()\n",
" return tf_matrix, features_names\n",
"\n",
"# Wrap the new_sentence in a list\n",
"document = [new_sentence]\n",
"tf_matrix, feature_names = calculate_tfIdf(document)\n",
"\n",
"print('TF-IDF')\n",
"print(feature_names, tf_matrix.toarray())"
]
},
{
"cell_type": "markdown",
"id": "b34b21ba-46d6-4bad-b001-aa9962cc17b0",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long