Files
DataScienceAndBigDataAnalytics/Notebooks/Notebook-A2 (Data Wrangling-2).ipynb
T

984 lines
33 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "6ce3d040-a50f-459c-b494-e172f2897780",
"metadata": {},
"source": [
"# Notebook-A2 (Data Wrangling-2)\n",
"\n",
"- “Academic performance” wala\n",
"- Dataset generated here, not imported"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ce0d3ca6-fec0-4d3f-82b5-8ef92256525a",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import pandas as pd\n",
"# import pandas as shriniwas\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "24b47cb9-c955-4325-ad62-4b215d73398c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nIF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\\n\\ndata = {\\n 'Student_id': [1,2,3,4,5,6,7,8,9,10],\\n 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\\n 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\\n 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\\n 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\\n [58, 96, 61]],\\n 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\\n 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\\n}\\n\""
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generate data\n",
"np.random.seed(50) #for consistency\n",
"\n",
"data = {\n",
" 'Student_id': range(1, 51),\n",
" 'Name': ['Student_' + str(i) for i in range(1, 51)],\n",
" 'Age': np.random.randint(18, 25, size=50),\n",
" 'Gender': np.random.choice(['Male', 'Female'], size=50),\n",
" 'Scores': [np.random.randint(50, 100, size=3).tolist() for _ in range(50)],\n",
" 'Attendance': np.random.randint(20,100,size=50),\n",
" 'Grade': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=50)\n",
"}\n",
"\n",
"\"\"\"\n",
"IF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\n",
"\n",
"data = {\n",
" 'Student_id': [1,2,3,4,5,6,7,8,9,10],\n",
" 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\n",
" 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\n",
" 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\n",
" 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\n",
" [58, 96, 61]],\n",
" 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\n",
" 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "ba84d792-8cff-4a94-b936-6c6ae0bd8527",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Student_id</th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>Gender</th>\n",
" <th>Scores</th>\n",
" <th>Attendance</th>\n",
" <th>Grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Student_1</td>\n",
" <td>18</td>\n",
" <td>Female</td>\n",
" <td>[64, 54, 72]</td>\n",
" <td>55</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Student_2</td>\n",
" <td>18</td>\n",
" <td>Male</td>\n",
" <td>[93, 69, 82]</td>\n",
" <td>23</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Student_3</td>\n",
" <td>21</td>\n",
" <td>Female</td>\n",
" <td>[87, 90, 80]</td>\n",
" <td>84</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Student_4</td>\n",
" <td>23</td>\n",
" <td>Female</td>\n",
" <td>[94, 93, 85]</td>\n",
" <td>66</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Student_5</td>\n",
" <td>19</td>\n",
" <td>Male</td>\n",
" <td>[88, 77, 78]</td>\n",
" <td>32</td>\n",
" <td>F</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18 Female [64, 54, 72] 55 B\n",
"1 2 Student_2 18 Male [93, 69, 82] 23 C\n",
"2 3 Student_3 21 Female [87, 90, 80] 84 F\n",
"3 4 Student_4 23 Female [94, 93, 85] 66 C\n",
"4 5 Student_5 19 Male [88, 77, 78] 32 F"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import data into DataFrame\n",
"df = pd.DataFrame(data)\n",
"df.head() # Print first 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "8d9d0a15-5a81-4b3d-90de-0efd631b0ec8",
"metadata": {},
"outputs": [],
"source": [
"# Assign grades\n",
"def assign_grade(scores):\n",
" avg_score = np.mean(scores)\n",
"\n",
" if avg_score > 90:\n",
" return 'A'\n",
" elif avg_score > 80:\n",
" return 'B'\n",
" elif avg_score > 70:\n",
" return 'C'\n",
" elif avg_score > 60:\n",
" return 'D'\n",
" else:\n",
" return 'F'\n",
"\n",
"df['Grade'] = df['Scores'].apply(assign_grade)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "b2480d57-75c9-48d3-94ec-5b3f0a459007",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Student_id</th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>Gender</th>\n",
" <th>Scores</th>\n",
" <th>Attendance</th>\n",
" <th>Grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Student_1</td>\n",
" <td>18.0</td>\n",
" <td>Female</td>\n",
" <td>[64, 54, 72]</td>\n",
" <td>55</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Student_2</td>\n",
" <td>18.0</td>\n",
" <td>Male</td>\n",
" <td>[93, 69, 82]</td>\n",
" <td>23</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Student_3</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[87, 90, 80]</td>\n",
" <td>84</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Student_4</td>\n",
" <td>23.0</td>\n",
" <td>Female</td>\n",
" <td>[94, 93, 85]</td>\n",
" <td>66</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Student_5</td>\n",
" <td>19.0</td>\n",
" <td>Male</td>\n",
" <td>[88, 77, 78]</td>\n",
" <td>32</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Student_6</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[81, 90, 65]</td>\n",
" <td>96</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Student_7</td>\n",
" <td>22.0</td>\n",
" <td>Female</td>\n",
" <td>[55, 97, 54]</td>\n",
" <td>73</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Student_8</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[54, 68, 97]</td>\n",
" <td>41</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Student_9</td>\n",
" <td>NaN</td>\n",
" <td>Male</td>\n",
" <td>[92, 67, 76]</td>\n",
" <td>98</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>Student_10</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[58, 96, 61]</td>\n",
" <td>105</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>Student_11</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[77, 77, 57]</td>\n",
" <td>65</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>Student_12</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>None</td>\n",
" <td>53</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13</td>\n",
" <td>Student_13</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>[85, 53, 71]</td>\n",
" <td>74</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>14</td>\n",
" <td>Student_14</td>\n",
" <td>20.0</td>\n",
" <td>Female</td>\n",
" <td>[92, 53, 56]</td>\n",
" <td>70</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>15</td>\n",
" <td>Student_15</td>\n",
" <td>20.0</td>\n",
" <td>Male</td>\n",
" <td>[65, 81, 72]</td>\n",
" <td>63</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>16</td>\n",
" <td>Student_16</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[50, 61, 80]</td>\n",
" <td>52</td>\n",
" <td>Z</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>Student_17</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[83, 99, 64]</td>\n",
" <td>88</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>18</td>\n",
" <td>Student_18</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[76, 72, 96]</td>\n",
" <td>70</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>19</td>\n",
" <td>Student_19</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[87, 56, 80]</td>\n",
" <td>79</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>20</td>\n",
" <td>Student_20</td>\n",
" <td>21.0</td>\n",
" <td>Male</td>\n",
" <td>None</td>\n",
" <td>61</td>\n",
" <td>C</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18.0 Female [64, 54, 72] 55 B\n",
"1 2 Student_2 18.0 Male [93, 69, 82] 23 C\n",
"2 3 Student_3 21.0 Female [87, 90, 80] 84 F\n",
"3 4 Student_4 23.0 Female [94, 93, 85] 66 C\n",
"4 5 Student_5 19.0 Male [88, 77, 78] 32 F\n",
"5 6 Student_6 24.0 Male [81, 90, 65] 96 D\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 NaN Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 65 D\n",
"11 12 Student_12 23.0 Male None 53 A\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 74 C\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 A\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 D\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 C\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 D\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 B\n",
"19 20 Student_20 21.0 Male None 61 C"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Introduce missing + invalid values and inconsistencies\n",
"df = pd.DataFrame(data)\n",
"df.loc[8, 'Age'] = np.nan\n",
"df.loc[29, 'Age'] = np.nan\n",
"df.loc[35, 'Age'] = np.nan\n",
"df.loc[11, 'Scores'] = None\n",
"df.loc[19, 'Scores'] = None\n",
"df.loc[9, 'Attendance'] = 105 # invalid percentage\n",
"df.loc[15, 'Grade'] = 'Z' # invalid grade\n",
"df.head(20) # Print first 20 rows"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "629f27c9-bca3-404f-8c8f-fae7a3882db8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing values:\n",
" Student_id 0\n",
"Name 0\n",
"Age 3\n",
"Gender 0\n",
"Scores 2\n",
"Attendance 0\n",
"Grade 0\n",
"dtype: int64\n",
"Invalid attendance:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
"Invalid grades:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n"
]
}
],
"source": [
"# Locating & printing missing/invalid values\n",
"missing_values = df.isnull().sum() #check missing values\n",
"invalid_attendance = df[(df['Attendance'] < 0) | (df['Attendance'] > 100)]\n",
"invalid_grades = df[~df['Grade'].isin(['A', 'B', 'C', 'D', 'F'])]\n",
"\n",
"print(\"Missing values:\\n\", missing_values)\n",
"print(\"Invalid attendance:\\n\", invalid_attendance)\n",
"print(\"Invalid grades:\\n\", invalid_grades)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "4e46dfbe-693b-4f3b-a9a3-a0d243cd5214",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Student_id</th>\n",
" <th>Name</th>\n",
" <th>Age</th>\n",
" <th>Gender</th>\n",
" <th>Scores</th>\n",
" <th>Attendance</th>\n",
" <th>Grade</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Student_1</td>\n",
" <td>18.0</td>\n",
" <td>Female</td>\n",
" <td>[64, 54, 72]</td>\n",
" <td>55</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Student_2</td>\n",
" <td>18.0</td>\n",
" <td>Male</td>\n",
" <td>[93, 69, 82]</td>\n",
" <td>23</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Student_3</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[87, 90, 80]</td>\n",
" <td>84</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Student_4</td>\n",
" <td>23.0</td>\n",
" <td>Female</td>\n",
" <td>[94, 93, 85]</td>\n",
" <td>66</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Student_5</td>\n",
" <td>19.0</td>\n",
" <td>Male</td>\n",
" <td>[88, 77, 78]</td>\n",
" <td>32</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Student_6</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[81, 90, 65]</td>\n",
" <td>96</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Student_7</td>\n",
" <td>22.0</td>\n",
" <td>Female</td>\n",
" <td>[55, 97, 54]</td>\n",
" <td>73</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Student_8</td>\n",
" <td>24.0</td>\n",
" <td>Male</td>\n",
" <td>[54, 68, 97]</td>\n",
" <td>41</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Student_9</td>\n",
" <td>21.0</td>\n",
" <td>Male</td>\n",
" <td>[92, 67, 76]</td>\n",
" <td>98</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>Student_10</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[58, 96, 61]</td>\n",
" <td>100</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>Student_11</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[77, 77, 57]</td>\n",
" <td>65</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>Student_12</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>[0, 0, 0]</td>\n",
" <td>53</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13</td>\n",
" <td>Student_13</td>\n",
" <td>23.0</td>\n",
" <td>Male</td>\n",
" <td>[85, 53, 71]</td>\n",
" <td>74</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>14</td>\n",
" <td>Student_14</td>\n",
" <td>20.0</td>\n",
" <td>Female</td>\n",
" <td>[92, 53, 56]</td>\n",
" <td>70</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>15</td>\n",
" <td>Student_15</td>\n",
" <td>20.0</td>\n",
" <td>Male</td>\n",
" <td>[65, 81, 72]</td>\n",
" <td>63</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>16</td>\n",
" <td>Student_16</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[50, 61, 80]</td>\n",
" <td>52</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>Student_17</td>\n",
" <td>24.0</td>\n",
" <td>Female</td>\n",
" <td>[83, 99, 64]</td>\n",
" <td>88</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>18</td>\n",
" <td>Student_18</td>\n",
" <td>21.0</td>\n",
" <td>Female</td>\n",
" <td>[76, 72, 96]</td>\n",
" <td>70</td>\n",
" <td>B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>19</td>\n",
" <td>Student_19</td>\n",
" <td>22.0</td>\n",
" <td>Male</td>\n",
" <td>[87, 56, 80]</td>\n",
" <td>79</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>20</td>\n",
" <td>Student_20</td>\n",
" <td>21.0</td>\n",
" <td>Male</td>\n",
" <td>[0, 0, 0]</td>\n",
" <td>61</td>\n",
" <td>F</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18.0 Female [64, 54, 72] 55 D\n",
"1 2 Student_2 18.0 Male [93, 69, 82] 23 B\n",
"2 3 Student_3 21.0 Female [87, 90, 80] 84 B\n",
"3 4 Student_4 23.0 Female [94, 93, 85] 66 A\n",
"4 5 Student_5 19.0 Male [88, 77, 78] 32 B\n",
"5 6 Student_6 24.0 Male [81, 90, 65] 96 C\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 65 C\n",
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 74 D\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Handling missing/invalid values\n",
"df['Age'] = df['Age'].fillna(df['Age'].median()) #fill by median\n",
"df['Attendance'] = df['Attendance'].apply(lambda x: 100 if x > 100 else (0 if x < 0 else x))\n",
"\n",
"def handle_invalid_scores(scores):\n",
" if scores is None:\n",
" return [0, 0, 0]\n",
"\n",
" return [max(0, min(100, score)) for score in scores]\n",
"\n",
"df['Scores'] = df['Scores'].apply(handle_invalid_scores)\n",
"df['Grade'] = df['Scores'].apply(assign_grade)\n",
"df['Grade'] = df['Grade'].apply(lambda x: x if x in ['A', 'B', 'C', 'D', 'F'] else 'F')\n",
"df.head(20) # Print first 20 rows"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "f7cfc3bb-91c4-4fa8-b723-ccd786cd8626",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame with Outliers:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"5 6 Student_6 65.0 Male [81, 90, 65] 96 C\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 200 C\n",
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 166 D\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F\n"
]
}
],
"source": [
"# Adding outiers\n",
"df.loc[5, 'Age'] = 35\n",
"df.loc[5, 'Age'] = 50\n",
"df.loc[5, 'Age'] = 65\n",
"df.loc[10, 'Attendance'] = 200\n",
"df.loc[12, 'Attendance'] = 175\n",
"df.loc[12, 'Attendance'] = 166\n",
"\n",
"print(\"DataFrame with Outliers:\")\n",
"print(df.iloc[5:20])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "92e0593b-99df-4e08-a8da-ee923936cb91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"5 6 Student_6 26.875 Male [81, 90, 65] 96.00 C\n",
"6 7 Student_7 22.000 Female [55, 97, 54] 73.00 D\n",
"7 8 Student_8 24.000 Male [54, 68, 97] 41.00 C\n",
"8 9 Student_9 21.000 Male [92, 67, 76] 98.00 C\n",
"9 10 Student_10 24.000 Female [58, 96, 61] 100.00 C\n",
"10 11 Student_11 24.000 Female [77, 77, 57] 142.25 C\n",
"11 12 Student_12 23.000 Male [0, 0, 0] 53.00 F\n",
"12 13 Student_13 23.000 Male [85, 53, 71] 142.25 D\n",
"13 14 Student_14 20.000 Female [92, 53, 56] 70.00 D\n",
"14 15 Student_15 20.000 Male [65, 81, 72] 63.00 C\n",
"15 16 Student_16 22.000 Male [50, 61, 80] 52.00 D\n",
"16 17 Student_17 24.000 Female [83, 99, 64] 88.00 B\n",
"17 18 Student_18 21.000 Female [76, 72, 96] 70.00 B\n",
"18 19 Student_19 22.000 Male [87, 56, 80] 79.00 C\n",
"19 20 Student_20 21.000 Male [0, 0, 0] 61.00 F\n"
]
}
],
"source": [
"# Handling outliers\n",
"def handle_outliers_iqr(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
"\n",
" IQR = Q3 - Q1\n",
"\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
"\n",
" df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))\n",
"\n",
"handle_outliers_iqr(df, 'Age')\n",
"handle_outliers_iqr(df, 'Attendance')\n",
"\n",
"print(df.iloc[5:20])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9866733a-c095-402d-b6b0-0fd88ef31169",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame with Min-Max Scaling on 'Attendance':\n",
" Attendance Scaled_Attendance\n",
"0 55.00 0.286299\n",
"1 23.00 0.024540\n",
"2 84.00 0.523517\n",
"3 66.00 0.376278\n",
"4 32.00 0.098160\n",
"5 96.00 0.621677\n",
"6 73.00 0.433538\n",
"7 41.00 0.171779\n",
"8 98.00 0.638037\n",
"9 100.00 0.654397\n",
"10 142.25 1.000000\n",
"11 53.00 0.269939\n",
"12 142.25 1.000000\n",
"13 70.00 0.408998\n",
"14 63.00 0.351738\n",
"15 52.00 0.261759\n",
"16 88.00 0.556237\n",
"17 70.00 0.408998\n",
"18 79.00 0.482618\n",
"19 61.00 0.335378\n"
]
}
],
"source": [
"# Data transformation using min-max scaling\n",
"df['Scaled_Attendance'] = (df['Attendance'] - df['Attendance'].min()) / (df['Attendance'].max() - df['Attendance'].min())\n",
"\n",
"print(\"DataFrame with Min-Max Scaling on 'Attendance':\")\n",
"print(df[['Attendance', 'Scaled_Attendance']].head(20))"
]
},
{
"cell_type": "markdown",
"id": "8a4b032f-18de-48a0-82cd-3afd9f4426e8",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}