{
"cells": [
{
"cell_type": "markdown",
"id": "6ce3d040-a50f-459c-b494-e172f2897780",
"metadata": {},
"source": [
"# Notebook-A2 (Data Wrangling-2)\n",
"\n",
"- “Academic performance” wala\n",
"- Dataset generated here, not imported"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ce0d3ca6-fec0-4d3f-82b5-8ef92256525a",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import pandas as pd\n",
"# import pandas as shriniwas\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "24b47cb9-c955-4325-ad62-4b215d73398c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nIF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\\n\\ndata = {\\n 'Student_id': [1,2,3,4,5,6,7,8,9,10],\\n 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\\n 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\\n 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\\n 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\\n [58, 96, 61]],\\n 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\\n 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\\n}\\n\""
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generate data\n",
"np.random.seed(50) #for consistency\n",
"\n",
"data = {\n",
" 'Student_id': range(1, 51),\n",
" 'Name': ['Student_' + str(i) for i in range(1, 51)],\n",
" 'Age': np.random.randint(18, 25, size=50),\n",
" 'Gender': np.random.choice(['Male', 'Female'], size=50),\n",
" 'Scores': [np.random.randint(50, 100, size=3).tolist() for _ in range(50)],\n",
" 'Attendance': np.random.randint(20,100,size=50),\n",
" 'Grade': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=50)\n",
"}\n",
"\n",
"\"\"\"\n",
"IF YOU WISH TO MANUALLY ENTER DATA, YOU CAN DO SO. HERE'S AN EXAMPLE\n",
"\n",
"data = {\n",
" 'Student_id': [1,2,3,4,5,6,7,8,9,10],\n",
" 'Name': ['Ayan', 'Priya', 'Sahil', 'Riya', 'Kunal', 'Tanya', 'Rahul', 'Anjali', 'Raj', 'Neha'],\n",
" 'Age': [18, 20, 21, 22, 25, 18, 18, 19, 23, 24],\n",
" 'Gender': ['Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],\n",
" 'Scores': [[64, 54, 72], [93, 69, 82], [87, 90, 80], [94, 93, 85], [88, 77, 78], [81, 90, 65], [55, 97, 54], [54, 68, 97], [92, 67, 76],\n",
" [58, 96, 61]],\n",
" 'Attendance': [92, 95, 85, 88, 96, 80, 97, 78, 93, 89],\n",
" 'Grade': ['B', 'C', 'F', 'C', 'F', 'D', 'D', 'C', 'C', 'A']\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "ba84d792-8cff-4a94-b936-6c6ae0bd8527",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Student_id | \n",
" Name | \n",
" Age | \n",
" Gender | \n",
" Scores | \n",
" Attendance | \n",
" Grade | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Student_1 | \n",
" 18 | \n",
" Female | \n",
" [64, 54, 72] | \n",
" 55 | \n",
" B | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Student_2 | \n",
" 18 | \n",
" Male | \n",
" [93, 69, 82] | \n",
" 23 | \n",
" C | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Student_3 | \n",
" 21 | \n",
" Female | \n",
" [87, 90, 80] | \n",
" 84 | \n",
" F | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" Student_4 | \n",
" 23 | \n",
" Female | \n",
" [94, 93, 85] | \n",
" 66 | \n",
" C | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" Student_5 | \n",
" 19 | \n",
" Male | \n",
" [88, 77, 78] | \n",
" 32 | \n",
" F | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18 Female [64, 54, 72] 55 B\n",
"1 2 Student_2 18 Male [93, 69, 82] 23 C\n",
"2 3 Student_3 21 Female [87, 90, 80] 84 F\n",
"3 4 Student_4 23 Female [94, 93, 85] 66 C\n",
"4 5 Student_5 19 Male [88, 77, 78] 32 F"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import data into DataFrame\n",
"df = pd.DataFrame(data)\n",
"df.head() # Print first 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "8d9d0a15-5a81-4b3d-90de-0efd631b0ec8",
"metadata": {},
"outputs": [],
"source": [
"# Assign grades\n",
"def assign_grade(scores):\n",
" avg_score = np.mean(scores)\n",
"\n",
" if avg_score > 90:\n",
" return 'A'\n",
" elif avg_score > 80:\n",
" return 'B'\n",
" elif avg_score > 70:\n",
" return 'C'\n",
" elif avg_score > 60:\n",
" return 'D'\n",
" else:\n",
" return 'F'\n",
"\n",
"df['Grade'] = df['Scores'].apply(assign_grade)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "b2480d57-75c9-48d3-94ec-5b3f0a459007",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Student_id | \n",
" Name | \n",
" Age | \n",
" Gender | \n",
" Scores | \n",
" Attendance | \n",
" Grade | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Student_1 | \n",
" 18.0 | \n",
" Female | \n",
" [64, 54, 72] | \n",
" 55 | \n",
" B | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Student_2 | \n",
" 18.0 | \n",
" Male | \n",
" [93, 69, 82] | \n",
" 23 | \n",
" C | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Student_3 | \n",
" 21.0 | \n",
" Female | \n",
" [87, 90, 80] | \n",
" 84 | \n",
" F | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" Student_4 | \n",
" 23.0 | \n",
" Female | \n",
" [94, 93, 85] | \n",
" 66 | \n",
" C | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" Student_5 | \n",
" 19.0 | \n",
" Male | \n",
" [88, 77, 78] | \n",
" 32 | \n",
" F | \n",
"
\n",
" \n",
" | 5 | \n",
" 6 | \n",
" Student_6 | \n",
" 24.0 | \n",
" Male | \n",
" [81, 90, 65] | \n",
" 96 | \n",
" D | \n",
"
\n",
" \n",
" | 6 | \n",
" 7 | \n",
" Student_7 | \n",
" 22.0 | \n",
" Female | \n",
" [55, 97, 54] | \n",
" 73 | \n",
" D | \n",
"
\n",
" \n",
" | 7 | \n",
" 8 | \n",
" Student_8 | \n",
" 24.0 | \n",
" Male | \n",
" [54, 68, 97] | \n",
" 41 | \n",
" C | \n",
"
\n",
" \n",
" | 8 | \n",
" 9 | \n",
" Student_9 | \n",
" NaN | \n",
" Male | \n",
" [92, 67, 76] | \n",
" 98 | \n",
" C | \n",
"
\n",
" \n",
" | 9 | \n",
" 10 | \n",
" Student_10 | \n",
" 24.0 | \n",
" Female | \n",
" [58, 96, 61] | \n",
" 105 | \n",
" A | \n",
"
\n",
" \n",
" | 10 | \n",
" 11 | \n",
" Student_11 | \n",
" 24.0 | \n",
" Female | \n",
" [77, 77, 57] | \n",
" 65 | \n",
" D | \n",
"
\n",
" \n",
" | 11 | \n",
" 12 | \n",
" Student_12 | \n",
" 23.0 | \n",
" Male | \n",
" None | \n",
" 53 | \n",
" A | \n",
"
\n",
" \n",
" | 12 | \n",
" 13 | \n",
" Student_13 | \n",
" 23.0 | \n",
" Male | \n",
" [85, 53, 71] | \n",
" 74 | \n",
" C | \n",
"
\n",
" \n",
" | 13 | \n",
" 14 | \n",
" Student_14 | \n",
" 20.0 | \n",
" Female | \n",
" [92, 53, 56] | \n",
" 70 | \n",
" A | \n",
"
\n",
" \n",
" | 14 | \n",
" 15 | \n",
" Student_15 | \n",
" 20.0 | \n",
" Male | \n",
" [65, 81, 72] | \n",
" 63 | \n",
" D | \n",
"
\n",
" \n",
" | 15 | \n",
" 16 | \n",
" Student_16 | \n",
" 22.0 | \n",
" Male | \n",
" [50, 61, 80] | \n",
" 52 | \n",
" Z | \n",
"
\n",
" \n",
" | 16 | \n",
" 17 | \n",
" Student_17 | \n",
" 24.0 | \n",
" Female | \n",
" [83, 99, 64] | \n",
" 88 | \n",
" C | \n",
"
\n",
" \n",
" | 17 | \n",
" 18 | \n",
" Student_18 | \n",
" 21.0 | \n",
" Female | \n",
" [76, 72, 96] | \n",
" 70 | \n",
" D | \n",
"
\n",
" \n",
" | 18 | \n",
" 19 | \n",
" Student_19 | \n",
" 22.0 | \n",
" Male | \n",
" [87, 56, 80] | \n",
" 79 | \n",
" B | \n",
"
\n",
" \n",
" | 19 | \n",
" 20 | \n",
" Student_20 | \n",
" 21.0 | \n",
" Male | \n",
" None | \n",
" 61 | \n",
" C | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18.0 Female [64, 54, 72] 55 B\n",
"1 2 Student_2 18.0 Male [93, 69, 82] 23 C\n",
"2 3 Student_3 21.0 Female [87, 90, 80] 84 F\n",
"3 4 Student_4 23.0 Female [94, 93, 85] 66 C\n",
"4 5 Student_5 19.0 Male [88, 77, 78] 32 F\n",
"5 6 Student_6 24.0 Male [81, 90, 65] 96 D\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 NaN Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 65 D\n",
"11 12 Student_12 23.0 Male None 53 A\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 74 C\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 A\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 D\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 C\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 D\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 B\n",
"19 20 Student_20 21.0 Male None 61 C"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Introduce missing + invalid values and inconsistencies\n",
"df = pd.DataFrame(data)\n",
"df.loc[8, 'Age'] = np.nan\n",
"df.loc[29, 'Age'] = np.nan\n",
"df.loc[35, 'Age'] = np.nan\n",
"df.loc[11, 'Scores'] = None\n",
"df.loc[19, 'Scores'] = None\n",
"df.loc[9, 'Attendance'] = 105 # invalid percentage\n",
"df.loc[15, 'Grade'] = 'Z' # invalid grade\n",
"df.head(20) # Print first 20 rows"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "629f27c9-bca3-404f-8c8f-fae7a3882db8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing values:\n",
" Student_id 0\n",
"Name 0\n",
"Age 3\n",
"Gender 0\n",
"Scores 2\n",
"Attendance 0\n",
"Grade 0\n",
"dtype: int64\n",
"Invalid attendance:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 105 A\n",
"Invalid grades:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 Z\n"
]
}
],
"source": [
"# Locating & printing missing/invalid values\n",
"missing_values = df.isnull().sum() #check missing values\n",
"invalid_attendance = df[(df['Attendance'] < 0) | (df['Attendance'] > 100)]\n",
"invalid_grades = df[~df['Grade'].isin(['A', 'B', 'C', 'D', 'F'])]\n",
"\n",
"print(\"Missing values:\\n\", missing_values)\n",
"print(\"Invalid attendance:\\n\", invalid_attendance)\n",
"print(\"Invalid grades:\\n\", invalid_grades)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "4e46dfbe-693b-4f3b-a9a3-a0d243cd5214",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Student_id | \n",
" Name | \n",
" Age | \n",
" Gender | \n",
" Scores | \n",
" Attendance | \n",
" Grade | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Student_1 | \n",
" 18.0 | \n",
" Female | \n",
" [64, 54, 72] | \n",
" 55 | \n",
" D | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Student_2 | \n",
" 18.0 | \n",
" Male | \n",
" [93, 69, 82] | \n",
" 23 | \n",
" B | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Student_3 | \n",
" 21.0 | \n",
" Female | \n",
" [87, 90, 80] | \n",
" 84 | \n",
" B | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" Student_4 | \n",
" 23.0 | \n",
" Female | \n",
" [94, 93, 85] | \n",
" 66 | \n",
" A | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" Student_5 | \n",
" 19.0 | \n",
" Male | \n",
" [88, 77, 78] | \n",
" 32 | \n",
" B | \n",
"
\n",
" \n",
" | 5 | \n",
" 6 | \n",
" Student_6 | \n",
" 24.0 | \n",
" Male | \n",
" [81, 90, 65] | \n",
" 96 | \n",
" C | \n",
"
\n",
" \n",
" | 6 | \n",
" 7 | \n",
" Student_7 | \n",
" 22.0 | \n",
" Female | \n",
" [55, 97, 54] | \n",
" 73 | \n",
" D | \n",
"
\n",
" \n",
" | 7 | \n",
" 8 | \n",
" Student_8 | \n",
" 24.0 | \n",
" Male | \n",
" [54, 68, 97] | \n",
" 41 | \n",
" C | \n",
"
\n",
" \n",
" | 8 | \n",
" 9 | \n",
" Student_9 | \n",
" 21.0 | \n",
" Male | \n",
" [92, 67, 76] | \n",
" 98 | \n",
" C | \n",
"
\n",
" \n",
" | 9 | \n",
" 10 | \n",
" Student_10 | \n",
" 24.0 | \n",
" Female | \n",
" [58, 96, 61] | \n",
" 100 | \n",
" C | \n",
"
\n",
" \n",
" | 10 | \n",
" 11 | \n",
" Student_11 | \n",
" 24.0 | \n",
" Female | \n",
" [77, 77, 57] | \n",
" 65 | \n",
" C | \n",
"
\n",
" \n",
" | 11 | \n",
" 12 | \n",
" Student_12 | \n",
" 23.0 | \n",
" Male | \n",
" [0, 0, 0] | \n",
" 53 | \n",
" F | \n",
"
\n",
" \n",
" | 12 | \n",
" 13 | \n",
" Student_13 | \n",
" 23.0 | \n",
" Male | \n",
" [85, 53, 71] | \n",
" 74 | \n",
" D | \n",
"
\n",
" \n",
" | 13 | \n",
" 14 | \n",
" Student_14 | \n",
" 20.0 | \n",
" Female | \n",
" [92, 53, 56] | \n",
" 70 | \n",
" D | \n",
"
\n",
" \n",
" | 14 | \n",
" 15 | \n",
" Student_15 | \n",
" 20.0 | \n",
" Male | \n",
" [65, 81, 72] | \n",
" 63 | \n",
" C | \n",
"
\n",
" \n",
" | 15 | \n",
" 16 | \n",
" Student_16 | \n",
" 22.0 | \n",
" Male | \n",
" [50, 61, 80] | \n",
" 52 | \n",
" D | \n",
"
\n",
" \n",
" | 16 | \n",
" 17 | \n",
" Student_17 | \n",
" 24.0 | \n",
" Female | \n",
" [83, 99, 64] | \n",
" 88 | \n",
" B | \n",
"
\n",
" \n",
" | 17 | \n",
" 18 | \n",
" Student_18 | \n",
" 21.0 | \n",
" Female | \n",
" [76, 72, 96] | \n",
" 70 | \n",
" B | \n",
"
\n",
" \n",
" | 18 | \n",
" 19 | \n",
" Student_19 | \n",
" 22.0 | \n",
" Male | \n",
" [87, 56, 80] | \n",
" 79 | \n",
" C | \n",
"
\n",
" \n",
" | 19 | \n",
" 20 | \n",
" Student_20 | \n",
" 21.0 | \n",
" Male | \n",
" [0, 0, 0] | \n",
" 61 | \n",
" F | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"0 1 Student_1 18.0 Female [64, 54, 72] 55 D\n",
"1 2 Student_2 18.0 Male [93, 69, 82] 23 B\n",
"2 3 Student_3 21.0 Female [87, 90, 80] 84 B\n",
"3 4 Student_4 23.0 Female [94, 93, 85] 66 A\n",
"4 5 Student_5 19.0 Male [88, 77, 78] 32 B\n",
"5 6 Student_6 24.0 Male [81, 90, 65] 96 C\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 65 C\n",
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 74 D\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Handling missing/invalid values\n",
"df['Age'] = df['Age'].fillna(df['Age'].median()) #fill by median\n",
"df['Attendance'] = df['Attendance'].apply(lambda x: 100 if x > 100 else (0 if x < 0 else x))\n",
"\n",
"def handle_invalid_scores(scores):\n",
" if scores is None:\n",
" return [0, 0, 0]\n",
"\n",
" return [max(0, min(100, score)) for score in scores]\n",
"\n",
"df['Scores'] = df['Scores'].apply(handle_invalid_scores)\n",
"df['Grade'] = df['Scores'].apply(assign_grade)\n",
"df['Grade'] = df['Grade'].apply(lambda x: x if x in ['A', 'B', 'C', 'D', 'F'] else 'F')\n",
"df.head(20) # Print first 20 rows"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "f7cfc3bb-91c4-4fa8-b723-ccd786cd8626",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame with Outliers:\n",
" Student_id Name Age Gender Scores Attendance Grade\n",
"5 6 Student_6 65.0 Male [81, 90, 65] 96 C\n",
"6 7 Student_7 22.0 Female [55, 97, 54] 73 D\n",
"7 8 Student_8 24.0 Male [54, 68, 97] 41 C\n",
"8 9 Student_9 21.0 Male [92, 67, 76] 98 C\n",
"9 10 Student_10 24.0 Female [58, 96, 61] 100 C\n",
"10 11 Student_11 24.0 Female [77, 77, 57] 200 C\n",
"11 12 Student_12 23.0 Male [0, 0, 0] 53 F\n",
"12 13 Student_13 23.0 Male [85, 53, 71] 166 D\n",
"13 14 Student_14 20.0 Female [92, 53, 56] 70 D\n",
"14 15 Student_15 20.0 Male [65, 81, 72] 63 C\n",
"15 16 Student_16 22.0 Male [50, 61, 80] 52 D\n",
"16 17 Student_17 24.0 Female [83, 99, 64] 88 B\n",
"17 18 Student_18 21.0 Female [76, 72, 96] 70 B\n",
"18 19 Student_19 22.0 Male [87, 56, 80] 79 C\n",
"19 20 Student_20 21.0 Male [0, 0, 0] 61 F\n"
]
}
],
"source": [
"# Adding outiers\n",
"df.loc[5, 'Age'] = 35\n",
"df.loc[5, 'Age'] = 50\n",
"df.loc[5, 'Age'] = 65\n",
"df.loc[10, 'Attendance'] = 200\n",
"df.loc[12, 'Attendance'] = 175\n",
"df.loc[12, 'Attendance'] = 166\n",
"\n",
"print(\"DataFrame with Outliers:\")\n",
"print(df.iloc[5:20])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "92e0593b-99df-4e08-a8da-ee923936cb91",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Student_id Name Age Gender Scores Attendance Grade\n",
"5 6 Student_6 26.875 Male [81, 90, 65] 96.00 C\n",
"6 7 Student_7 22.000 Female [55, 97, 54] 73.00 D\n",
"7 8 Student_8 24.000 Male [54, 68, 97] 41.00 C\n",
"8 9 Student_9 21.000 Male [92, 67, 76] 98.00 C\n",
"9 10 Student_10 24.000 Female [58, 96, 61] 100.00 C\n",
"10 11 Student_11 24.000 Female [77, 77, 57] 142.25 C\n",
"11 12 Student_12 23.000 Male [0, 0, 0] 53.00 F\n",
"12 13 Student_13 23.000 Male [85, 53, 71] 142.25 D\n",
"13 14 Student_14 20.000 Female [92, 53, 56] 70.00 D\n",
"14 15 Student_15 20.000 Male [65, 81, 72] 63.00 C\n",
"15 16 Student_16 22.000 Male [50, 61, 80] 52.00 D\n",
"16 17 Student_17 24.000 Female [83, 99, 64] 88.00 B\n",
"17 18 Student_18 21.000 Female [76, 72, 96] 70.00 B\n",
"18 19 Student_19 22.000 Male [87, 56, 80] 79.00 C\n",
"19 20 Student_20 21.000 Male [0, 0, 0] 61.00 F\n"
]
}
],
"source": [
"# Handling outliers\n",
"def handle_outliers_iqr(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
"\n",
" IQR = Q3 - Q1\n",
"\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
"\n",
" df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))\n",
"\n",
"handle_outliers_iqr(df, 'Age')\n",
"handle_outliers_iqr(df, 'Attendance')\n",
"\n",
"print(df.iloc[5:20])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9866733a-c095-402d-b6b0-0fd88ef31169",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame with Min-Max Scaling on 'Attendance':\n",
" Attendance Scaled_Attendance\n",
"0 55.00 0.286299\n",
"1 23.00 0.024540\n",
"2 84.00 0.523517\n",
"3 66.00 0.376278\n",
"4 32.00 0.098160\n",
"5 96.00 0.621677\n",
"6 73.00 0.433538\n",
"7 41.00 0.171779\n",
"8 98.00 0.638037\n",
"9 100.00 0.654397\n",
"10 142.25 1.000000\n",
"11 53.00 0.269939\n",
"12 142.25 1.000000\n",
"13 70.00 0.408998\n",
"14 63.00 0.351738\n",
"15 52.00 0.261759\n",
"16 88.00 0.556237\n",
"17 70.00 0.408998\n",
"18 79.00 0.482618\n",
"19 61.00 0.335378\n"
]
}
],
"source": [
"# Data transformation using min-max scaling\n",
"df['Scaled_Attendance'] = (df['Attendance'] - df['Attendance'].min()) / (df['Attendance'].max() - df['Attendance'].min())\n",
"\n",
"print(\"DataFrame with Min-Max Scaling on 'Attendance':\")\n",
"print(df[['Attendance', 'Scaled_Attendance']].head(20))"
]
},
{
"cell_type": "markdown",
"id": "8a4b032f-18de-48a0-82cd-3afd9f4426e8",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}