572 lines
17 KiB
Plaintext
572 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "5ed97617-c927-48b4-857c-c4d75b98fe2c",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Notebook-A1 (Data Wrangling-1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "a93305d3-03c9-479e-bde3-fb0d06ea8d39",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Import libraries\n",
|
||
"import pandas as pd\n",
|
||
"import numpy as np"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "8bf62160-0384-4a3f-b25e-11e28c8b8df4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>sepal.length</th>\n",
|
||
" <th>sepal.width</th>\n",
|
||
" <th>petal.length</th>\n",
|
||
" <th>petal.width</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>150.000000</td>\n",
|
||
" <td>150.000000</td>\n",
|
||
" <td>150.000000</td>\n",
|
||
" <td>150.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>5.843333</td>\n",
|
||
" <td>3.057333</td>\n",
|
||
" <td>3.758000</td>\n",
|
||
" <td>1.199333</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>0.828066</td>\n",
|
||
" <td>0.435866</td>\n",
|
||
" <td>1.765298</td>\n",
|
||
" <td>0.762238</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>4.300000</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.100000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>5.100000</td>\n",
|
||
" <td>2.800000</td>\n",
|
||
" <td>1.600000</td>\n",
|
||
" <td>0.300000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>5.800000</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" <td>4.350000</td>\n",
|
||
" <td>1.300000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>6.400000</td>\n",
|
||
" <td>3.300000</td>\n",
|
||
" <td>5.100000</td>\n",
|
||
" <td>1.800000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>7.900000</td>\n",
|
||
" <td>4.400000</td>\n",
|
||
" <td>6.900000</td>\n",
|
||
" <td>2.500000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" sepal.length sepal.width petal.length petal.width\n",
|
||
"count 150.000000 150.000000 150.000000 150.000000\n",
|
||
"mean 5.843333 3.057333 3.758000 1.199333\n",
|
||
"std 0.828066 0.435866 1.765298 0.762238\n",
|
||
"min 4.300000 2.000000 1.000000 0.100000\n",
|
||
"25% 5.100000 2.800000 1.600000 0.300000\n",
|
||
"50% 5.800000 3.000000 4.350000 1.300000\n",
|
||
"75% 6.400000 3.300000 5.100000 1.800000\n",
|
||
"max 7.900000 4.400000 6.900000 2.500000"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Load the dataset into DataFrame\n",
|
||
"df=pd.read_csv('iris.csv')\n",
|
||
"df.describe() # Print description of DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "7c813f91-5100-463a-a848-2ef9f46344bc",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"First 5 values:\n",
|
||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||
"0 5.1 3.5 1.4 0.2 Setosa\n",
|
||
"1 4.9 3.0 1.4 0.2 Setosa\n",
|
||
"2 4.7 3.2 1.3 0.2 Setosa\n",
|
||
"3 4.6 3.1 1.5 0.2 Setosa\n",
|
||
"4 5.0 3.6 1.4 0.2 Setosa\n",
|
||
"Last 5 values:\n",
|
||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||
"145 6.7 3.0 5.2 2.3 Virginica\n",
|
||
"146 6.3 2.5 5.0 1.9 Virginica\n",
|
||
"147 6.5 3.0 5.2 2.0 Virginica\n",
|
||
"148 6.2 3.4 5.4 2.3 Virginica\n",
|
||
"149 5.9 3.0 5.1 1.8 Virginica\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Print first and last 5 values\n",
|
||
"print(\"First 5 values:\\n\", df.head())\n",
|
||
"print (\"Last 5 values:\\n\", df.tail())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "78fa6d44-7c39-4306-a5d7-0fd5ff94243a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 False\n",
|
||
"1 False\n",
|
||
"2 False\n",
|
||
"3 False\n",
|
||
"4 False\n",
|
||
" ... \n",
|
||
"145 False\n",
|
||
"146 False\n",
|
||
"147 False\n",
|
||
"148 False\n",
|
||
"149 False\n",
|
||
"Length: 150, dtype: bool"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Flag duplicated rows (boolean mask: True where a row repeats an earlier one)\n",
|
||
"df.duplicated()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "5da58b1d-c458-4eb5-b23c-053c02934efd",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>sepal.length</th>\n",
|
||
" <th>sepal.width</th>\n",
|
||
" <th>petal.length</th>\n",
|
||
" <th>petal.width</th>\n",
|
||
" <th>variety</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>145</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>146</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>147</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>148</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>149</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>150 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||
"0 False False False False False\n",
|
||
"1 False False False False False\n",
|
||
"2 False False False False False\n",
|
||
"3 False False False False False\n",
|
||
"4 False False False False False\n",
|
||
".. ... ... ... ... ...\n",
|
||
"145 False False False False False\n",
|
||
"146 False False False False False\n",
|
||
"147 False False False False False\n",
|
||
"148 False False False False False\n",
|
||
"149 False False False False False\n",
|
||
"\n",
|
||
"[150 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Boolean mask of missing values (True where a cell is null)\n",
|
||
"df.isnull()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "3b6face6-d366-4a05-9fd4-baff133e24f6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 150 entries, 0 to 149\n",
|
||
"Data columns (total 5 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 sepal.length 150 non-null float64\n",
|
||
" 1 sepal.width 150 non-null float64\n",
|
||
" 2 petal.length 150 non-null float64\n",
|
||
" 3 petal.width 150 non-null float64\n",
|
||
" 4 variety 150 non-null object \n",
|
||
"dtypes: float64(4), object(1)\n",
|
||
"memory usage: 6.0+ KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Print summary of DataFrame\n",
|
||
"df.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "f08a00a4-c82a-49a9-bbaa-427d5cc4db96",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(150, 5)"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Show the shape as a (rows, columns) tuple\n",
|
||
"df.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "186866fe-7614-4a4d-b929-31bd37f80027",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 False\n",
|
||
"1 False\n",
|
||
"2 False\n",
|
||
"3 False\n",
|
||
"4 False\n",
|
||
" ... \n",
|
||
"145 False\n",
|
||
"146 False\n",
|
||
"147 False\n",
|
||
"148 False\n",
|
||
"149 False\n",
|
||
"Name: sepal.length, Length: 150, dtype: bool"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Boolean mask of missing values in the sepal.length column (True where null)\n",
|
||
"df[\"sepal.length\"].isnull()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "5bc1e082-9818-450d-b76d-5dd3a2103e00",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" sepal.length sepal.width petal.width variety\n",
|
||
"0 5.1 3.5 0.2 Setosa\n",
|
||
"1 4.9 3.0 0.2 Setosa\n",
|
||
"2 4.7 3.2 0.2 Setosa\n",
|
||
"3 4.6 3.1 0.2 Setosa\n",
|
||
"4 5.0 3.6 0.2 Setosa\n",
|
||
".. ... ... ... ...\n",
|
||
"145 6.7 3.0 2.3 Virginica\n",
|
||
"146 6.3 2.5 1.9 Virginica\n",
|
||
"147 6.5 3.0 2.0 Virginica\n",
|
||
"148 6.2 3.4 2.3 Virginica\n",
|
||
"149 5.9 3.0 1.8 Virginica\n",
|
||
"\n",
|
||
"[150 rows x 4 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Delete/Drop petal.length column\n",
|
||
"y = df.drop([\"petal.length\"], axis=1) # axis=1 column. For row, axis=0\n",
|
||
"print(y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "dcccff24-c97e-4832-9cc1-cd701a3a9a34",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" sepal.length sepal.width petal.length petal.width variety\n",
|
||
"0 5.1 3.5 1.4 0.2 0\n",
|
||
"1 4.9 3.0 1.4 0.2 0\n",
|
||
"2 4.7 3.2 1.3 0.2 0\n",
|
||
"3 4.6 3.1 1.5 0.2 0\n",
|
||
"4 5.0 3.6 1.4 0.2 0\n",
|
||
".. ... ... ... ... ...\n",
|
||
"145 6.7 3.0 5.2 2.3 1\n",
|
||
"146 6.3 2.5 5.0 1.9 1\n",
|
||
"147 6.5 3.0 5.2 2.0 1\n",
|
||
"148 6.2 3.4 5.4 2.3 1\n",
|
||
"149 5.9 3.0 5.1 1.8 1\n",
|
||
"\n",
|
||
"[150 rows x 5 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# In variety column, replace Setosa with 0 and Virginica with 1\n",
|
||
"df['variety'].replace(['Setosa', 'Virginica'], [0,1], inplace=True)\n",
|
||
"print(df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "97d22793-657e-4df7-a7f1-01d70886ef57",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"sepal.length 0\n",
|
||
"sepal.width 0\n",
|
||
"petal.length 0\n",
|
||
"petal.width 0\n",
|
||
"variety 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Print sum of NULL values in each column\n",
|
||
"df.isnull().sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e3f0c5c4-a930-4dbc-8300-99fb7cb7c991",
|
||
"metadata": {},
|
||
"source": [
|
||
"---"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.20"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|