Files
DataScienceAndBigDataAnalytics/Notebooks/Notebook-A1 (Data Wrangling-1).ipynb

572 lines
17 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "5ed97617-c927-48b4-857c-c4d75b98fe2c",
"metadata": {},
"source": [
"# Notebook-A1 (Data Wrangling-1)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a93305d3-03c9-479e-bde3-fb0d06ea8d39",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8bf62160-0384-4a3f-b25e-11e28c8b8df4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal.length</th>\n",
" <th>sepal.width</th>\n",
" <th>petal.length</th>\n",
" <th>petal.width</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>150.000000</td>\n",
" <td>150.000000</td>\n",
" <td>150.000000</td>\n",
" <td>150.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>5.843333</td>\n",
" <td>3.057333</td>\n",
" <td>3.758000</td>\n",
" <td>1.199333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.828066</td>\n",
" <td>0.435866</td>\n",
" <td>1.765298</td>\n",
" <td>0.762238</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>4.300000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.100000</td>\n",
" <td>2.800000</td>\n",
" <td>1.600000</td>\n",
" <td>0.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>5.800000</td>\n",
" <td>3.000000</td>\n",
" <td>4.350000</td>\n",
" <td>1.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>6.400000</td>\n",
" <td>3.300000</td>\n",
" <td>5.100000</td>\n",
" <td>1.800000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>7.900000</td>\n",
" <td>4.400000</td>\n",
" <td>6.900000</td>\n",
" <td>2.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sepal.length sepal.width petal.length petal.width\n",
"count 150.000000 150.000000 150.000000 150.000000\n",
"mean 5.843333 3.057333 3.758000 1.199333\n",
"std 0.828066 0.435866 1.765298 0.762238\n",
"min 4.300000 2.000000 1.000000 0.100000\n",
"25% 5.100000 2.800000 1.600000 0.300000\n",
"50% 5.800000 3.000000 4.350000 1.300000\n",
"75% 6.400000 3.300000 5.100000 1.800000\n",
"max 7.900000 4.400000 6.900000 2.500000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the dataset into DataFrame\n",
"df=pd.read_csv('iris.csv')\n",
"df.describe() # Print description of DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7c813f91-5100-463a-a848-2ef9f46344bc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 values:\n",
" sepal.length sepal.width petal.length petal.width variety\n",
"0 5.1 3.5 1.4 0.2 Setosa\n",
"1 4.9 3.0 1.4 0.2 Setosa\n",
"2 4.7 3.2 1.3 0.2 Setosa\n",
"3 4.6 3.1 1.5 0.2 Setosa\n",
"4 5.0 3.6 1.4 0.2 Setosa\n",
"Last 5 values:\n",
" sepal.length sepal.width petal.length petal.width variety\n",
"145 6.7 3.0 5.2 2.3 Virginica\n",
"146 6.3 2.5 5.0 1.9 Virginica\n",
"147 6.5 3.0 5.2 2.0 Virginica\n",
"148 6.2 3.4 5.4 2.3 Virginica\n",
"149 5.9 3.0 5.1 1.8 Virginica\n"
]
}
],
"source": [
"# Print first and last 5 values\n",
"print(\"First 5 values:\\n\", df.head())\n",
"print (\"Last 5 values:\\n\", df.tail())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "78fa6d44-7c39-4306-a5d7-0fd5ff94243a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"145 False\n",
"146 False\n",
"147 False\n",
"148 False\n",
"149 False\n",
"Length: 150, dtype: bool"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Flag duplicate rows: the result is a boolean Series with one entry per row,\n",
"# where True marks a row that repeats an earlier row\n",
"df.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5da58b1d-c458-4eb5-b23c-053c02934efd",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal.length</th>\n",
" <th>sepal.width</th>\n",
" <th>petal.length</th>\n",
" <th>petal.width</th>\n",
" <th>variety</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>150 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" sepal.length sepal.width petal.length petal.width variety\n",
"0 False False False False False\n",
"1 False False False False False\n",
"2 False False False False False\n",
"3 False False False False False\n",
"4 False False False False False\n",
".. ... ... ... ... ...\n",
"145 False False False False False\n",
"146 False False False False False\n",
"147 False False False False False\n",
"148 False False False False False\n",
"149 False False False False False\n",
"\n",
"[150 rows x 5 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print null values true/false\n",
"df.isnull()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3b6face6-d366-4a05-9fd4-baff133e24f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 150 entries, 0 to 149\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sepal.length 150 non-null float64\n",
" 1 sepal.width 150 non-null float64\n",
" 2 petal.length 150 non-null float64\n",
" 3 petal.width 150 non-null float64\n",
" 4 variety 150 non-null object \n",
"dtypes: float64(4), object(1)\n",
"memory usage: 6.0+ KB\n"
]
}
],
"source": [
"# Print a concise summary of the DataFrame: index range, column names,\n",
"# non-null counts, dtypes and memory usage\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f08a00a4-c82a-49a9-bbaa-427d5cc4db96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150, 5)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the shape of the DataFrame as a (rows, columns) tuple\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "186866fe-7614-4a4d-b929-31bd37f80027",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"145 False\n",
"146 False\n",
"147 False\n",
"148 False\n",
"149 False\n",
"Name: sepal.length, Length: 150, dtype: bool"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print null (true/false) values in sepal.length column\n",
"df[\"sepal.length\"].isnull()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5bc1e082-9818-450d-b76d-5dd3a2103e00",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sepal.length sepal.width petal.width variety\n",
"0 5.1 3.5 0.2 Setosa\n",
"1 4.9 3.0 0.2 Setosa\n",
"2 4.7 3.2 0.2 Setosa\n",
"3 4.6 3.1 0.2 Setosa\n",
"4 5.0 3.6 0.2 Setosa\n",
".. ... ... ... ...\n",
"145 6.7 3.0 2.3 Virginica\n",
"146 6.3 2.5 1.9 Virginica\n",
"147 6.5 3.0 2.0 Virginica\n",
"148 6.2 3.4 2.3 Virginica\n",
"149 5.9 3.0 1.8 Virginica\n",
"\n",
"[150 rows x 4 columns]\n"
]
}
],
"source": [
"# Delete/Drop petal.length column\n",
"y = df.drop([\"petal.length\"], axis=1) # axis=1 column. For row, axis=0\n",
"print(y)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "dcccff24-c97e-4832-9cc1-cd701a3a9a34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sepal.length sepal.width petal.length petal.width variety\n",
"0 5.1 3.5 1.4 0.2 0\n",
"1 4.9 3.0 1.4 0.2 0\n",
"2 4.7 3.2 1.3 0.2 0\n",
"3 4.6 3.1 1.5 0.2 0\n",
"4 5.0 3.6 1.4 0.2 0\n",
".. ... ... ... ... ...\n",
"145 6.7 3.0 5.2 2.3 1\n",
"146 6.3 2.5 5.0 1.9 1\n",
"147 6.5 3.0 5.2 2.0 1\n",
"148 6.2 3.4 5.4 2.3 1\n",
"149 5.9 3.0 5.1 1.8 1\n",
"\n",
"[150 rows x 5 columns]\n"
]
}
],
"source": [
"# In variety column, replace Setosa with 0 and Virginica with 1\n",
"df['variety'].replace(['Setosa', 'Virginica'], [0,1], inplace=True)\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "97d22793-657e-4df7-a7f1-01d70886ef57",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sepal.length 0\n",
"sepal.width 0\n",
"petal.length 0\n",
"petal.width 0\n",
"variety 0\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print sum of NULL values in each column\n",
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"id": "e3f0c5c4-a930-4dbc-8300-99fb7cb7c991",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}