{ "cells": [ { "cell_type": "markdown", "id": "5ed97617-c927-48b4-857c-c4d75b98fe2c", "metadata": {}, "source": [ "# Notebook-A1 (Data Wrangling-1)" ] }, { "cell_type": "code", "execution_count": 1, "id": "a93305d3-03c9-479e-bde3-fb0d06ea8d39", "metadata": {}, "outputs": [], "source": [ "# Import libraries\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "id": "8bf62160-0384-4a3f-b25e-11e28c8b8df4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal.lengthsepal.widthpetal.lengthpetal.width
count150.000000150.000000150.000000150.000000
mean5.8433333.0573333.7580001.199333
std0.8280660.4358661.7652980.762238
min4.3000002.0000001.0000000.100000
25%5.1000002.8000001.6000000.300000
50%5.8000003.0000004.3500001.300000
75%6.4000003.3000005.1000001.800000
max7.9000004.4000006.9000002.500000
\n", "
" ], "text/plain": [ " sepal.length sepal.width petal.length petal.width\n", "count 150.000000 150.000000 150.000000 150.000000\n", "mean 5.843333 3.057333 3.758000 1.199333\n", "std 0.828066 0.435866 1.765298 0.762238\n", "min 4.300000 2.000000 1.000000 0.100000\n", "25% 5.100000 2.800000 1.600000 0.300000\n", "50% 5.800000 3.000000 4.350000 1.300000\n", "75% 6.400000 3.300000 5.100000 1.800000\n", "max 7.900000 4.400000 6.900000 2.500000" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read iris.csv into a DataFrame, then show summary statistics of its numeric columns\n", "df = pd.read_csv('iris.csv')\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 4, "id": "7c813f91-5100-463a-a848-2ef9f46344bc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "First 5 values:\n", " sepal.length sepal.width petal.length petal.width variety\n", "0 5.1 3.5 1.4 0.2 Setosa\n", "1 4.9 3.0 1.4 0.2 Setosa\n", "2 4.7 3.2 1.3 0.2 Setosa\n", "3 4.6 3.1 1.5 0.2 Setosa\n", "4 5.0 3.6 1.4 0.2 Setosa\n", "Last 5 values:\n", " sepal.length sepal.width petal.length petal.width variety\n", "145 6.7 3.0 5.2 2.3 Virginica\n", "146 6.3 2.5 5.0 1.9 Virginica\n", "147 6.5 3.0 5.2 2.0 Virginica\n", "148 6.2 3.4 5.4 2.3 Virginica\n", "149 5.9 3.0 5.1 1.8 Virginica\n" ] } ], "source": [ "# Show the first five and the last five rows (n=5 is also the head/tail default)\n", "print(\"First 5 values:\\n\", df.head(n=5))\n", "print(\"Last 5 values:\\n\", df.tail(n=5))" ] }, { "cell_type": "code", "execution_count": 5, "id": "78fa6d44-7c39-4306-a5d7-0fd5ff94243a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", " ...
\n", "145 False\n", "146 False\n", "147 False\n", "148 False\n", "149 False\n", "Length: 150, dtype: bool" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Boolean mask over rows: True where a row repeats an earlier row (the first occurrence stays False)\n", "df.duplicated(keep='first')" ] }, { "cell_type": "code", "execution_count": 6, "id": "5da58b1d-c458-4eb5-b23c-053c02934efd", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal.lengthsepal.widthpetal.lengthpetal.widthvariety
0FalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalse
4FalseFalseFalseFalseFalse
..................
145FalseFalseFalseFalseFalse
146FalseFalseFalseFalseFalse
147FalseFalseFalseFalseFalse
148FalseFalseFalseFalseFalse
149FalseFalseFalseFalseFalse
\n", "

150 rows × 5 columns

\n", "
" ], "text/plain": [ " sepal.length sepal.width petal.length petal.width variety\n", "0 False False False False False\n", "1 False False False False False\n", "2 False False False False False\n", "3 False False False False False\n", "4 False False False False False\n", ".. ... ... ... ... ...\n", "145 False False False False False\n", "146 False False False False False\n", "147 False False False False False\n", "148 False False False False False\n", "149 False False False False False\n", "\n", "[150 rows x 5 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Element-wise missing-value mask: True wherever a cell is null\n", "df.isna()" ] }, { "cell_type": "code", "execution_count": 7, "id": "3b6face6-d366-4a05-9fd4-baff133e24f6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 150 entries, 0 to 149\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sepal.length 150 non-null float64\n", " 1 sepal.width 150 non-null float64\n", " 2 petal.length 150 non-null float64\n", " 3 petal.width 150 non-null float64\n", " 4 variety 150 non-null object \n", "dtypes: float64(4), object(1)\n", "memory usage: 6.0+ KB\n" ] } ], "source": [ "# Column names, non-null counts, dtypes and memory usage of the DataFrame\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 11, "id": "f08a00a4-c82a-49a9-bbaa-427d5cc4db96", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(150, 5)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Shape of the DataFrame as a (rows, columns) tuple\n", "df.shape" ] }, { "cell_type": "code", "execution_count": 12, "id": "186866fe-7614-4a4d-b929-31bd37f80027", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", " ...
\n", "145 False\n", "146 False\n", "147 False\n", "148 False\n", "149 False\n", "Name: sepal.length, Length: 150, dtype: bool" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Missing-value mask for the sepal.length column only\n", "df[\"sepal.length\"].isna()" ] }, { "cell_type": "code", "execution_count": 17, "id": "5bc1e082-9818-450d-b76d-5dd3a2103e00", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sepal.length sepal.width petal.width variety\n", "0 5.1 3.5 0.2 Setosa\n", "1 4.9 3.0 0.2 Setosa\n", "2 4.7 3.2 0.2 Setosa\n", "3 4.6 3.1 0.2 Setosa\n", "4 5.0 3.6 0.2 Setosa\n", ".. ... ... ... ...\n", "145 6.7 3.0 2.3 Virginica\n", "146 6.3 2.5 1.9 Virginica\n", "147 6.5 3.0 2.0 Virginica\n", "148 6.2 3.4 2.3 Virginica\n", "149 5.9 3.0 1.8 Virginica\n", "\n", "[150 rows x 4 columns]\n" ] } ], "source": [ "# Build a new frame without the petal.length column; df itself is not modified.\n", "# columns=[...] is the keyword form of drop([...], axis=1); use index=[...] (axis=0) to drop rows.\n", "y = df.drop(columns=[\"petal.length\"])\n", "print(y)" ] }, { "cell_type": "code", "execution_count": 19, "id": "dcccff24-c97e-4832-9cc1-cd701a3a9a34", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sepal.length sepal.width petal.length petal.width variety\n", "0 5.1 3.5 1.4 0.2 0\n", "1 4.9 3.0 1.4 0.2 0\n", "2 4.7 3.2 1.3 0.2 0\n", "3 4.6 3.1 1.5 0.2 0\n", "4 5.0 3.6 1.4 0.2 0\n", ".. ... ... ... ...
...\n", "145 6.7 3.0 5.2 2.3 1\n", "146 6.3 2.5 5.0 1.9 1\n", "147 6.5 3.0 5.2 2.0 1\n", "148 6.2 3.4 5.4 2.3 1\n", "149 5.9 3.0 5.1 1.8 1\n", "\n", "[150 rows x 5 columns]\n" ] } ], "source": [ "# Encode the variety labels: Setosa -> 0 and Virginica -> 1; any other label is left unchanged.\n", "# The result is assigned back to the column rather than calling replace(..., inplace=True) on\n", "# df['variety']: that chained in-place form raises a FutureWarning in pandas 2.2 and no longer\n", "# updates df once Copy-on-Write is the default (pandas 3.0).\n", "df['variety'] = df['variety'].replace({'Setosa': 0, 'Virginica': 1})\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 20, "id": "97d22793-657e-4df7-a7f1-01d70886ef57", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sepal.length 0\n", "sepal.width 0\n", "petal.length 0\n", "petal.width 0\n", "variety 0\n", "dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count the missing (null) values in each column\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "id": "e3f0c5c4-a930-4dbc-8300-99fb7cb7c991", "metadata": {}, "source": [ "---" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.20" } }, "nbformat": 4, "nbformat_minor": 5 }