In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

Data analysis

In [2]:
# Read the Iris measurements; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv("iris.csv")

# Bare trailing expression -> Jupyter renders the frame (truncated preview).
data
Out[2]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
... ... ... ... ... ... ...
145 146 6.7 3.0 5.2 2.3 Iris-virginica
146 147 6.3 2.5 5.0 1.9 Iris-virginica
147 148 6.5 3.0 5.2 2.0 Iris-virginica
148 149 6.2 3.4 5.4 2.3 Iris-virginica
149 150 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 6 columns

In [3]:
# We have to predict the class label, i.e. Species (the dependent variable), from the 4 measurements
# (SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm)

Data cleaning

In [4]:
# Count how many rows (samples) belong to each Species label.
species_column = data["Species"]
species_column.value_counts()
Out[4]:
Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: Species, dtype: int64
In [5]:
# It's a balanced dataset: each species has 50 points
In [6]:
# Line chart (pandas' default plot kind) of sepal width against sepal length.
data.plot.line(x="SepalLengthCm", y="SepalWidthCm")

# Trailing expression: display the frame underneath the chart.
data
Out[6]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
... ... ... ... ... ... ...
145 146 6.7 3.0 5.2 2.3 Iris-virginica
146 147 6.3 2.5 5.0 1.9 Iris-virginica
147 148 6.5 3.0 5.2 2.0 Iris-virginica
148 149 6.2 3.4 5.4 2.3 Iris-virginica
149 150 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 6 columns

In [7]:
# Instead of using the default line graph, we use a scatter plot

Visualisation

In [8]:
# 2-D scatter of sepal length (x) against sepal width (y); the trailing ';'
# is kept from the original statement.
data.plot.scatter(x="SepalLengthCm", y="SepalWidthCm");

# Trailing expression: display the frame underneath the chart.
data
Out[8]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
... ... ... ... ... ... ...
145 146 6.7 3.0 5.2 2.3 Iris-virginica
146 147 6.3 2.5 5.0 1.9 Iris-virginica
147 148 6.5 3.0 5.2 2.0 Iris-virginica
148 149 6.2 3.4 5.4 2.3 Iris-virginica
149 150 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 6 columns

In [9]:
# We have 4 independent variables, so there are 4C2 = 6 distinct pairs to plot (4 = independent variables, 2 = the x and y axes)
# sl vs sw, sl vs pw, sl vs pl, sw vs pl, sw vs pw, pl vs pw

Pair plot

In [17]:
# Pair plot of the four measurement columns, with points coloured by Species.
# The Id column is deliberately left out of `vars`: it is a row counter, not a
# measurement.
FEATURE_COLUMNS = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

sns.pairplot(data, hue="Species", vars=FEATURE_COLUMNS)
plt.show()
In [11]:
# We get a much clearer picture in the 12th plot, i.e. (pl and pw)
In [12]:
# Plotting histogram, PDF, CDF, boxplot and violin plots for better understanding

Observations (in cm)

If PetalLength <= 2 and PetalWidth <= 1, the result is Iris Setosa.

If 3 <= PetalLength <= 5 and 1 <= PetalWidth <= 2, the result is Iris Versicolor.

If 5 <= PetalLength <= 7 and 1.5 <= PetalWidth <= 3, the result is Iris Virginica.

In [35]:
# Pair plot restricted to a single variable (petal width), coloured by Species.
sns.pairplot(
    data,
    vars=["PetalWidthCm"],
    hue="Species",
)
plt.show()

Data preprocessing

In [14]:
# Pairwise correlation between the numeric columns. The string-valued Species
# column is filtered out explicitly: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, whereas older releases dropped them silently.
data.select_dtypes(include="number").corr()
Out[14]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
Id 1.000000 0.716676 -0.397729 0.882747 0.899759
SepalLengthCm 0.716676 1.000000 -0.109369 0.871754 0.817954
SepalWidthCm -0.397729 -0.109369 1.000000 -0.420516 -0.356544
PetalLengthCm 0.882747 0.871754 -0.420516 1.000000 0.962757
PetalWidthCm 0.899759 0.817954 -0.356544 0.962757 1.000000
In [15]:
# Correlation matrix of the numeric columns only; selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() raises on
# the string-valued Species column instead of silently dropping it.
corr = data.select_dtypes(include="number").corr()

fig, axis = plt.subplots(figsize=(10, 10))

# Draw on the axes created above rather than relying on pyplot's implicit
# "current axes"; annot=True writes each coefficient inside its cell.
sns.heatmap(corr, annot=True, ax=axis)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f1d2688>

Observation: 0.96 (96%) is the highest correlation value, between PetalLengthCm and PetalWidthCm.

Machine Learning

In [16]:
# Label encoder is used to convert the dependent column, i.e. Species, into numeric values (a machine-readable form)
In [ ]: