The Iris dataset, collected by Edgar Anderson, an American botanist, contains measurements of sepal length, sepal width, petal length, and petal width for 150 Iris flowers (50 each from three species: setosa, virginica, and versicolor). Ronald Fisher, a British polymath who was active as a mathematician, statistician, biologist, geneticist, and academic, used this dataset to demonstrate linear discriminant analysis, a statistical technique to classify species based on their features.
Exhibit 25.59 demonstrates the analysis and visualization of the Iris data using Python’s Pandas and Matplotlib libraries.
import pandas as pd
# Load the Iris measurements from iris.csv into a DataFrame.
# The column labels are supplied through `names`.
iris = pd.read_csv(
    'data/iris.csv',
    names=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'class',
    ],
)
# Printing a long DataFrame shows a shortened view: the first and last five rows
print(iris)
sepal_length sepal_width petal_length petal_width class 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa .. ... ... ... ... ... 145 6.7 3.0 5.2 2.3 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica [150 rows x 5 columns]
# Inspect the dimensions of the dataset.
# DataFrame.shape is a tuple of the form (number of rows, number of columns).
dimensions = iris.shape
print(dimensions)
print(dimensions[0]) # the first entry is the number of rows
# Correlation examines the linear relationship between the data fields.
# The 'class' column is removed first; columns= is the same as axis=1.
iris_new = iris.drop(columns='class')
iris_new.corr() # correlation matrix of the remaining columns
(150, 5) 150
# Filter Data
# A comparison on a column gives a True/False mask; indexing the DataFrame
# with that mask keeps only the rows marked True.
petal_is_1_5 = iris['petal_length'] == 1.5
iris[petal_is_1_5] # rows where petal_length is 1.5
# Build one mask per species from the class column ...
is_setosa = iris['class'] == 'Iris-setosa'
is_versicolor = iris['class'] == 'Iris-versicolor'
is_virginica = iris['class'] == 'Iris-virginica'
# ... and use the masks to create iris_setosa, iris_versicolor and iris_virginica
iris_setosa = iris[is_setosa]
iris_versicolor = iris[is_versicolor]
iris_virginica = iris[is_virginica]
# head(n) returns the top n rows; without an argument it returns 5
print(iris_setosa.head(8))
print(iris_versicolor.head(10))
print(iris_virginica.head())
sepal_length sepal_width petal_length petal_width class 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa 5 5.4 3.9 1.7 0.4 Iris-setosa 6 4.6 3.4 1.4 0.3 Iris-setosa 7 5.0 3.4 1.5 0.2 Iris-setosa sepal_length sepal_width petal_length petal_width class 50 7.0 3.2 4.7 1.4 Iris-versicolor 51 6.4 3.2 4.5 1.5 Iris-versicolor 52 6.9 3.1 4.9 1.5 Iris-versicolor 53 5.5 2.3 4.0 1.3 Iris-versicolor 54 6.5 2.8 4.6 1.5 Iris-versicolor 55 5.7 2.8 4.5 1.3 Iris-versicolor 56 6.3 3.3 4.7 1.6 Iris-versicolor 57 4.9 2.4 3.3 1.0 Iris-versicolor 58 6.6 2.9 4.6 1.3 Iris-versicolor 59 5.2 2.7 3.9 1.4 Iris-versicolor sepal_length sepal_width petal_length petal_width class 100 6.3 3.3 6.0 2.5 Iris-virginica 101 5.8 2.7 5.1 1.9 Iris-virginica 102 7.1 3.0 5.9 2.1 Iris-virginica 103 6.3 2.9 5.6 1.8 Iris-virginica 104 6.5 3.0 5.8 2.2 Iris-virginica
# The default plot is a line chart of the DataFrame's numeric columns
ax = iris.plot() # assign the returned Axes to variable ax
# Use ax to add titles and labels
ax.set_title("Iris Dataset")
ax.set_xlabel("Data Points")
# The Iris measurements are recorded in centimetres (cm), not millimetres
ax.set_ylabel("Length/Width (cm)")
# Histogram of a single column
iris["sepal_length"].plot(kind = 'hist') # histogram
# Matplotlib — for Visualization
import matplotlib.pyplot as plt
# One figure holding a single row of three subplots
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
iris_setosa = iris[iris['class']=='Iris-setosa'] # filter iris Setosa
iris_versicolor = iris[iris['class']=='Iris-versicolor'] # filter iris versicolor
iris_virginica = iris[iris['class']=='Iris-virginica'] # filter iris virginica
# What goes on each subplot, left to right: (data, bar colour, title)
histogram_panels = [
    (iris_setosa, 'blue', 'Iris Setosa - Sepal Length'),
    (iris_versicolor, 'green', 'Iris Versicolor - Sepal Length'),
    (iris_virginica, 'orange', 'Iris Virginica - Sepal Length'),
]
# Draw one histogram per subplot.
# The alpha parameter adjusts the transparency of the elements: it takes
# values between 0 (completely transparent) and 1 (completely opaque).
for panel_ax, (subset, colour, title) in zip(axes, histogram_panels):
    panel_ax.hist(subset["sepal_length"], bins=20, color=colour, alpha=0.7)
    panel_ax.set_title(title)
# Only the leftmost subplot carries the y-axis label
axes[0].set_ylabel('Frequency')
# Adjust layout to prevent clipping of titles
plt.tight_layout()
# Show the plots
plt.show()
# creating a scatter plot
# fig: the figure. ax: a single Axes (subplot) - plt.subplots() called with
# no arguments creates exactly one subplot, so ax is not an array here
fig, ax = plt.subplots()
# scatter the sepal_length against the sepal_width
ax.scatter(iris['sepal_length'], iris['sepal_width'])
# set a title and labels
ax.set_title('Iris Dataset')
ax.set_xlabel('sepal_length')
ax.set_ylabel('sepal_width')
# Visual examination of the scatter plot reveals 2 or 3 clusters
# Subplots of the three species: setosa, virginica, and versicolor
# Two rows by three columns: axes[row, column] selects one subplot
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 5))
iris_setosa = iris[iris['class']=='Iris-setosa'] # filter iris Setosa
iris_versicolor = iris[iris['class']=='Iris-versicolor'] # filter iris versicolor
iris_virginica = iris[iris['class']=='Iris-virginica'] # filter iris virginica
# One entry per column: (data, histogram colour, histogram title, scatter title)
species_panels = [
    (iris_setosa, 'blue', 'Iris Setosa - Sepal Length', 'Iris Setosa'),
    (iris_versicolor, 'green', 'Iris Versicolor - Sepal Length', 'Iris Versicolor'),
    (iris_virginica, 'orange', 'Iris Virginica - Sepal Length', 'Iris Virginica'),
]
for col, (subset, colour, hist_title, scatter_title) in enumerate(species_panels):
    # First row of charts - histograms
    hist_ax = axes[0, col]
    hist_ax.hist(subset["sepal_length"], bins=20, color=colour, alpha=0.7)
    hist_ax.set_title(hist_title)
    # Second row of charts - scatter plots
    scatter_ax = axes[1, col]
    scatter_ax.scatter(subset["sepal_length"], subset["sepal_width"])
    scatter_ax.set_title(scatter_title)
    scatter_ax.set_xlabel('Sepal Length')
    scatter_ax.set_ylabel('Sepal Width')
# Only the first histogram carries the y-axis label
axes[0, 0].set_ylabel('Frequency')
# Adjust layout to prevent clipping of titles
plt.tight_layout()
# Show the plots
plt.show()
import numpy as np
# Keep the correlation matrix in the variable corr so it can be reused
corr = iris_new.corr()
print(corr)
# .values: returns only the numerical values, without any row or column
# labels, in the form of a NumPy array.
corr_matrix = corr.values
fig, ax = plt.subplots()
# create heatmap
# imshow is designed to display 2D arrays as images, where the color
# represents the data's magnitude - a good fit for a correlation matrix.
ax.imshow(corr_matrix)
sepal_length sepal_width petal_length petal_width sepal_length 1.000000 -0.109369 0.871754 0.817954 sepal_width -0.109369 1.000000 -0.420516 -0.356544 petal_length 0.871754 -0.420516 1.000000 0.962757 petal_width 0.817954 -0.356544 0.962757 1.000000
# set labels
'''
arange(): This is a function from the NumPy library that generates an array of evenly
spaced values within a specified range.
np.arange(start, stop, step): This is the basic syntax though you can omit the start
and step values to use their defaults (start=0 and step=1).
np.arange(len(corr.columns)): generates an array of integers starting from 0 up to
(but not including) len(corr.columns).
'''
ax.set_xticks(np.arange(len(corr.columns))) # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
ax.set_yticks(np.arange(len(corr.columns))) # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
ax.set_xticklabels(corr.columns) # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
ax.set_yticklabels(corr.columns) # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Loop over data dimensions to create text annotations.
# i is the row of the matrix (y position), j is the column (x position);
# the body of each loop must be indented under its `for` line.
for i in range(len(corr.columns)): # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
    for j in range(len(corr.columns)): # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        # np.around: round to 2 decimal places
        ax.text(j, i, np.around(corr.iloc[i, j], decimals=2),
                ha="center", va="center", color="black")
Use the Search Bar to find content on MarketingMind.
Contact | Privacy Statement | Disclaimer: Opinions and views expressed on www.ashokcharan.com are the author’s personal views, and do not represent the official views of the National University of Singapore (NUS) or the NUS Business School | © Copyright 2013-2024 www.ashokcharan.com. All Rights Reserved.