Exploratory Data Analysis
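Note: this section assumes the required packages were already loaded in an earlier setup chunk (not shown here). Based on the functions used below, a likely setup block is:
library(dplyr)          #group_by, summarize
library(knitr)          #kable
library(ggplot2)        #ggplot
library(GGally)         #ggpairs
library(scatterplot3d)  #scatterplot3d
library(gridExtra)      #grid.arrange
library(rpart)          #decision trees
library(rpart.plot)     #prp
library(caret)          #confusionMatrix, train
library(class)          #knn
library(sparcl)         #ColorDendrogram
library(klaR)           #backend for caret's 'nb' method
library(e1071)          #svm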
dim(iris) #Checking dimensions: the data has 150 observations & 6 columns (4 flower features, Species, and an Id column)
## [1] 150 6
Checking the count of each species
table(iris$Species) #Each species has an equal count of 50 observations
##
## Iris-setosa Iris-versicolor Iris-virginica
## 50 50 50
Checking the iris dataset summary
summary(iris) #There are no missing values in the data
## Id SepalLengthCm SepalWidthCm PetalLengthCm
## Min. : 1.00 Min. :4.300 Min. :2.000 Min. :1.000
## 1st Qu.: 38.25 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600
## Median : 75.50 Median :5.800 Median :3.000 Median :4.350
## Mean : 75.50 Mean :5.843 Mean :3.054 Mean :3.759
## 3rd Qu.:112.75 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100
## Max. :150.00 Max. :7.900 Max. :4.400 Max. :6.900
## PetalWidthCm Species
## Min. :0.100 Iris-setosa :50
## 1st Qu.:0.300 Iris-versicolor:50
## Median :1.300 Iris-virginica :50
## Mean :1.199
## 3rd Qu.:1.800
## Max. :2.500
Checking the iris dataset structure
str(iris) #The data has 4 numeric flower features and a Species factor with 3 levels: setosa, versicolor, virginica
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "Iris-setosa",..: 1 1 1 1 1 1 1 1 1 1 ...
Checking for outliers
boxplot(iris$SepalLengthCm, iris$SepalWidthCm, iris$PetalLengthCm, iris$PetalWidthCm,
        names = c("Sepal Length", "Sepal Width", "Petal Length", "Petal Width"))
Checking the outliers in Sepal Width (Id on the y-axis just spreads the points out):
ggplot(iris, aes(x=SepalWidthCm, y=Id, color=Species)) + geom_point()
Average length & width by species
temp_df <- iris %>%
  group_by(Species) %>%
  summarize(mean(SepalLengthCm), mean(SepalWidthCm), mean(PetalLengthCm), mean(PetalWidthCm))
kable(temp_df, align = 'c',
      col.names = c('Species','Avg Sepal Length','Avg Sepal Width','Avg Petal Length','Avg Petal Width'))
| Species         | Avg Sepal Length | Avg Sepal Width | Avg Petal Length | Avg Petal Width |
|:---------------:|:----------------:|:---------------:|:----------------:|:---------------:|
| Iris-setosa     | 5.006            | 3.418           | 1.464            | 0.244           |
| Iris-versicolor | 5.936            | 2.770           | 4.260            | 1.326           |
| Iris-virginica  | 6.588            | 2.974           | 5.552            | 2.026           |
Scatter plot between Sepal Length & Sepal Width:
#plot(iris$SepalLengthCm,iris$SepalWidthCm)
ggplot(iris, aes(x=SepalLengthCm, y=SepalWidthCm, color=Species)) + geom_point() + labs(title="Scatterplot", x="Sepal Length", y="Sepal Width")
Scatter plot between Petal Length & Petal Width:
ggplot(iris, aes(x=PetalLengthCm, y=PetalWidthCm, color=Species)) + geom_point() + labs(title="Scatterplot", x="Petal Length", y="Petal Width")
Plotting all numeric features
ggpairs(data=iris, columns=2:5, title="Iris Flower Features", mapping=aes(colour=Species))
Including density in the plots
ggpairs(data=iris,
        columns=2:5,
        upper = list(continuous = "density"),
        lower = list(combo = "facetdensity"),
        mapping = aes(colour = Species))
pairs(iris[,2:5], col=iris$Species) #Using base pairs to get distinct colours for each species
3D visualization of three features, colored by species:
colors <- c("#BB0000", "#00FFFF", "#7FFF00")
colors <- colors[as.numeric(iris$Species)]
scatterplot3d(iris[,2:4], pch=20, color=colors, grid=TRUE, box=FALSE, angle=80, xlab="Sepal Length", ylab="Sepal Width", zlab="Petal Length")
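scatterplot3d does not draw a legend by default; a small optional follow-up call (not in the original) maps the colors back to species:
legend("topright", legend = levels(iris$Species),
       col = c("#BB0000", "#00FFFF", "#7FFF00"), pch = 20) #Add a species legend to the 3D plot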
Sepal dimension variation across the dataset
p1<-ggplot(iris, aes(x=Id, y=SepalWidthCm, color=Species)) + geom_point() + geom_smooth()
p2<-ggplot(iris, aes(x=Id, y=SepalLengthCm, color=Species)) + geom_point() + geom_smooth()
grid.arrange(p1, p2, nrow=2)
Petal dimension variation across the dataset
p3<-ggplot(iris, aes(x=Id, y=PetalWidthCm, color=Species)) + geom_point() + geom_smooth()
p4<-ggplot(iris, aes(x=Id, y=PetalLengthCm, color=Species)) + geom_point() + geom_smooth()
grid.arrange(p3, p4, nrow=2)
Checking the Pearson correlation heat map among the numeric features (the plotting chunk is hidden in the Rmd source):
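A minimal sketch of an equivalent heatmap (an assumption, using ggplot2 plus reshape2) would be:
cor_mat <- cor(iris[,2:5]) #Pearson correlation of the four numeric features
cor_long <- reshape2::melt(cor_mat) #Long format with columns Var1, Var2, value
ggplot(cor_long, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  geom_text(aes(label=round(value, 2))) +
  scale_fill_gradient2(low="blue", mid="white", high="red", limits=c(-1, 1)) +
  labs(title="Pearson Correlation Heatmap", x=NULL, y=NULL)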
Testing different machine learning algorithms on the iris dataset
1. Applying k-means clustering
Removing Id as it's not required for the analysis
iris <- iris[,-c(1)]
iris_subset <- iris #Creating a data subset for clustering
clus1 <- kmeans(iris_subset[,-c(5)], centers = 3, iter.max = 50, nstart = 50)
iris_subset$ClusterID <- as.factor(clus1$cluster) #Attach cluster assignments as a factor column
ggplot(iris_subset, aes(x=PetalLengthCm, y=PetalWidthCm, color=ClusterID)) + geom_point()
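To see how well the clusters line up with the actual species, a quick cross-tabulation (a small check, not in the original) helps:
table(iris_subset$ClusterID, iris_subset$Species) #Rows: cluster IDs, columns: species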
2. Applying Decision Trees
iris_subset <- iris #Creating a data subset for classification
#Dividing the data into train & test samples (80/20 split)
set.seed(123)
split.indices <- sample(nrow(iris_subset), nrow(iris_subset)*0.8, replace = F)
train <- iris_subset[split.indices, ]
test <- iris_subset[-split.indices, ]
tree.model <- rpart(Species ~ ., data = train, method = "class", parms = list(split = "information"))
prp(tree.model)
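To inspect the fitted tree in text form, two optional checks (not in the original) are:
print(tree.model) #Split rules with node counts
tree.model$variable.importance #Relative importance of each feature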
Making predictions on the test set using the decision tree model:
tree.predict <- predict(tree.model, test[,-c(5)], type = "class")
#Note: caret's confusionMatrix() expects (predictions, reference); the arguments below are
#reversed, so the "Prediction"/"Reference" labels in the output are effectively swapped
confusionMatrix(test$Species, tree.predict)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 8 0 0
## Iris-versicolor 0 9 0
## Iris-virginica 0 1 12
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 5.303e-11
##
## Kappa : 0.9492
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Iris-setosa Class: Iris-versicolor
## Sensitivity 1.0000 0.9000
## Specificity 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000
## Neg Pred Value 1.0000 0.9524
## Prevalence 0.2667 0.3333
## Detection Rate 0.2667 0.3000
## Detection Prevalence 0.2667 0.3000
## Balanced Accuracy 1.0000 0.9500
## Class: Iris-virginica
## Sensitivity 1.0000
## Specificity 0.9444
## Pos Pred Value 0.9231
## Neg Pred Value 1.0000
## Prevalence 0.4000
## Detection Rate 0.4000
## Detection Prevalence 0.4333
## Balanced Accuracy 0.9722
3. Applying Hierarchical Clustering
iris_subset <- iris #Creating a data subset for clustering
iris_dist <- dist(iris_subset[,-c(5)]) #Calculating the distance matrix for the features
iris_hclust1 <- hclust(iris_dist, method="complete")
plot(iris_hclust1)
Colored marking of the 3 main clusters
clus_cut<-cutree(iris_hclust1, 3)
ColorDendrogram(iris_hclust1, y=clus_cut, labels = names(clus_cut), branchlength = 80)
Getting the 3 main clusters & plotting them:
clusterCut <- cutree(iris_hclust1, k=3)
iris_subset <- iris
iris_subset$ClusterID <- as.factor(clusterCut)
ggplot(iris_subset, aes(x=PetalLengthCm, y=PetalWidthCm, color=ClusterID)) + geom_point()
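The distances above are computed on raw centimetre values, which is reasonable here since all four features share the same unit; with mixed units one would typically standardize first. A hedged variant:
iris_dist_scaled <- dist(scale(iris_subset[,1:4])) #Standardize features before computing distances
iris_hclust2 <- hclust(iris_dist_scaled, method="complete")
plot(iris_hclust2)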
4. Applying K Nearest Neighbor Algorithm
iris_knn_pred <- knn(train = train[,-c(5)], test = test[,-c(5)], cl = train$Species, k = 3, prob = TRUE)
confusionMatrix(test$Species, iris_knn_pred) #Same argument-order caveat as above
## Confusion Matrix and Statistics
##
## Reference
## Prediction Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 8 0 0
## Iris-versicolor 0 9 0
## Iris-virginica 0 1 12
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 5.303e-11
##
## Kappa : 0.9492
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Iris-setosa Class: Iris-versicolor
## Sensitivity 1.0000 0.9000
## Specificity 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000
## Neg Pred Value 1.0000 0.9524
## Prevalence 0.2667 0.3333
## Detection Rate 0.2667 0.3000
## Detection Prevalence 0.2667 0.3000
## Balanced Accuracy 1.0000 0.9500
## Class: Iris-virginica
## Sensitivity 1.0000
## Specificity 0.9444
## Pos Pred Value 0.9231
## Neg Pred Value 1.0000
## Prevalence 0.4000
## Detection Rate 0.4000
## Detection Prevalence 0.4333
## Balanced Accuracy 0.9722
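The choice of k = 3 above was arbitrary; a quick sweep over a few values of k (a minimal sketch using the same train/test split) can sanity-check it:
for (k in c(1, 3, 5, 7, 9)) {
  pred <- knn(train = train[,-c(5)], test = test[,-c(5)], cl = train$Species, k = k)
  cat("k =", k, "accuracy =", mean(pred == test$Species), "\n") #Test accuracy for each k
}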
5. Applying Naive Bayes Algorithm
iris_nb <- train(iris[,-c(5)], iris$Species, 'nb', trControl = trainControl(method = 'cv', number = 10))
iris_nb
## Naive Bayes
##
## 150 samples
## 4 predictor
## 3 classes: 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9466667 0.92
## TRUE 0.9533333 0.93
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE
## and adjust = 1.
#Note: iris_nb was trained on the full dataset, so these test rows were seen during
#training and the table below is somewhat optimistic
table(predict(iris_nb$finalModel, test[,-c(5)])$class, test$Species)
##
## Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 8 0 0
## Iris-versicolor 0 9 1
## Iris-virginica 0 0 12
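A variant that avoids this train/test overlap (a hedged sketch, fitting only on the training split) would be:
iris_nb2 <- train(train[,-c(5)], train$Species, 'nb', trControl = trainControl(method = 'cv', number = 10)) #Fit on training rows only
table(predict(iris_nb2$finalModel, test[,-c(5)])$class, test$Species) #Evaluate on genuinely held-out rows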
6. Applying Support Vector Machine Algorithm
svm_model <- svm(Species ~ ., data=train)
summary(svm_model)
##
## Call:
## svm(formula = Species ~ ., data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 47
##
## ( 7 19 21 )
##
##
## Number of Classes: 3
##
## Levels:
## Iris-setosa Iris-versicolor Iris-virginica
Making predictions on the test data:
svm_pred <- predict(svm_model, test[,-c(5)])
table(svm_pred, test[,c(5)])
##
## svm_pred Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 8 0 0
## Iris-versicolor 0 9 1
## Iris-virginica 0 0 12
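The radial kernel's cost and gamma were left at their defaults above; e1071's tune() offers a simple grid search (an optional sketch):
tune_out <- tune(svm, Species ~ ., data = train,
                 ranges = list(cost = c(0.1, 1, 10), gamma = c(0.1, 0.25, 0.5))) #Small grid search over cost & gamma
summary(tune_out)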
Thanks for reading. I am a novice in data analysis and still learning, so please share your valuable feedback on errors and possible improvements.