We will apply a wrapper feature-selection method, using forward search with decision trees as the model, to the breast cancer Wisconsin dataset. First we will split the dataset into training and validation datasets.
# Load the breast cancer Wisconsin data; "?" marks missing values.
data <- read.table("breast-cancer-wisconsin.data", na.strings = "?", sep = ",")
# Drop the first column (sample ID), which carries no predictive information.
data <- data[, -1]
feature_names <- c(
  "ClumpThickness", "UniformityCellSize", "UniformityCellShape",
  "MarginalAdhesion", "SingleEpithelialCellSize", "BareNuclei",
  "BlandChromatin", "NormalNucleoli", "Mitoses", "Class"
)
names(data) <- feature_names
# Recode the class labels (2 = benign, 4 = malignant) as a factor.
data$Class <- factor(data$Class, levels = c(2, 4), labels = c("benign", "malignant"))
# Reproducible 70/30 split into training and validation sets.
set.seed(1234)
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
trainData <- data[ind == 1, ]
validationData <- data[ind == 2, ]
# Remove cases with missing data.
trainData <- trainData[complete.cases(trainData), ]
validationData <- validationData[complete.cases(validationData), ]
We will use the forward.search method of the FSelector package and the rpart decision-tree training method of the package of the same name. The forward.search method needs an evaluation function that evaluates a subset of attributes. In our case the evaluator function performs 5-fold cross-validation on the training dataset.
library(FSelector)
library(rpart)
# Evaluate a candidate attribute subset for forward.search().
#
# Performs 5-fold cross-validation on the global `trainData`: an rpart
# decision tree is fit on each training fold and scored on the held-out
# fold. Returns the mean accuracy (1 - error rate) across folds; higher
# is better. Prints the subset and its score as a progress trace.
evaluator <- function(subset) {
  # k-fold cross validation
  k <- 5
  # Each row gets a uniform draw in [0, 1); rows falling in
  # [(i - 1) / k, i / k) form test fold i. Fold sizes vary slightly,
  # but the folds partition the data at random.
  splits <- runif(nrow(trainData))
  results <- vapply(seq_len(k), function(i) {
    test.idx <- (splits >= (i - 1) / k) & (splits < i / k)
    train.idx <- !test.idx
    test <- trainData[test.idx, , drop = FALSE]
    train <- trainData[train.idx, , drop = FALSE]
    tree <- rpart(as.simple.formula(subset, "Class"), train)
    # type = "class" spelled out (the original's "c" relied on fragile
    # partial argument matching).
    error.rate <- sum(test$Class != predict(tree, test, type = "class")) / nrow(test)
    1 - error.rate
  }, numeric(1))
  print(subset)
  print(mean(results))
  mean(results)
}
# Greedy forward search over all predictors (column 10 is the Class label),
# scoring each candidate subset with the cross-validating evaluator above.
subset <- forward.search(names(trainData)[-10], evaluator)
## [1] "ClumpThickness"
## [1] 0.8535402
## [1] "UniformityCellSize"
## [1] 0.8982188
## [1] "UniformityCellShape"
## [1] 0.9132406
## [1] "MarginalAdhesion"
## [1] 0.8564007
## [1] "SingleEpithelialCellSize"
## [1] 0.9055005
## [1] "BareNuclei"
## [1] 0.9079083
## [1] "BlandChromatin"
## [1] 0.9027608
## [1] "NormalNucleoli"
## [1] 0.895089
## [1] "Mitoses"
## [1] 0.7942549
## [1] "ClumpThickness" "UniformityCellShape"
## [1] 0.9297849
## [1] "UniformityCellSize" "UniformityCellShape"
## [1] 0.9335359
## [1] "UniformityCellShape" "MarginalAdhesion"
## [1] 0.9167509
## [1] "UniformityCellShape" "SingleEpithelialCellSize"
## [1] 0.9404055
## [1] "UniformityCellShape" "BareNuclei"
## [1] 0.9384218
## [1] "UniformityCellShape" "BlandChromatin"
## [1] 0.9283328
## [1] "UniformityCellShape" "NormalNucleoli"
## [1] 0.944158
## [1] "UniformityCellShape" "Mitoses"
## [1] 0.9162663
## [1] "ClumpThickness" "UniformityCellShape" "NormalNucleoli"
## [1] 0.9405466
## [1] "UniformityCellSize" "UniformityCellShape" "NormalNucleoli"
## [1] 0.9322648
## [1] "UniformityCellShape" "MarginalAdhesion" "NormalNucleoli"
## [1] 0.9387149
## [1] "UniformityCellShape" "SingleEpithelialCellSize"
## [3] "NormalNucleoli"
## [1] 0.9314805
## [1] "UniformityCellShape" "BareNuclei" "NormalNucleoli"
## [1] 0.9470806
## [1] "UniformityCellShape" "BlandChromatin" "NormalNucleoli"
## [1] 0.9267064
## [1] "UniformityCellShape" "NormalNucleoli" "Mitoses"
## [1] 0.9401233
## [1] "ClumpThickness" "UniformityCellShape" "BareNuclei"
## [4] "NormalNucleoli"
## [1] 0.9335856
## [1] "UniformityCellSize" "UniformityCellShape" "BareNuclei"
## [4] "NormalNucleoli"
## [1] 0.9498017
## [1] "UniformityCellShape" "MarginalAdhesion" "BareNuclei"
## [4] "NormalNucleoli"
## [1] 0.9418352
## [1] "UniformityCellShape" "SingleEpithelialCellSize"
## [3] "BareNuclei" "NormalNucleoli"
## [1] 0.9415623
## [1] "UniformityCellShape" "BareNuclei" "BlandChromatin"
## [4] "NormalNucleoli"
## [1] 0.9506024
## [1] "UniformityCellShape" "BareNuclei" "NormalNucleoli"
## [4] "Mitoses"
## [1] 0.9382617
## [1] "ClumpThickness" "UniformityCellShape" "BareNuclei"
## [4] "BlandChromatin" "NormalNucleoli"
## [1] 0.9515459
## [1] "UniformityCellSize" "UniformityCellShape" "BareNuclei"
## [4] "BlandChromatin" "NormalNucleoli"
## [1] 0.930654
## [1] "UniformityCellShape" "MarginalAdhesion" "BareNuclei"
## [4] "BlandChromatin" "NormalNucleoli"
## [1] 0.9454745
## [1] "UniformityCellShape" "SingleEpithelialCellSize"
## [3] "BareNuclei" "BlandChromatin"
## [5] "NormalNucleoli"
## [1] 0.9385037
## [1] "UniformityCellShape" "BareNuclei" "BlandChromatin"
## [4] "NormalNucleoli" "Mitoses"
## [1] 0.935793
## [1] "ClumpThickness" "UniformityCellSize" "UniformityCellShape"
## [4] "BareNuclei" "BlandChromatin" "NormalNucleoli"
## [1] 0.9460072
## [1] "ClumpThickness" "UniformityCellShape" "MarginalAdhesion"
## [4] "BareNuclei" "BlandChromatin" "NormalNucleoli"
## [1] 0.9494458
## [1] "ClumpThickness" "UniformityCellShape"
## [3] "SingleEpithelialCellSize" "BareNuclei"
## [5] "BlandChromatin" "NormalNucleoli"
## [1] 0.9506376
## [1] "ClumpThickness" "UniformityCellShape" "BareNuclei"
## [4] "BlandChromatin" "NormalNucleoli" "Mitoses"
## [1] 0.939423
After the search we get the following formula, in which 5 of the 9 variables were kept.
# Turn the selected attribute subset into a modelling formula of the
# form Class ~ attr1 + attr2 + ... and display it.
f <- as.simple.formula(subset, "Class")
print(f)
## Class ~ ClumpThickness + UniformityCellShape + BareNuclei + BlandChromatin +
## NormalNucleoli
## <environment: 0x132db8c38>
We can use the Naive Bayes algorithm to evaluate the forward selection algorithm both in the training and the validation datasets under the accuracy metric.
library(e1071)
# Fit Naive Bayes twice on the training data: once with all attributes,
# once with only the forward-search-selected attributes (formula f).
# Laplace smoothing avoids zero conditional probabilities.
model <- naiveBayes(Class ~ ., data=trainData, laplace = 1)
simpler_model <- naiveBayes(f, data=trainData, laplace = 1)
# Predictions on the held-out validation set for both models.
pred <- predict(model, validationData)
simpler_pred <- predict(simpler_model, validationData)
library(MLmetrics)
# Predictions on the training set itself, to compare train vs. validation accuracy.
train_pred <- predict(model, trainData)
train_simpler_pred <- predict(simpler_model, trainData)
paste("Accuracy in training all attributes",
Accuracy(train_pred, trainData$Class), sep=" - ")
## [1] "Accuracy in training all attributes - 0.957805907172996"
paste("Accuracy in training forward search attributes",
Accuracy(train_simpler_pred, trainData$Class), sep=" - ")
## [1] "Accuracy in training forward search attributes - 0.953586497890295"
paste("Accuracy in validation all attributes",
Accuracy(pred, validationData$Class), sep=" - ")
## [1] "Accuracy in validation all attributes - 0.976076555023923"
paste("Accuracy in validation forward search attributes",
Accuracy(simpler_pred, validationData$Class), sep=" - ")
## [1] "Accuracy in validation forward search attributes - 0.971291866028708"
In the breast cancer Wisconsin dataset, the feature selection algorithm did not outperform the use of all attributes. The likely cause is that these 9 attributes were handpicked by domain experts and together do indeed have predictive power. So removing some of them does not produce better results.
One can also replace the forward.search method with backward.search to perform the backward-search wrapper selection method. In fact, in the case of backward.search, all attributes are kept.