I’m pretty new to machine learning and wanted to try my hand at this competition. My model scores 90.6% accuracy, but I am unsure whether that figure is reliable. My computer cannot handle the entire data set without crashing (multinom doesn’t work on it), so I had to work with a shortened data set of 500 observations. While I don’t believe I’ve lost statistical power, I would like to know whether anyone else can copy this same code, remove the "nrow = " argument from the read.csv() calls, and achieve a comparable proportion of correctly predicted values. Thanks so much! Also, sorry for the ugly formatting of the code - I’m not sure how to integrate markdown into this discussion post.
# Setup ----
# The original `rm(list = ls())` is dropped on purpose: restart the R session
# for a clean workspace instead of wiping whatever the caller has loaded.

# Install each dependency only when it is missing. install.packages() returns
# NULL invisibly, so it cannot serve as the condition of an `if`;
# requireNamespace() is the call that reports whether a package is available.
for (pkg in c("car", "ggplot2", "nnet")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(nnet)
library(car)
library(ggplot2)
# Load the data ----
# Folder that holds the two competition CSV files; edit before running.
data_dir <- "your file path here"
# Number of rows to read from each file. 500 keeps memory use small; use -1
# (the read.csv() default) to read every row.
n_rows <- 500

# Training set values: the predictors (IVs) for each observation.
# file.path() joins folder and file name, so no backslash is needed -- inside
# an R string "\t" is a tab character, which corrupted the original path.
train1 <- read.csv(file.path(data_dir, "training set values.csv"),
                   nrows = n_rows)
head(train1, n = 5)

# Training set labels: the outcome (DV) for each observation in train1.
train2 <- read.csv(file.path(data_dir, "training set labels.csv"),
                   nrows = n_rows)
head(train2, n = 5)

# Join predictors and labels on the shared "id" column, then release the
# intermediate data frames.
# NOTE(review): with a row limit this keeps only ids present in the first
# n_rows of BOTH files -- assumes the files list ids in the same order; confirm.
train.df <- merge(train1, train2, by = "id")
rm(train1, train2)
# Funder / installer agreement ----
# Compare the two names as upper-case text so the match ignores letter case.
train.df$funder <- toupper(as.character(train.df$funder))
train.df$installer <- toupper(as.character(train.df$installer))

# Same organisation when the names are identical ...
same_name <- train.df$funder == train.df$installer
# ... or when the installer is a short form of the government funder ...
gov_alias <- train.df$funder == "GOVERNMENT OF TANZANIA" &
  (train.df$installer == "GOVER" | train.df$installer == "GOVERNMENT")
# ... or when the first five characters agree (spelling variants of one name).
# This last rule already covers the two cases above; they are kept to make the
# intent explicit.
same_prefix <- substring(train.df$funder, 1, 5) ==
  substring(train.df$installer, 1, 5)

# Store the flag as 1 (match) / 0 (no match); NA comparisons stay NA.
train.df$fun.ins.match <- as.numeric(same_name | gov_alias | same_prefix)
# Years in operation ----
# Keep characters 1-4 of date_recorded and read them as the recording year.
# NOTE(review): assumes dates are written year-first (YYYY-...) -- confirm.
train.df$date_recorded <- as.numeric(substring(train.df$date_recorded, 1, 4))

# The script treats a construction_year of 0 as "unknown" and replaces it with
# the median year. The median is taken over the known years only: including
# the zeros would pull it toward 0 (and to exactly 0 when they are the
# majority, which would leave the placeholders in place).
year_unknown <- !is.na(train.df$construction_year) &
  train.df$construction_year == 0
med1 <- median(train.df$construction_year[!year_unknown], na.rm = TRUE)
train.df$construction_year[year_unknown] <- med1

# Age of the water point at the time it was recorded.
train.df$yr.of.oper <- train.df$date_recorded - train.df$construction_year
# public_meeting as a 0/1 indicator ----
# Normalise to upper-case text first so the conversion works whether read.csv()
# delivered the column as logical (TRUE/FALSE) or as text ("True"/"False").
# Matching only the text spelling silently leaves a logical column unconverted.
meeting_txt <- toupper(as.character(train.df$public_meeting))
meeting_num <- rep(NA_real_, length(meeting_txt))
meeting_num[which(meeting_txt == "TRUE")] <- 1
meeting_num[which(meeting_txt == "FALSE")] <- 0

# Recode missing values to the median of the observed 0/1 values.
meeting_num[is.na(meeting_num)] <- median(meeting_num, na.rm = TRUE)
train.df$public_meeting <- meeting_num
# Region x district predictor ----
# One factor level per distinct (region_code, district_code) pair. It is built
# from the raw codes: the product of the squared codes used previously is
# symmetric, so e.g. region 2 / district 3 and region 3 / district 2 were
# merged into a single level.
train.df$region.district <- interaction(train.df$region_code,
                                        train.df$district_code,
                                        drop = TRUE)

# The squaring is retained only so that region_code and district_code hold the
# same values as they did before this fix.
train.df$region_code <- train.df$region_code^2
train.df$district_code <- train.df$district_code^2
# Categorical predictors ----
# permit as 1 (True) / 0 (False); anything else (e.g. blank) becomes NA.
# Upper-casing the text form handles both a logical column and a text column,
# whichever read.csv() produced.
permit_txt <- toupper(as.character(train.df$permit))
permit_num <- rep(NA_real_, length(permit_txt))
permit_num[which(permit_txt == "TRUE")] <- 1
permit_num[which(permit_txt == "FALSE")] <- 0
train.df$permit <- permit_num

# Upper-case scheme names so spellings that differ only by case share a level.
train.df$scheme_name <- as.factor(toupper(train.df$scheme_name))

# Remaining name-like columns are stored as factors.
for (col in c("ward", "funder", "installer", "wpt_name")) {
  train.df[[col]] <- as.factor(train.df[[col]])
}
# Outcome coding ----
# 1 = functional, 2 = functional needs repair, 3 = non functional.
# Built as an explicit factor so the outcome has the same type however
# read.csv() typed the column. Any label other than these three becomes NA.
train.df$status_group <- factor(
  as.character(train.df$status_group),
  levels = c("functional", "functional needs repair", "non functional"),
  labels = c("1", "2", "3")
)
# payment vs payment_type ----
# Rename the payment labels that are worded differently from payment_type so
# the two columns can be compared value for value.
payment_labels <- c(
  "pay when scheme fails" = "on failure",
  "pay monthly" = "monthly",
  "pay per bucket" = "per bucket",
  "pay annually" = "annually"
)
payment <- as.character(train.df$payment)
needs_rename <- payment %in% names(payment_labels)
payment[needs_rename] <- unname(payment_labels[payment[needs_rename]])
payment_type <- as.character(train.df$payment_type)

# Number of rows where "payment" != "payment_type". sum() counts only real
# mismatches; indexing the data frame with the comparison would also count
# rows where the comparison is NA.
sum(payment != payment_type, na.rm = TRUE)

# The author's run reported 0 mismatches, so payment_type is treated as
# redundant and removed.
train.df$payment_type <- NULL
train.df$payment <- as.factor(payment)
# Water quality and quantity ----
summary(train.df$water_quality)
summary(train.df$quality_group) # can be changed to numeric/ordinal
str(train.df$quality_group)

# Numeric code assigned to each quality label; labels not listed become NA.
quality_codes <- c(
  unknown = 0,
  colored = 1,
  milky = 2,
  salty = 3,
  good = 4,
  fluoride = 5
)
train.df$quality_group <-
  unname(quality_codes[as.character(train.df$quality_group)])
str(train.df$quality_group)

# Number of rows where quantity and quantity_group disagree. Comparing as text
# avoids the error raised when two factors have different level sets, and
# sum() does not count NA comparisons as mismatches.
sum(as.character(train.df$quantity) != as.character(train.df$quantity_group),
    na.rm = TRUE)
# The author's run reported 0, so one of the two columns is removed.
train.df$quantity_group <- NULL
# The model ----
# Multinomial logistic regression of status_group on the engineered predictors.
# NOTE(review): scheme_name and region.district are factors that may have many
# levels. multinom() stops with a "too many weights" error once the number of
# weights exceeds its MaxNWts argument (documented default 1000), which may be
# why the full data set fails -- confirm, then consider raising MaxNWts or
# dropping / lumping rare levels.
mlr1 <- multinom(
  status_group ~ yr.of.oper + fun.ins.match + gps_height + region.district +
    scheme_name + permit + extraction_type + management + payment +
    quality_group + source,
  data = train.df
)
summary(mlr1)
# In-sample accuracy ----
# NOTE: the model is scored on the same rows it was fitted to, so this is
# training accuracy; it is not an estimate of accuracy on unseen data.
train.df$preds <- predict(mlr1, train.df)

# Compare as text so a numeric or factor outcome lines up with the factor
# returned by predict().
actual <- as.character(train.df$status_group)
predicted <- as.character(train.df$preds)
# Rows with a missing outcome or prediction cannot be scored. Indexing the data
# frame with an NA comparison would have counted them as correct.
scored <- !is.na(actual) & !is.na(predicted)

# Number of rows where the prediction equals the recorded status.
pred.test1 <- sum(actual[scored] == predicted[scored])
print(pred.test1)

# Percentage of correct predictions among the rows that could be scored.
100 * (pred.test1 / sum(scored))