The next assignment for the class is a progress report. The code I included in that report is below; it covers my AnomalyDetection attempts and more:
library(dplyr)
library(tidyr)
library(readr)
library(zoo)
library(devtools)
# devtools::install_github("twitter/AnomalyDetection")
library(AnomalyDetection)
library(randomForest)
# Selected time series of consumption data for selected meters at 3 different sites.
system.time(train <- read_csv("DrivenData_PowerLaws_DetectingAnomaliesInUsage/train.csv",
col_types="ic?n")) # i=integer, c=character, ?=guess, n=number
names(train)[1] <- "obs_id"
print(paste("Initial read of train.csv, dimensions:", dim(train)[1], "x", dim(train)[2]))
# Additional information about the included buildings.
building <- read_csv("DrivenData_PowerLaws_DetectingAnomaliesInUsage/metadata.csv")
# join
train <- left_join(train, building, by="meter_id")
# convert Wh to kWh
wh_rows <- train$units=="Wh" & !is.na(train$units)
train$Values[wh_rows] <- train$Values[wh_rows]/1000
train$units[wh_rows] <- "kWh" # and adjust the unit label
# Historical Weather Data
weather <- read_csv("DrivenData_PowerLaws_DetectingAnomaliesInUsage/weather.csv", col_types="i?nnc")
weather$site_id[weather$site_id=="38"] <- "038" # the leading zero appears in the main dataset
# TODO group by site, average, join to train, check for NAs
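# A minimal sketch of that TODO (commented out, not yet wired into the pipeline; it reuses the
# site_id/Timestamp/Temperature columns referenced elsewhere in this script). Note that an
# exact-Timestamp join may miss rows; the daily join used later avoids that.
# weather_site <- summarise(group_by(weather, site_id, Timestamp),
#                           Temperature=mean(Temperature, na.rm=TRUE))
# train <- left_join(train, weather_site, by=c("site_id", "Timestamp"))
# sum(is.na(train$Temperature)) # consumption rows without weather coverage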
# Public Holidays
holidays <- read_csv("DrivenData_PowerLaws_DetectingAnomaliesInUsage/holidays.csv")
# The format for the submission to the competition.
# Only is_abnormal is evaluated, against a subset of the meter_ids that appear in the consumption
# dataset. Competitors can use the entire consumption dataset to create their algorithms, but
# should only submit the meters and times that appear in the submission format.
test <- read_csv("DrivenData_PowerLaws_DetectingAnomaliesInUsage/submission_format.csv")
test_meters <- levels(as.factor(test$meter_id)) # OK, so what meters are in the submission_format file?
print(test_meters)
for (i in 1:length(test_meters)) {
plot(select(filter(train, meter_id==test_meters[i]), Timestamp, Values),
main=paste("Energy Readings, Site", test_meters[i]),
xlab="Date",
ylab="Energy Usage (kWh)")
}
for (i in 1:length(test_meters)) {
print(paste(test_meters[i],
"has",
sum(is.na(train$Values[train$meter_id==test_meters[i]])),
"NAs"))
}
# We could look to another meter in the building for a good correlation:
filter(train, meter_id %in% c(test_meters[2], "2")) %>%
select(Values, Timestamp, meter_id) %>%
spread(meter_id, Values) %>%
na.omit() %>%
select(-Timestamp) %>%
cor()
# But the other meter might also have NAs at the same Timestamp:
filter(train, meter_id %in% c(test_meters[2], "2")) %>%
select(Values, Timestamp, meter_id) %>%
spread(meter_id, Values) %>%
tail()
# NAs at the end of the dataset also present a problem for interpolation-based replacement.
# For thresholding on deltas, we can simply fill each NA with the preceding value (LOCF):
train_locf <- select(train, meter_id, Timestamp, Values) %>% filter(meter_id %in% test_meters)
for (i in 1:length(test_meters)) {
print(paste("meter_id: ", test_meters[i]))
print(paste("NAs:", sum(is.na(train_locf$Values[train_locf$meter_id==test_meters[i]]))))
# last observation carried forward
train_locf$Values[train_locf$meter_id==test_meters[i]] <-
na.locf(train_locf$Values[train_locf$meter_id==test_meters[i]], fromLast=FALSE, na.rm=FALSE)
print(paste("NAs after LOCF:", sum(is.na(train_locf$Values[train_locf$meter_id==test_meters[i]]))))
# then next observation carried backward, for any NAs at the start of a series
train_locf$Values[train_locf$meter_id==test_meters[i]] <-
na.locf(train_locf$Values[train_locf$meter_id==test_meters[i]], fromLast=TRUE, na.rm=FALSE)
print(paste("NAs after NOCB:", sum(is.na(train_locf$Values[train_locf$meter_id==test_meters[i]]))))
}
# what if we set everything to is_abnormal = TRUE?
test_allTrue <- test
test_allTrue$is_abnormal <- TRUE
write_csv(test_allTrue, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_allTrue.csv")
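# the complementary baseline (not submitted here) would mark everything normal:
# test_allFalse <- test
# test_allFalse$is_abnormal <- FALSE
# write_csv(test_allFalse, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_allFalse.csv")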
train_locf$Values_cs <- NA
train_locf$six_sigma_increase <- NA
train_locf$six_sigma_decrease <- NA
train_locf$three_sigma_increase <- NA
train_locf$three_sigma_decrease <- NA
train_locf$is_abnormal <- NA
for(i in 1:length(test_meters)) {
values <- unlist(select(filter(train_locf, meter_id==test_meters[i]), Values))
diffs <- c(0, diff(values, lag=1))
if(sum(diffs[-1] >= 0)/length(diffs[-1]) > 0.99) {
# if at least 99% of the deltas are positive (or zero), the meter is probably cumulative
train_locf$Values_cs[train_locf$meter_id==test_meters[i]] <- train_locf$Values[train_locf$meter_id==test_meters[i]]
if(any(diffs<0)) {
# if some deltas are negative anyway, the register probably wraps around at its maximum
wraps <- which(diffs<0)
limit <- 10^round(log10(max(values)))
print(values[wraps])
for(w in 1:length(wraps)) {
print(paste("Correcting wrap", w, "onwards"))
values[wraps[w]:length(values)] <- values[wraps[w]:length(values)] + limit
print(values[wraps])
}
}
# convert the (corrected) cumulative readings to per-interval usage
values <- c(0, diff(values, lag=1))
train_locf$Values[train_locf$meter_id==test_meters[i]] <- values
diffs <- c(0, diff(values, lag=1))
} else {
train_locf$Values_cs[train_locf$meter_id==test_meters[i]] <- cumsum(train_locf$Values[train_locf$meter_id==test_meters[i]])
}
plot(diffs, main=paste("Reading-to-Reading Deltas for", test_meters[i]))
abline(h=mean(diffs, na.rm=TRUE), col="red", lwd=2)
abline(h=(mean(diffs, na.rm=TRUE) - seq(from=-10, to=10) * sd(diffs, na.rm=TRUE)), col="red", lty=2)
#abline(h=(mean(diffs, na.rm=TRUE) - seq(from=-3, to=3) * sd(values, na.rm=TRUE)), col="blue", lty=2)
train_locf$six_sigma_increase[train_locf$meter_id==test_meters[i]] <- as.logical(diffs > 6*sd(diffs, na.rm=TRUE))
train_locf$six_sigma_decrease[train_locf$meter_id==test_meters[i]] <- as.logical(diffs <= -6*sd(diffs, na.rm=TRUE))
train_locf$three_sigma_increase[train_locf$meter_id==test_meters[i]] <- as.logical(diffs > 3*sd(diffs, na.rm=TRUE))
train_locf$three_sigma_decrease[train_locf$meter_id==test_meters[i]] <- as.logical(diffs <= -3*sd(diffs, na.rm=TRUE))
rm(list=c("values", "diffs"))
}
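# A quick diagnostic (sketch, commented out): how many readings does each rule flag per meter?
# summarise(group_by(train_locf, meter_id),
#           six_up=sum(six_sigma_increase, na.rm=TRUE),
#           six_down=sum(six_sigma_decrease, na.rm=TRUE),
#           three_up=sum(three_sigma_increase, na.rm=TRUE),
#           three_down=sum(three_sigma_decrease, na.rm=TRUE))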
par(mfrow=c(1,3))
# Six Sigma Upward Diffs
train_locf$is_abnormal <- train_locf$six_sigma_increase
test_six_sigma_upward_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal), by=c("meter_id", "Timestamp"))
sum(test_six_sigma_upward_diffs$is_abnormal)
write_csv(test_six_sigma_upward_diffs, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_six_sigma_upward_diffs.csv")
test_six_sigma_upward_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal, Values), by=c("meter_id", "Timestamp"))
for (i in 1:length(test_meters)) {
plot(select(filter(test_six_sigma_upward_diffs, meter_id==test_meters[i] & is_abnormal==FALSE), Timestamp, Values),
main=paste("6 Sigma Diffs (Up)", test_meters[i]),
col="dark green")
points(select(filter(test_six_sigma_upward_diffs, meter_id==test_meters[i] & is_abnormal==TRUE), Timestamp, Values),
col="red")
}
# Six Sigma Diffs
train_locf$is_abnormal <- train_locf$six_sigma_increase | train_locf$six_sigma_decrease
test_six_sigma_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal), by=c("meter_id", "Timestamp"))
sum(test_six_sigma_diffs$is_abnormal)
write_csv(test_six_sigma_diffs, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_six_sigma_diffs.csv")
test_six_sigma_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal, Values), by=c("meter_id", "Timestamp"))
for (i in 1:length(test_meters)) {
plot(select(filter(test_six_sigma_diffs, meter_id==test_meters[i] & is_abnormal==FALSE), Timestamp, Values),
main=paste("6 Sigma Diffs (All)", test_meters[i]),
col="dark green")
points(select(filter(test_six_sigma_diffs, meter_id==test_meters[i] & is_abnormal==TRUE), Timestamp, Values),
col="red")
}
# Three Sigma Upward Diffs
train_locf$is_abnormal <- train_locf$three_sigma_increase
test_three_sigma_upward_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal), by=c("meter_id", "Timestamp"))
sum(test_three_sigma_upward_diffs$is_abnormal)
write_csv(test_three_sigma_upward_diffs, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_three_sigma_upward_diffs.csv")
test_three_sigma_upward_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal, Values), by=c("meter_id", "Timestamp"))
for (i in 1:length(test_meters)) {
plot(select(filter(test_three_sigma_upward_diffs, meter_id==test_meters[i] & is_abnormal==FALSE), Timestamp, Values),
main=paste("3 Sigma Diffs (Up)", test_meters[i]),
col="dark green")
points(select(filter(test_three_sigma_upward_diffs, meter_id==test_meters[i] & is_abnormal==TRUE), Timestamp, Values),
col="red")
}
# Three Sigma Diffs
train_locf$is_abnormal <- train_locf$three_sigma_increase | train_locf$three_sigma_decrease
test_three_sigma_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal), by=c("meter_id", "Timestamp"))
sum(test_three_sigma_diffs$is_abnormal)
write_csv(test_three_sigma_diffs, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_three_sigma_diffs.csv")
test_three_sigma_diffs <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal, Values), by=c("meter_id", "Timestamp"))
for (i in 1:length(test_meters)) {
plot(select(filter(test_three_sigma_diffs, meter_id==test_meters[i] & is_abnormal==FALSE),
Timestamp, Values),
main=paste("3 Sigma Diffs (All)", test_meters[i]),
col="dark green")
points(select(filter(test_three_sigma_diffs, meter_id==test_meters[i] & is_abnormal==TRUE),
Timestamp, Values),
col="red")
}
par(mfrow=c(1,1))
test_values <- left_join(test,
select(train_locf, meter_id, Timestamp, Values),
by=c("meter_id", "Timestamp"))
test_values$is_abnormal <- FALSE
for (i in 1:length(test_meters)) {
meter_median <- median(test_values$Values[test_values$meter_id==test_meters[i]], na.rm=TRUE)
test_values$is_abnormal[test_values$meter_id==test_meters[i] &
!is.na(test_values$Values) &
test_values$Values>meter_median] <- TRUE
}
test_values$is_abnormal[is.na(test_values$Values)] <- FALSE
for (i in 1:length(test_meters)) {
plot(select(filter(test_values, meter_id==test_meters[i] & is_abnormal==TRUE),
Timestamp, Values),
main=paste("Above Median,", test_meters[i]),
col="red",
ylim=c(min(test_values$Values[test_values$meter_id==test_meters[i]], na.rm=TRUE),
max(test_values$Values[test_values$meter_id==test_meters[i]], na.rm=TRUE)))
points(select(filter(test_values, meter_id==test_meters[i] & is_abnormal==FALSE),
Timestamp, Values),
col="dark green")
}
write_csv(select(test_values, -Values), "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_above_median.csv")
test_values$is_abnormal <- !test_values$is_abnormal # note: rows with NA Values flip to TRUE here
write_csv(select(test_values, -Values), "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_below_median.csv")
# https://github.com/twitter/AnomalyDetection
# the documentation doesn't mention anything about NAs, so I'll use the LOCF-filled data that was also used for the delta analysis
rm(list=ls(pattern="test_")) # remove previous predictions
test_meters <- levels(as.factor(test$meter_id))
train_locf <- as.data.frame(select(train_locf, meter_id, Timestamp, Values, Values_cs)) # as.data.frame to avoid tibble warnings
train_locf$is_abnormal <- FALSE
ADout <- NULL
for(i in 1:length(test_meters)) {
# Need to determine the sampling interval -- https://github.com/twitter/AnomalyDetection/issues/15
timediff <- rbind(select(filter(train_locf, meter_id==test_meters[i]), Timestamp), NA) -
rbind(NA, select(filter(train_locf, meter_id==test_meters[i]), Timestamp))
print(summary(as.numeric(unlist(timediff)))) # are the gaps OK? (bare expressions don't print inside a loop)
granularity <- median(as.numeric(unlist(timediff)), na.rm=TRUE)
# if a period is one day and the timediffs are in minutes:
period <- (24*60)/granularity # e.g. 15-minute data gives 1440/15 = 96 readings per period
ADout[i] <- list(AnomalyDetectionVec(select(filter(train_locf, meter_id==test_meters[i]), Values),
direction="both",
max_anoms=0.33,
alpha=0.05,
period=period,
longterm_period=7*period,
plot=TRUE,
title=test_meters[i]))
print(ADout[[i]]$plot) # display the anomaly plot
anom_rows <- ADout[[i]]$anoms$index
labeled <- select(filter(train_locf, meter_id==test_meters[i]), is_abnormal)
labeled$index <- seq(from=1, to=length(labeled$is_abnormal))
labeled$is_abnormal[labeled$index %in% anom_rows] <- TRUE
print(sum(labeled$is_abnormal)/length(labeled$is_abnormal)) # does this match the "% Anomalies" in the plot?
train_locf$is_abnormal[train_locf$meter_id==test_meters[i]] <- labeled$is_abnormal
}
test_AD <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal),
by=c("meter_id", "Timestamp"))
sum(test_AD$is_abnormal)
write_csv(test_AD, "DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_AD_dirBoth_max330_alpha050_lperiod7.csv")
test_AD <- left_join(select(test, -is_abnormal),
select(train_locf, meter_id, Timestamp, is_abnormal, Values),
by=c("meter_id", "Timestamp"))
for (i in 1:length(test_meters)) {
plot(select(filter(test_AD, meter_id==test_meters[i] & is_abnormal==FALSE), Timestamp, Values),
main="AnomalyDetection",
col="dark green")
points(select(filter(test_AD, meter_id==test_meters[i] & is_abnormal==TRUE), Timestamp, Values),
col="red")
}
# The first derived features are weekday and workday flags.
train <- mutate(train, weekday=weekdays(Timestamp), month=months(Timestamp), year=format(Timestamp, "%Y"))
train$workday <- TRUE
train[train$weekday %in% c("Saturday", "Sunday"), "workday"] <- FALSE
print(paste("5/7 =", 5/7))
# the holidays data is only provided for two sites:
unique(holidays$site_id)
# and they're not the same between sites
holidays$foo <- 1 # dummy y-value so the holiday dates can be plotted
plot(select(filter(holidays, site_id==unique(holidays$site_id)[1]), Date, foo),
main="Holidays By Site",
ylab="",
col="darkblue")
points(select(filter(holidays, site_id==unique(holidays$site_id)[2]), Date, foo), col="orange", pch=4)
holidays <- select(holidays, -foo)
train$date <- as.Date(trunc(train$Timestamp, "day"))
train$hour <- as.POSIXlt(train$Timestamp)$hour
for(i in 1:length(test_meters)) {
test_site <- head(train$site_id[train$meter_id==test_meters[i]], 1)
print(paste("meter_id:", test_meters[i], "site_id:", test_site))
print(paste("Ratio of workdays (before holidays):",
nrow(unique(select(filter(train, workday==TRUE, meter_id==test_meters[i]), date)))/
nrow(unique(select(filter(train, meter_id==test_meters[i]), date)))))
if(test_site %in% holidays$site_id) {
holiday_rows <- which(train$site_id==test_site &
train$date %in% holidays$Date[holidays$site_id==test_site])
print(paste("Marking", length(holiday_rows), "rows as holiday non-workdays"))
train$workday[holiday_rows] <- FALSE
}
print(paste("Ratio of workdays (after holidays):",
nrow(unique(select(filter(train, workday==TRUE, meter_id==test_meters[i]), date)))/
nrow(unique(select(filter(train, meter_id==test_meters[i]), date)))))
}
# high and low daily temperatures near each site
# the weather data doesn't span the train/test data:
summary(weather)
summary(test)
weather$date <- as.Date(trunc(weather$Timestamp, "day"))
weather_daily <- summarise(group_by(weather, date, site_id, Distance),
dailyMin=min(Temperature),
dailyMax=max(Temperature),
dailyMean=mean(Temperature))
# for each site and day, keep only the record from the nearest weather station
weather_daily_single <- summarise(group_by(weather_daily, date, site_id), Distance=min(Distance))
weather_daily_single <- left_join(weather_daily_single, weather_daily,
by=c("date", "site_id", "Distance"))
for (i in 1:length(test_meters)) {
test_site <- head(train$site_id[train$meter_id==test_meters[i]], 1)
plot(select(filter(weather_daily_single, site_id==test_site), date, dailyMean),
main=paste("Average Daily Temperature for", test_site))
}
train <- left_join(train, select(weather_daily_single, -Distance),
by=c("date", "site_id"))
# randomForest doesn't handle NAs. I'll fill the meter readings as I did before. Then, since the weather
# data doesn't span the training data, I'll clip the training data to the extent of the weather data.
rm(list=ls(pattern="test_"))
rm(list=ls(pattern="train_locf"))
test_meters <- levels(as.factor(test$meter_id))
# locf/nocb on values
train_locf <- filter(train, meter_id %in% test_meters)
for (i in 1:length(test_meters)) {
print(paste("meter_id: ", test_meters[i]))
print(paste("NAs:", sum(is.na(train_locf$Values[train_locf$meter_id==test_meters[i]]))))
# last observation carried forward
train_locf$Values[train_locf$meter_id==test_meters[i]] <-
na.locf(train_locf$Values[train_locf$meter_id==test_meters[i]], fromLast=FALSE, na.rm=FALSE)
print(paste("NAs after LOCF:", sum(is.na(train_locf$Values[train_locf$meter_id==test_meters[i]]))))
# then next observation carried backward, for any NAs at the start of a series
train_locf$Values[train_locf$meter_id==test_meters[i]] <-
na.locf(train_locf$Values[train_locf$meter_id==test_meters[i]], fromLast=TRUE, na.rm=FALSE)
print(paste("NAs after NOCB:", sum(is.na(train_locf$Values[train_locf$meter_id==test_meters[i]]))))
}
# cumulative meters
train_locf$Values_cs <- NA
for(i in 1:length(test_meters)) {
values <- unlist(select(filter(train_locf, meter_id==test_meters[i]), Values))
diffs <- c(0, diff(values, lag=1))
if(sum(diffs[-1] >= 0)/length(diffs[-1]) > 0.99) {
# if at least 99% of the deltas are positive (or zero), the meter is probably cumulative
train_locf$Values_cs[train_locf$meter_id==test_meters[i]] <- train_locf$Values[train_locf$meter_id==test_meters[i]]
if(any(diffs<0)) {
# if some deltas are negative anyway, the register probably wraps around at its maximum
wraps <- which(diffs<0)
limit <- 10^round(log10(max(values)))
print(values[wraps])
for(w in 1:length(wraps)) {
print(paste("Correcting wrap", w, "onwards"))
values[wraps[w]:length(values)] <- values[wraps[w]:length(values)] + limit
print(values[wraps])
}
}
# convert the (corrected) cumulative readings to per-interval usage
values <- c(0, diff(values, lag=1))
train_locf$Values[train_locf$meter_id==test_meters[i]] <- values
diffs <- c(0, diff(values, lag=1))
} else {
train_locf$Values_cs[train_locf$meter_id==test_meters[i]] <-
cumsum(train_locf$Values[train_locf$meter_id==test_meters[i]])
}
}
# crop to weather data
# TODO extrapolate weather data
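# A sketch of that TODO (commented out): extend the daily weather to the full training
# date range by carrying the nearest observed day forward/backward per site, mirroring
# the LOCF/NOCB fill used for the meter values.
# grid <- expand.grid(date=seq(min(train_locf$date), max(train_locf$date), by="day"),
#                     site_id=unique(weather_daily_single$site_id),
#                     stringsAsFactors=FALSE)
# weather_full <- arrange(left_join(grid, weather_daily_single, by=c("date", "site_id")),
#                         site_id, date)
# for (col in c("dailyMin", "dailyMax", "dailyMean")) {
#   weather_full[[col]] <- ave(weather_full[[col]], weather_full$site_id,
#                              FUN=function(x) na.locf(na.locf(x, na.rm=FALSE), fromLast=TRUE, na.rm=FALSE))
# }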
print(paste("Number of training rows:", nrow(train_locf)))
print(paste("Weather data runs from", min(weather_daily_single$date), "to", max(weather_daily_single$date)))
for (i in 1:length(test_meters)) {
print(paste("Meter", test_meters[i], "data runs from", min(train_locf$date[train_locf$meter_id==test_meters[i]]),
"to", max(train_locf$date[train_locf$meter_id==test_meters[i]])))
train_locf <- filter(train_locf, meter_id %in% test_meters[-i] |
((meter_id==test_meters[i] & date >= min(weather_daily_single$date)) &
(meter_id==test_meters[i] & date <= max(weather_daily_single$date))))
print(paste("Number of training rows:", nrow(train_locf)))
}
# when you only have a hammer
# TODO where are these NAs coming from?
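# a quick diagnostic for that TODO: count the NAs per column
print(colSums(is.na(train_locf)))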
train_locf <- select(train_locf, -surface)
train_locf <- na.omit(train_locf)
print(paste("Number of training rows:", nrow(train_locf)))
set.seed(1)
train_rf <- select(train_locf, meter_id, Timestamp, Values,
weekday, month, year, workday, hour,
dailyMin, dailyMax, dailyMean)
train_rf$weekday <- as.factor(train_rf$weekday)
train_rf$month <- as.factor(train_rf$month)
train_rf$year <- as.factor(train_rf$year)
train_rf$workday <- as.factor(train_rf$workday)
#stop() # several hours' worth of model building below
#rf1
model_filename <- "DrivenData_PowerLaws_DetectingAnomaliesInUsage/rf1.RData"
if(file.exists(model_filename)) {
print(paste("Loading model from", model_filename))
load(model_filename)
} else {
system.time(rf1 <- randomForest(Values~.-meter_id,
data=select(filter(train_rf, meter_id==test_meters[1]), -Timestamp),
importance=TRUE))
print(rf1) # bare expressions don't print inside an if/else block
print(importance(rf1))
save(rf1, file=model_filename)
}
#rf2
model_filename <- "DrivenData_PowerLaws_DetectingAnomaliesInUsage/rf2.RData"
if(file.exists(model_filename)) {
print(paste("Loading model from", model_filename))
load(model_filename)
} else {
system.time(rf2 <- randomForest(Values~.-meter_id,
data=select(filter(train_rf, meter_id==test_meters[2]), -Timestamp),
importance=TRUE))
print(rf2)
print(importance(rf2))
save(rf2, file=model_filename)
}
#rf3
model_filename <- "DrivenData_PowerLaws_DetectingAnomaliesInUsage/rf3.RData"
if(file.exists(model_filename)) {
print(paste("Loading model from", model_filename))
load(model_filename)
} else {
system.time(rf3 <- randomForest(Values~.-meter_id,
data=select(filter(train_rf, meter_id==test_meters[3]), -Timestamp),
importance=TRUE))
print(rf3)
print(importance(rf3))
save(rf3, file=model_filename)
}
# TODO: derive the feature columns for test directly (as we did for train), rather than joining them from train
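# A sketch of that TODO (commented out; it assumes site_id comes from the building metadata,
# the workday flag would still need the holiday adjustment applied above for train, and the
# factor levels would need to match train_rf's for predict()):
# test_feat <- left_join(select(test, meter_id, Timestamp), building, by="meter_id")
# test_feat <- mutate(test_feat,
#                     weekday=as.factor(weekdays(Timestamp)),
#                     month=as.factor(months(Timestamp)),
#                     year=as.factor(format(Timestamp, "%Y")),
#                     workday=as.factor(!(weekdays(Timestamp) %in% c("Saturday", "Sunday"))),
#                     date=as.Date(trunc(Timestamp, "day")),
#                     hour=as.POSIXlt(Timestamp)$hour)
# test_feat <- left_join(test_feat, select(weather_daily_single, -Distance), by=c("date", "site_id"))
# For now, join the derived columns from train: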
test_rf <- left_join(select(test, meter_id, Timestamp),
train_rf,
by=c("meter_id", "Timestamp"))
test_rf <- na.omit(test_rf)
# note: concatenating per-meter predictions assumes test_rf rows are grouped by meter in this same order
yhat_rf <- predict(rf1, newdata=select(filter(test_rf, meter_id==test_meters[1]), -Timestamp, -Values))
yhat_rf <- c(yhat_rf, predict(rf2, newdata=select(filter(test_rf, meter_id==test_meters[2]), -Timestamp, -Values)))
yhat_rf <- c(yhat_rf, predict(rf3, newdata=select(filter(test_rf, meter_id==test_meters[3]), -Timestamp, -Values)))
yhat_rf <- cbind(select(test_rf, meter_id, Timestamp, Values), yhat_rf)
for (i in 1:length(test_meters)) {
plot(x=yhat_rf$Values[yhat_rf$meter_id==test_meters[i]],
y=yhat_rf$yhat_rf[yhat_rf$meter_id==test_meters[i]],
main=paste("Predicted vs. Actual", test_meters[i]),
xlab="Actual Energy Usage",
ylab="Predicted Energy Usage")
}
yhat_rf$diff <- yhat_rf$Values - yhat_rf$yhat_rf
yhat_rf$diff_percent <- yhat_rf$diff/yhat_rf$Values
yhat_rf$diff_percent[yhat_rf$Values==0] <- 0
yhat_rf$is_abnormal <- FALSE
percent_diff_cutoff <- 0.50
yhat_rf$is_abnormal[abs(yhat_rf$diff_percent)>percent_diff_cutoff] <- TRUE
print(paste("Percentage of records marked anomalous:", sum(yhat_rf$is_abnormal)/nrow(yhat_rf)))
yhat_rf <- left_join(select(test, -is_abnormal),
select(yhat_rf, meter_id, Timestamp, Values, yhat_rf, is_abnormal),
by=c("meter_id", "Timestamp"))
yhat_rf$is_abnormal[is.na(yhat_rf$is_abnormal)] <- FALSE
for (i in 1:length(test_meters)) {
plot(select(filter(yhat_rf, meter_id==test_meters[i] & is_abnormal==FALSE), Timestamp, Values),
main=paste("randomForest, Site", test_meters[i]),
col="dark green")
points(select(filter(yhat_rf, meter_id==test_meters[i] & is_abnormal==TRUE), Timestamp, Values),
col="red")
}
write_csv(select(yhat_rf, obs_id, meter_id, Timestamp, is_abnormal),
"DrivenData_PowerLaws_DetectingAnomaliesInUsage/predict_RF_050simplePercentDiff.csv")
## References
#* James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction to Statistical Learning: with Applications in R. New York: Springer.
#* Algorithms for Time Series Anomaly Detection. (2017). CrossValidated. Retrieved from https://stats.stackexchange.com/questions/137094/algorithms-for-time-series-anomaly-detection.
#* AnomalyDetection. (2015). Twitter. Retrieved from https://github.com/twitter/AnomalyDetection.
#* Power Laws: Detecting Anomalies in Usage. (2018). DrivenData Competition. Retrieved from https://www.drivendata.org/competitions/52/anomaly-detection-electricity/page/102/.
#* tsoutliers: Detection of Outliers in Time Series. (2017). CRAN. Retrieved from https://cran.r-project.org/package=tsoutliers.