options("repos" = c(CRAN = "http://cran.r-project.org/")) install.packages('dplyr') install.packages('stringr') install.packages('lubridate') install.packages('rgeos') # spatial package install.packages('sp') # spatial package install.packages('maptools') # spatial package install.packages('ggmap') install.packages('ggplot2') install.packages('gridExtra') # for putting plots side by side install.packages('ggrepel') # avoid text overlap in plots install.packages('tidyr') install.packages('seriation') # package for reordering a distance matrix ##################################################################### # 1.1d loading packages ##################################################################### setwd('C:/DAT213x/script') options(max.print = 1000, scipen = 999, width = 90) library(RevoScaleR) rxOptions(reportProgress = 1) # reduces the amount of output RevoScaleR produces library(dplyr) options(dplyr.print_max = 2000) options(dplyr.width = Inf) # shows all columns of a tbl_df object library(stringr) library(lubridate) library(rgeos) # spatial package library(sp) # spatial package library(maptools) # spatial package library(ggmap) library(ggplot2) library(gridExtra) # for putting plots side by side library(ggrepel) # avoid text overlap in plots library(tidyr) library(seriation) # package for reordering a distance matrix ##################################################################### # 2.1a loading top 1000 rows ##################################################################### col_classes <- c('VendorID' = "factor", 'tpep_pickup_datetime' = "character", 'tpep_dropoff_datetime' = "character", 'passenger_count' = "integer", 'trip_distance' = "numeric", 'pickup_longitude' = "numeric", 'pickup_latitude' = "numeric", 'RateCodeID' = "factor", 'store_and_fwd_flag' = "factor", 'dropoff_longitude' = "numeric", 'dropoff_latitude' = "numeric", 'payment_type' = "factor", 'fare_amount' = "numeric", 'extra' = "numeric", 'mta_tax' = "numeric", 'tip_amount' = "numeric", 'tolls_amount' = "numeric", 'improvement_surcharge' = "numeric", 'total_amount' = "numeric", 'u' = "numeric") input_csv <- 'yellow_tripsample_2016-01.csv' # we take a chunk of the data and load it as a data.frame (good for testing things) nyc_sample_df <- read.csv(input_csv, nrows = 1000, colClasses = col_classes) head(nyc_sample_df, 10) ##################################################################### # 2.1b reading the whole data ##################################################################### input_xdf <- 'yellow_tripdata_2016.xdf' st <- Sys.time() rxImport(input_csv, input_xdf, colClasses = col_classes, overwrite = TRUE) print(input_csv) for(ii in 2:6) { # get each month's data and append it to the first month's data input_csv <- sprintf('yellow_tripsample_2016-%02d.csv', ii) rxImport(input_csv, input_xdf, colClasses = col_classes, overwrite = TRUE, append = "rows") print(input_csv) } Sys.time() - st # stores the time it took to import ##################################################################### # 2.1c XDF vs CSV ##################################################################### input_xdf <- 'yellow_tripdata_2016.xdf' nyc_xdf <- RxXdfData(input_xdf) system.time( rxsum_xdf <- rxSummary( ~ fare_amount, nyc_xdf) # provide statistical summaries for fare amount ) rxsum_xdf input_csv <- 'yellow_tripsample_2016-01.csv' # we can only use one month's data unless we join the CSVs nyc_csv <- RxTextData(input_csv, colClasses = col_classes) # point to CSV file and provide column info system.time( rxsum_csv <- rxSummary( ~ fare_amount, nyc_csv) # provide statistical summaries for fare amount ) rxsum_csv ##################################################################### # 2.2a checking column types ##################################################################### rxGetInfo(nyc_xdf, getVarInfo = TRUE, numRows = 10) # show column types and the first 10 rows ##################################################################### # 2.2b simple transformation ##################################################################### rxDataStep(nyc_xdf, nyc_xdf, transforms = list(tip_percent = ifelse(fare_amount > 0 & tip_amount < fare_amount, round(tip_amount * 100 / fare_amount, 0), NA)), overwrite = TRUE) rxSummary( ~ tip_percent, nyc_xdf) ##################################################################### # 2.2c complex transformations ##################################################################### xforms <- function(data) { # transformation function for extracting some date and time features weekday_labels <- c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat') cut_levels <- c(1, 5, 9, 12, 16, 18, 22) hour_labels <- c('1AM-5AM', '5AM-9AM', '9AM-12PM', '12PM-4PM', '4PM-6PM', '6PM-10PM', '10PM-1AM') pickup_datetime <- ymd_hms(data$tpep_pickup_datetime, tz = "UTC") pickup_hour <- addNA(cut(hour(pickup_datetime), cut_levels)) pickup_dow <- factor(wday(pickup_datetime), levels = 1:7, labels = weekday_labels) levels(pickup_hour) <- hour_labels dropoff_datetime <- ymd_hms(data$tpep_dropoff_datetime, tz = "UTC") dropoff_hour <- addNA(cut(hour(dropoff_datetime), cut_levels)) dropoff_dow <- factor(wday(dropoff_datetime), levels = 1:7, labels = weekday_labels) levels(dropoff_hour) <- hour_labels data$pickup_hour <- pickup_hour data$pickup_dow <- pickup_dow data$dropoff_hour <- dropoff_hour data$dropoff_dow <- dropoff_dow data$trip_duration <- as.integer(as.duration(dropoff_datetime - pickup_datetime)) data } library(lubridate) Sys.setenv(TZ = "US/Eastern") # not important for this dataset head(xforms(nyc_sample_df)) # test the function on a data.frame head(rxDataStep(nyc_sample_df, transformFunc = xforms, transformPackages = "lubridate")) st <- Sys.time() rxDataStep(nyc_xdf, nyc_xdf, overwrite = TRUE, transformFunc = xforms, transformPackages = "lubridate") Sys.time() - st ##################################################################### # 2.2d examining new columns ##################################################################### rxs1 <- rxSummary( ~ pickup_hour + pickup_dow + trip_duration, nyc_xdf) # we can add a column for proportions next to the counts rxs1$categorical <- lapply(rxs1$categorical, function(x) cbind(x, prop = round(prop.table(x$Counts), 2))) rxs1 rxs2 <- rxSummary( ~ pickup_dow:pickup_hour, nyc_xdf) rxs2 <- tidyr::spread(rxs2$categorical[[1]], key = 'pickup_hour', value = 'Counts') row.names(rxs2) <- rxs2[ , 1] rxs2 <- as.matrix(rxs2[ , -1]) rxs2 levelplot(prop.table(rxs2, 2), cuts = 20, xlab = "", ylab = "", main = "Distribution of taxis by day of week") ##################################################################### # 2.2e plotting neighborhoods ##################################################################### library(rgeos) library(sp) library(maptools) nyc_shapefile <- readShapePoly('ZillowNeighborhoods-NY/ZillowNeighborhoods-NY.shp') mht_shapefile <- subset(nyc_shapefile, str_detect(CITY, 'New York City-Manhattan')) mht_shapefile@data$id <- as.character(mht_shapefile@data$NAME) mht.points <- fortify(gBuffer(mht_shapefile, byid = TRUE, width = 0), region = "NAME") mht.df <- inner_join(mht.points, mht_shapefile@data, by = "id") library(dplyr) mht.cent <- mht.df %>% group_by(id) %>% summarize(long = median(long), lat = median(lat)) library(ggrepel) ggplot(mht.df, aes(long, lat, fill = id)) + geom_polygon() + geom_path(color = "white") + coord_equal() + theme(legend.position = "none") + geom_text_repel(aes(label = id), data = mht.cent, size = 2) ##################################################################### # 2.2f adding neighborhoods ##################################################################### # take only the coordinate columns, and replace NAs with 0 data_coords <- transmute(nyc_sample_df, long = ifelse(is.na(pickup_longitude), 0, pickup_longitude), lat = ifelse(is.na(pickup_latitude), 0, pickup_latitude) ) # we specify the columns that correspond to the coordinates coordinates(data_coords) <- c('long', 'lat') # returns the neighborhoods based on coordinates nhoods <- over(data_coords, mht_shapefile) # rename the column names in nhoods names(nhoods) <- paste('pickup', tolower(names(nhoods)), sep = '_') # combine the neighborhood information with the original data nyc_sample_df <- cbind(nyc_sample_df, nhoods[, grep('name|city', names(nhoods))]) head(nyc_sample_df) find_nhoods <- function(data) { # extract pick-up lat and long and find their neighborhoods pickup_longitude <- ifelse(is.na(data$pickup_longitude), 0, data$pickup_longitude) pickup_latitude <- ifelse(is.na(data$pickup_latitude), 0, data$pickup_latitude) data_coords <- data.frame(long = pickup_longitude, lat = pickup_latitude) coordinates(data_coords) <- c('long', 'lat') nhoods <- over(data_coords, shapefile) ## add only the pick-up neighborhood and city columns to the data data$pickup_nhood <- nhoods$NAME data$pickup_borough <- nhoods$CITY # extract drop-off lat and long and find their neighborhoods dropoff_longitude <- ifelse(is.na(data$dropoff_longitude), 0, data$dropoff_longitude) dropoff_latitude <- ifelse(is.na(data$dropoff_latitude), 0, data$dropoff_latitude) data_coords <- data.frame(long = dropoff_longitude, lat = dropoff_latitude) coordinates(data_coords) <- c('long', 'lat') nhoods <- over(data_coords, shapefile) ## add only the drop-off neighborhood and city columns to the data data$dropoff_nhood <- nhoods$NAME data$dropoff_borough <- nhoods$CITY ## return the data with the new columns added in data } # test the function on a data.frame using rxDataStep head(rxDataStep(nyc_sample_df, transformFunc = find_nhoods, transformPackages = c("sp", "maptools"), transformObjects = list(shapefile = mht_shapefile))) st <- Sys.time() rxDataStep(nyc_xdf, nyc_xdf, overwrite = TRUE, transformFunc = find_nhoods, transformPackages = c("sp", "maptools", "rgeos"), transformObjects = list(shapefile = mht_shapefile)) Sys.time() - st rxGetInfo(nyc_xdf, numRows = 5) ##################################################################### # 3.1a examining the neighborhoods ##################################################################### system.time( rxs_all <- rxSummary( ~ ., nyc_xdf) ) head(rxs_all$sDataFrame) nhoods_by_borough <- rxCrossTabs( ~ pickup_nhood:pickup_borough, nyc_xdf) nhoods_by_borough <- nhoods_by_borough$counts[[1]] nhoods_by_borough <- as.data.frame(nhoods_by_borough) # get the neighborhoods by borough lnbs <- lapply(names(nhoods_by_borough), function(vv) subset(nhoods_by_borough, nhoods_by_borough[ , vv] > 0, select = vv, drop = FALSE)) lapply(lnbs, head) ##################################################################### # 3.1b focusing on Manhattan ##################################################################### manhattan_nhoods <- rownames(nhoods_by_borough)[nhoods_by_borough$`New York City-Manhattan` > 0] refactor_columns <- function(dataList) { dataList$pickup_nb = factor(dataList$pickup_nhood, levels = nhoods_levels) dataList$dropoff_nb = factor(dataList$dropoff_nhood, levels = nhoods_levels) dataList } rxDataStep(nyc_xdf, nyc_xdf, transformFunc = refactor_columns, transformObjects = list(nhoods_levels = manhattan_nhoods), overwrite = TRUE) rxs_pickdrop <- rxSummary( ~ pickup_nb:dropoff_nb, nyc_xdf) head(rxs_pickdrop$categorical[[1]]) ##################################################################### # 3.1c examining trip distance ##################################################################### rxHistogram( ~ trip_distance, nyc_xdf, startVal = 0, endVal = 25, histType = "Percent", numBreaks = 20) rxs <- rxSummary( ~ pickup_nhood:dropoff_nhood, nyc_xdf, rowSelection = (trip_distance > 15 & trip_distance < 22)) head(arrange(rxs$categorical[[1]], desc(Counts)), 10) ##################################################################### # 3.1d examining outliers ##################################################################### # outFile argument missing means we output to data.frame odd_trips <- rxDataStep(nyc_xdf, rowSelection = ( u < .05 & ( # we can adjust this if the data gets too big (trip_distance > 50 | trip_distance <= 0) | (passenger_count > 5 | passenger_count == 0) | (fare_amount > 5000 | fare_amount <= 0) )), transforms = list(u = runif(.rxNumRows))) print(dim(odd_trips)) odd_trips %>% filter(trip_distance > 50) %>% ggplot() -> p p + geom_histogram(aes(x = fare_amount, fill = trip_duration <= 10*60), binwidth = 10) + xlim(0, 500) + coord_fixed(ratio = 25) ##################################################################### # 3.1e filtering by Manhattan ##################################################################### input_xdf <- 'yellow_tripdata_2016_manhattan.xdf' mht_xdf <- RxXdfData(input_xdf) rxDataStep(nyc_xdf, mht_xdf, rowSelection = ( passenger_count > 0 & trip_distance >= 0 & trip_distance < 30 & trip_duration > 0 & trip_duration < 60*60*24 & str_detect(pickup_borough, 'Manhattan') & str_detect(dropoff_borough, 'Manhattan') & !is.na(pickup_nb) & !is.na(dropoff_nb) & fare_amount > 0), transformPackages = "stringr", varsToDrop = c('extra', 'mta_tax', 'improvement_surcharge', 'total_amount', 'pickup_borough', 'dropoff_borough', 'pickup_nhood', 'dropoff_nhood'), overwrite = TRUE) mht_sample_df <- rxDataStep(mht_xdf, rowSelection = (u < .01), transforms = list(u = runif(.rxNumRows))) dim(mht_sample_df) ##################################################################### # 3.2a reordering neighborhoods ##################################################################### rxct <- rxCrossTabs(trip_distance ~ pickup_nb:dropoff_nb, mht_xdf) res <- rxct$sums$trip_distance / rxct$counts$trip_distance library(seriation) res[which(is.nan(res))] <- mean(res, na.rm = TRUE) nb_order <- seriate(res) rxc1 <- rxCube(trip_distance ~ pickup_nb:dropoff_nb, mht_xdf) rxc2 <- rxCube(minutes_per_mile ~ pickup_nb:dropoff_nb, mht_xdf, transforms = list(minutes_per_mile = (trip_duration / 60) / trip_distance)) rxc3 <- rxCube(tip_percent ~ pickup_nb:dropoff_nb, mht_xdf) res <- bind_cols(list(rxc1, rxc2, rxc3)) res <- res[, c('pickup_nb', 'dropoff_nb', 'trip_distance', 'minutes_per_mile', 'tip_percent')] head(res) library(ggplot2) ggplot(res, aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = trip_distance), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "steelblue") + coord_fixed(ratio = .9) newlevs <- levels(res$pickup_nb)[unlist(nb_order)] res$pickup_nb <- factor(res$pickup_nb, levels = unique(newlevs)) res$dropoff_nb <- factor(res$dropoff_nb, levels = unique(newlevs)) library(ggplot2) ggplot(res, aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = trip_distance), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "steelblue") + coord_fixed(ratio = .9) ##################################################################### # 3.2b neighborhood trends ##################################################################### ggplot(res, aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = minutes_per_mile), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "steelblue") + coord_fixed(ratio = .9) res %>% mutate(tip_color = cut(tip_percent, c(0, 5, 8, 10, 12, 100))) %>% ggplot(aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = tip_color)) + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + coord_fixed(ratio = .9) ##################################################################### # 3.2c refactoring neighborhoods ##################################################################### # first way of reordering the factor levels rxDataStep(inData = mht_xdf, outFile = mht_xdf, transforms = list(pickup_nb = factor(pickup_nb, levels = newlevels), dropoff_nb = factor(dropoff_nb, levels = newlevels)), transformObjects = list(newlevels = unique(newlevs)), overwrite = TRUE) # second way of reordering the factor levels rxFactors(mht_xdf, outFile = mht_xdf, factorInfo = list(pickup_nb = list(newLevels = unique(newlevs)), dropoff_nb = list(newLevels = unique(newlevs))), overwrite = TRUE) ##################################################################### # 3.2d trip distribution across neighborhoods ##################################################################### rxc <- rxCube( ~ pickup_nb:dropoff_nb, mht_xdf) rxc <- as.data.frame(rxc) library(dplyr) rxc %>% filter(Counts > 0) %>% mutate(pct_all = Counts / sum(Counts) * 100) %>% group_by(pickup_nb) %>% mutate(pct_by_pickup_nb = Counts / sum(Counts) * 100) %>% group_by(dropoff_nb) %>% mutate(pct_by_dropoff_nb = Counts / sum(Counts) * 100) %>% group_by() %>% arrange(desc(Counts)) -> rxcs head(rxcs) ggplot(rxcs, aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = pct_all), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "black") + coord_fixed(ratio = .9) ggplot(rxcs, aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = pct_by_pickup_nb), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "steelblue") + coord_fixed(ratio = .9) ggplot(rxcs, aes(pickup_nb, dropoff_nb)) + geom_tile(aes(fill = pct_by_dropoff_nb), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "red") + coord_fixed(ratio = .9) ##################################################################### # 3.2f time-related patterns ##################################################################### res1 <- rxCube(tip_percent ~ pickup_dow:pickup_hour, mht_xdf) res2 <- rxCube(fare_amount / (trip_duration / 60) ~ pickup_dow:pickup_hour, mht_xdf) names(res2)[3] <- 'fare_per_minute' res <- bind_cols(list(res1, res2)) res <- res[, c('pickup_dow', 'pickup_hour', 'fare_per_minute', 'tip_percent', 'Counts')] library(ggplot2) ggplot(res, aes(pickup_dow, pickup_hour)) + geom_tile(aes(fill = fare_per_minute), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "steelblue") + geom_text(aes(label = sprintf('%dK riders\n (%d%% tip)', signif(Counts / 1000, 2), round(tip_percent, 0))), size = 2.5) + coord_fixed(ratio = .9) ##################################################################### # 4.1a looking at maps ##################################################################### library(ggmap) map_13 <- get_map(location = c(lon = -73.98, lat = 40.76), zoom = 13) map_14 <- get_map(location = c(lon = -73.98, lat = 40.76), zoom = 14) map_15 <- get_map(location = c(lon = -73.98, lat = 40.76), zoom = 15) q1 <- ggmap(map_14) + geom_point(aes(x = dropoff_longitude, y = dropoff_latitude), data = mht_sample_df, alpha = 0.15, na.rm = TRUE, col = "red", size = .5) + theme_nothing(legend = TRUE) q2 <- ggmap(map_15) + geom_point(aes(x = dropoff_longitude, y = dropoff_latitude), data = mht_sample_df, alpha = 0.15, na.rm = TRUE, col = "red", size = .5) + theme_nothing(legend = TRUE) require(gridExtra) grid.arrange(q1, q2, ncol = 2) ##################################################################### # 4.1b creating clusters ##################################################################### xydata <- transmute(mht_sample_df, long_std = dropoff_longitude / -74, lat_std = dropoff_latitude / 40) start_time <- Sys.time() rxkm_sample <- kmeans(xydata, centers = 300, iter.max = 2000, nstart = 50) Sys.time() - start_time # we need to put the centroids back into the original scale for coordinates centroids_sample <- rxkm_sample$centers %>% as.data.frame %>% transmute(long = long_std*(-74), lat = lat_std*40, size = rxkm_sample$size) head(centroids_sample) start_time <- Sys.time() rxkm <- rxKmeans( ~ long_std + lat_std, data = mht_xdf, outFile = mht_xdf, outColName = "dropoff_cluster", centers = rxkm_sample$centers, transforms = list(long_std = dropoff_longitude / -74, lat_std = dropoff_latitude / 40), blocksPerRead = 1, overwrite = TRUE, # need to set this when writing to same file maxIterations = 500, reportProgress = -1) Sys.time() - start_time clsdf <- cbind( transmute(as.data.frame(rxkm$centers), long = long_std*(-74), lat = lat_std*40), size = rxkm$size, withinss = rxkm$withinss) head(clsdf) centroids_whole <- cbind( transmute(as.data.frame(rxkm$centers), long = long_std*(-74), lat = lat_std*40), size = rxkm$size, withinss = rxkm$withinss) q1 <- ggmap(map_15) + geom_point(data = centroids_sample, aes(x = long, y = lat, alpha = size), na.rm = TRUE, size = 1, col = 'red') + theme_nothing(legend = TRUE) + labs(title = "centroids using sample data") q2 <- ggmap(map_15) + geom_point(data = centroids_whole, aes(x = long, y = lat, alpha = size), na.rm = TRUE, size = 1, col = 'red') + theme_nothing(legend = TRUE) + labs(title = "centroids using whole data") require(gridExtra) grid.arrange(q1, q2, ncol = 2) ##################################################################### # 4.2a linear model predicting tip percent ##################################################################### form_1 <- as.formula(tip_percent ~ pickup_nb:dropoff_nb + pickup_dow:pickup_hour) rxlm_1 <- rxLinMod(form_1, data = mht_xdf, dropFirst = TRUE, covCoef = TRUE) rxs <- rxSummary( ~ pickup_nb + dropoff_nb + pickup_hour + pickup_dow, mht_xdf) ll <- lapply(rxs$categorical, function(x) x[ , 1]) names(ll) <- c('pickup_nb', 'dropoff_nb', 'pickup_hour', 'pickup_dow') pred_df_1 <- expand.grid(ll) pred_df_1 <- rxPredict(rxlm_1, data = pred_df_1, computeStdErrors = TRUE, writeModelVars = TRUE) names(pred_df_1)[1:2] <- paste(c('tip_pred', 'tip_stderr'), 1, sep = "_") head(pred_df_1, 10) ##################################################################### # 4.2b examining predictions ##################################################################### ggplot(pred_df_1, aes(x = pickup_nb, y = dropoff_nb)) + geom_tile(aes(fill = tip_pred_1), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "red") + coord_fixed(ratio = .9) ggplot(pred_df_1, aes(x = pickup_dow, y = pickup_hour)) + geom_tile(aes(fill = tip_pred_1), colour = "white") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_fill_gradient(low = "white", high = "red") + coord_fixed(ratio = .9) ##################################################################### # 4.2c choosing between models ##################################################################### form_2 <- as.formula(tip_percent ~ pickup_nb:dropoff_nb) rxlm_2 <- rxLinMod(form_2, data = mht_xdf, dropFirst = TRUE, covCoef = TRUE) pred_df_2 <- rxPredict(rxlm_2, data = pred_df_1, computeStdErrors = TRUE, writeModelVars = TRUE) names(pred_df_2)[1:2] <- paste(c('tip_pred', 'tip_stderr'), 2, sep = "_") pred_df <- pred_df_2 %>% select(starts_with('tip_')) %>% cbind(pred_df_1) %>% arrange(pickup_nb, dropoff_nb, pickup_dow, pickup_hour) %>% select(pickup_dow, pickup_hour, pickup_nb, dropoff_nb, starts_with('tip_pred_')) head(pred_df) ggplot(data = pred_df) + geom_density(aes(x = tip_pred_1, col = "complex")) + geom_density(aes(x = tip_pred_2, col = "simple")) + facet_grid(pickup_hour ~ pickup_dow) + xlab('tip percent prediction') rxQuantile("tip_percent", data = mht_xdf, probs = seq(0, 1, by = .05)) pred_df %>% mutate_at(vars(tip_pred_1, tip_pred_2), funs(cut(., c(-Inf, 8, 12, 15, 18, Inf)))) %>% ggplot() + geom_bar(aes(x = tip_pred_1, fill = "complex", alpha = .5)) + geom_bar(aes(x = tip_pred_2, fill = "simple", alpha = .5)) + facet_grid(pickup_hour ~ pickup_dow) + xlab('tip percent prediction') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ##################################################################### # 4.2e using other algorithms ##################################################################### dir.create('output', showWarnings = FALSE) rx_split_xdf <- function(xdf = mht_xdf, split_perc = 0.75, output_path = "output/split", ...) { # first create a column to split by outFile <- tempfile(fileext = 'xdf') rxDataStep(inData = xdf, outFile = xdf, transforms = list( split = factor(ifelse(rbinom(.rxNumRows, size = 1, prob = splitperc), "train", "test"))), transformObjects = list(splitperc = split_perc), overwrite = TRUE, ...) # then split the data in two based on the column we just created splitDS <- rxSplit(inData = xdf, outFilesBase = file.path(output_path, "train"), splitByFactor = "split", overwrite = TRUE) return(splitDS) } # we can now split to data in two mht_split <- rx_split_xdf(xdf = mht_xdf, varsToKeep = c('payment_type', 'fare_amount', 'tip_amount', 'tip_percent', 'pickup_hour', 'pickup_dow', 'pickup_nb', 'dropoff_nb')) names(mht_split) <- c("train", "test") system.time(linmod <- rxLinMod(tip_percent ~ pickup_nb:dropoff_nb + pickup_dow:pickup_hour, data = mht_split$train, reportProgress = 0)) system.time(dtree <- rxDTree(tip_percent ~ pickup_nb + dropoff_nb + pickup_dow + pickup_hour, data = mht_split$train, pruneCp = "auto", reportProgress = 0)) system.time(dforest <- rxDForest(tip_percent ~ pickup_nb + dropoff_nb + pickup_dow + pickup_hour, mht_split$train, nTree = 10, importance = TRUE, useSparseCube = TRUE, reportProgress = 0)) trained.models <- list(linmod = linmod, dtree = dtree, dforest = dforest) save(trained.models, file = 'trained_models.Rdata') ##################################################################### # 4.2f comparing predictions ##################################################################### pred_df <- expand.grid(ll) pred_df_1 <- rxPredict(trained.models$linmod, data = pred_df, predVarNames = "pred_linmod") pred_df_2 <- rxPredict(trained.models$dtree, data = pred_df, predVarNames = "pred_dtree") pred_df_3 <- rxPredict(trained.models$dforest, data = pred_df, predVarNames = "pred_dforest") pred_df <- do.call(cbind, list(pred_df, pred_df_1, pred_df_2, pred_df_3)) head(pred_df) observed_df <- rxSummary(tip_percent ~ pickup_nb:dropoff_nb:pickup_dow:pickup_hour, mht_xdf) observed_df <- observed_df$categorical[[1]][ , c(2:6)] pred_df <- inner_join(pred_df, observed_df, by = names(pred_df)[1:4]) ggplot(data = pred_df) + geom_density(aes(x = Means, col = "observed average")) + geom_density(aes(x = pred_linmod, col = "linmod")) + geom_density(aes(x = pred_dtree, col = "dtree")) + geom_density(aes(x = pred_dforest, col = "dforest")) + xlim(-1, 30) + xlab("tip percent") ##################################################################### # 4.2g judging predictive performance ##################################################################### rxPredict(trained.models$linmod, data = mht_split$test, outData = mht_split$test, predVarNames = "tip_percent_pred_linmod", overwrite = TRUE) rxPredict(trained.models$dtree, data = mht_split$test, outData = mht_split$test, predVarNames = "tip_percent_pred_dtree", overwrite = TRUE) rxPredict(trained.models$dforest, data = mht_split$test, outData = mht_split$test, predVarNames = "tip_percent_pred_dforest", overwrite = TRUE) rxSummary(~ SSE_linmod + SSE_dtree + SSE_dforest, data = mht_split$test, transforms = list(SSE_linmod = (tip_percent - tip_percent_pred_linmod)^2, SSE_dtree = (tip_percent - tip_percent_pred_dtree)^2, SSE_dforest = (tip_percent - tip_percent_pred_dforest)^2)) rxc <- rxCor( ~ tip_percent + tip_percent_pred_linmod + tip_percent_pred_dtree + tip_percent_pred_dforest, data = mht_split$test) print(rxc)