plyr split-apply-combine for mortals sean anderson - - PowerPoint PPT Presentation

plyr
SMART_READER_LITE
LIVE PREVIEW

plyr split-apply-combine for mortals sean anderson - - PowerPoint PPT Presentation

plyr split-apply-combine for mortals sean anderson sean_anderson@sfu.ca why? 1. it's everywhere 2. less code, simple syntax 3. it runs faster look familiar? > d year count 1 2000 16 2 2000 4 3 2000 12 4 2001 15


slide-1
SLIDE 1

plyr

split-apply-combine for mortals

sean anderson sean_anderson@sfu.ca

slide-2
SLIDE 2
  • 1. it’s everywhere
  • 2. less code, simple syntax
  • 3. it runs faster

why?

slide-3
SLIDE 3

look familiar?

> d year count 1 2000 16 2 2000 4 3 2000 12 4 2001 15 5 2001 7 6 2001 12 7 2002 20 ...

slide-4
SLIDE 4

less code subsetting saving results faster

why apply > for loop?

slide-5
SLIDE 5

> d year count 1 2000 16 2 2000 4 3 2000 12 4 2001 15 5 2001 7 6 2001 12 7 2002 20 ...

slide-6
SLIDE 6

year mean 1 2000 10.66667 2 2001 11.33333 3 2002 13.66667

slide-7
SLIDE 7

d.split <- split(d, d$year) results <- vector("list", length = length(d.split)) for(i in 1:length(d.split)) { temp <- d.split[[i]] temp.mean <- mean(temp$count) results[[i]] <- data.frame( year = unique(temp$year), mean = temp.mean) } do.call("rbind", results)

inspired by Hadley Wickham: http://had.co.nz/plyr/

slide-8
SLIDE 8

apply(array, 1 or 2, func) sapply(vector, func) lapply(list, func) tapply(vector, index, func) aggregate(object, by, func) ...

slide-9
SLIDE 9

d.split <- split(d, d$year) result <- lapply(d.split, function(x) mean(x$count)) result <- unlist(result) result <- data.frame(year = unique(d$year), mean = result) row.names(result) <- NULL

slide-10
SLIDE 10

enter plyr

slide-11
SLIDE 11

ddply(d, "year", summarize, mean = mean(count))

slide-12
SLIDE 12

d.split <- split(d, d$year) results <- vector("list", length = length(d.split)) for(i in 1:length(d.split)) { temp <- d.split[[i]] temp.mean <- mean(temp$count) results[[i]] <- data.frame( year = unique(temp$year), mean = temp.mean) } do.call("rbind", results)

slide-13
SLIDE 13

ddply()

  • output

input

slide-14
SLIDE 14

d - data frame l - list a - array _ - discard

slide-15
SLIDE 15

ddply(data, "split", function)

slide-16
SLIDE 16

ddply(d, "year", summarise, mean.count = mean(count))

slide-17
SLIDE 17

year mean.count 1 2000 10.66667 2 2001 11.33333 3 2002 13.66667

slide-18
SLIDE 18

ddply(d, "year", transform, total.count = sum(count))

slide-19
SLIDE 19

year count total.count 1 2000 16 32 2 2000 4 32 3 2000 12 32 4 2001 15 34 5 2001 7 34 6 2001 12 34 7 2002 20 41 8 2002 15 41 9 2002 6 41

slide-20
SLIDE 20

ddply(d, "year", function(x) { browser() }) Browse[1]> x year count 1 2000 16 2 2000 4 3 2000 12 Browse[1]> Q >

slide-21
SLIDE 21

library(doMC)
registerDoMC(2) # 2 cores
ddply(d, f, .parallel = TRUE)

slide-22
SLIDE 22

failwith(default, f) # fail gracefully:

slide-23
SLIDE 23
  • 1. it’s everywhere
  • 2. less code, simple syntax
  • 3. it runs faster (sometimes)

remember: use it.