More Related Content

More from Takeshi Arabiki(17)

Recently uploaded(20)

Liqid: Composable CXL Preview
CXL Forum • 118 views
CXL at OCP
CXL Forum • 183 views
Green Leaf Consulting: Capabilities Deck
GreenLeafConsulting • 170 views
Java Platform Approach 1.0 - Picnic Meetup
Rick Ossendrijver • 23 views

Rデータフレーム自由自在

  • 1. Rデータフレーム自由自在 Tsukuba.R #9 (2011/11/12) @a_bicky
  • 2. • Takeshi Arabiki 1 ‣ Twitter: @a_bicky ‣ : id:a_bicky • R • http://d.hatena.ne.jp/a_bicky/
  • 3. • Takeshi Arabiki 1 ‣ Twitter: @a_bicky ‣ : id:a_bicky • R SciPy • http://d.hatena.ne.jp/a_bicky/
  • 4. Osaka.R #4 Tokyo.R #16 http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336
  • 5. • • R 8 ,9 • • • • http://www.amazon.co.jp/gp/product/4431712186
  • 7. reshape2 > install.packages("reshape2") > library(reshape2) > head(tips) # total_bill tip sex smoker day time size 1 16.99 1.01 Female No Sun Dinner 2 2 10.34 1.66 Male No Sun Dinner 3 3 21.01 3.50 Male No Sun Dinner 3 4 23.68 3.31 Male No Sun Dinner 2 5 24.59 3.61 Female No Sun Dinner 4 6 25.29 4.71 Male No Sun Dinner 4
  • 8. tips total_bill: tip: sex: Male, Female smoker: Yes, No day: Thur, Fri, Sat, Sun time: Lunch, Dinner size:
  • 10. • • • subset • cbind, [, $, [[ • transform, within • • subset • cbind, [, $, [[ • transform, within • • order •
  • 11. > class(tips) [1] "data.frame" > mode(tips) # a data.frame is a list [1] "list" > head(tips[["total_bill"]]) # extract a column like a list element [1] 16.99 10.34 21.01 23.68 24.59 25.29 > head(tips$total_bill) # same result as above [1] 16.99 10.34 21.01 23.68 24.59 25.29 > head(tips["total_bill"]) # returns a data.frame total_bill 1 16.99 2 10.34 3 21.01 4 23.68 5 24.59 6 25.29
  • 12. > head(tips[c("total_bill", "tip")]) # total_bill tip 1 16.99 1.01 2 10.34 1.66 3 21.01 3.50 4 23.68 3.31 5 24.59 3.61 6 25.29 4.71 > head(tips[[c("total_bill", "tip")]]) # Error in .subset2(x, i, exact = exact) : subscript out of bounds > tips[[c(1, 2)]] # tips[[1]][[2]] [1] 10.34
  • 13. > tips[1:2, 1:2] # total_bill tip 1 16.99 1.01 2 10.34 1.66 > tips[1:2, c("total_bill", "tip")] # total_bill tip 1 16.99 1.01 2 10.34 1.66 > head(tips[-(1:2), -(1:2)]) # sex smoker day time size 3 Male No Sun Dinner 3 4 Male No Sun Dinner 2 5 Female No Sun Dinner 4 6 Male No Sun Dinner 4 7 Male No Sun Dinner 2 8 Male No Sun Dinner 4
  • 14. subset > args(subset.data.frame) function (x, subset, select, drop = FALSE, ...) NULL > (tips.vip <- subset(tips, total_bill > 30 & size == 2)) total_bill tip sex smoker day time size 84 32.68 5.00 Male Yes Thur Lunch 2 174 31.85 3.18 Male Yes Sun Dinner 2 176 32.90 3.11 Male Yes Sun Dinner 2 180 34.63 3.55 Male Yes Sun Dinner 2 185 40.55 3.00 Male Yes Sun Dinner 2 238 32.83 1.17 Male Yes Sat Dinner 2 > levels(tips.vip$smoker) # [1] "No" "Yes" > levels(droplevels(tips.vip)$smoker) # [1] "Yes"
  • 15. cbind, [, $, [[ > head(cbind(tips, type = ifelse(tips$tip < 2, " ", " ")), 3) total_bill tip sex smoker day time size type 1 16.99 1.01 Female No Sun Dinner 2 2 10.34 1.66 Male No Sun Dinner 3 3 21.01 3.50 Male No Sun Dinner 3 > tips$type <- ifelse(tips$tip < 2, " ", " ") > head(tips, 3) total_bill tip sex smoker day time size type 1 16.99 1.01 Female No Sun Dinner 2 2 10.34 1.66 Male No Sun Dinner 3 3 21.01 3.50 Male No Sun Dinner 3 > data(tips) #
  • 16. transform, within > args(transform.data.frame) function (`_data`, ...) NULL > head(transform(tips, type = ifelse(tips$tip < 2, " ", " ")), 3) total_bill tip sex smoker day time size type 1 16.99 1.01 Female No Sun Dinner 2 2 10.34 1.66 Male No Sun Dinner 3 3 21.01 3.50 Male No Sun Dinner 3 > args(within.data.frame) function (data, expr, ...) NULL > head(within(tips, { type <- c() # within + type[tip < 2] <- " " + type[tip >= 2] <- " " }), 3) total_bill tip sex smoker day time size type 1 16.99 1.01 Female No Sun Dinner 2 2 10.34 1.66 Male No Sun Dinner 3 3 21.01 3.50 Male No Sun Dinner 3
  • 17. subset > # subset > head(subset(tips, select = c(tip, sex, smoker)), 1) tip sex smoker 1 1.01 Female No > head(subset(tips, select = 2:4), 1) tip sex smoker 1 1.01 Female No > head(subset(tips, select = -c(total_bill, size, time, day)), 1) tip sex smoker 1 1.01 Female No > head(subset(tips, select = -c(1, 5:7)), 1) tip sex smoker 1 1.01 Female No > head(subset(tips, select = c(tip:smoker)), 1) tip sex smoker 1 1.01 Female No > head(subset(tips, select = -c(total_bill, day:size)), 1) tip sex smoker 1 1.01 Female No
  • 18. [, $, [[ > # NULL > tips$size <- NULL > head(tips, 3) total_bill tip sex smoker day time 1 16.99 1.01 Female No Sun Dinner 2 10.34 1.66 Male No Sun Dinner 3 21.01 3.50 Male No Sun Dinner > tips[["time"]] <- NULL > head(tips, 3) total_bill tip sex smoker day 1 16.99 1.01 Female No Sun 2 10.34 1.66 Male No Sun 3 21.01 3.50 Male No Sun > tips["day"] <- NULL; tips[1] <- NULL > head(tips, 3) tip sex smoker 1 1.01 Female No 2 1.66 Male No 3 3.50 Male No > data(tips)
  • 19. transform, within > # NULL > head(transform(tips, total_bill = NULL, size = NULL, time = NULL, day = NULL), 3) tip sex smoker 1 1.01 Female No 2 1.66 Male No 3 3.50 Male No > # rm > head(within(tips, rm(total_bill, size, time, day)), 3) tip sex smoker 1 1.01 Female No 2 1.66 Male No 3 3.50 Male No
  • 20. > head(transform(tips, tip = 10), 3) total_bill tip sex smoker day time size 1 16.99 10 Female No Sun Dinner 2 2 10.34 10 Male No Sun Dinner 3 3 21.01 10 Male No Sun Dinner 3 > head(within(tips, tip <- 10), 3) total_bill tip sex smoker day time size 1 16.99 10 Female No Sun Dinner 2 2 10.34 10 Male No Sun Dinner 3 3 21.01 10 Male No Sun Dinner 3 > tips$tip <- 10 > head(tips, 3) total_bill tip sex smoker day time size 1 16.99 10 Female No Sun Dinner 2 2 10.34 10 Male No Sun Dinner 3 3 21.01 10 Male No Sun Dinner 3 > data(tips)
  • 21. order > head(tips[order(tips$sex), ], 4) # total_bill tip sex smoker day time size 1 16.99 1.01 Female No Sun Dinner 2 5 24.59 3.61 Female No Sun Dinner 4 12 35.26 5.00 Female No Sun Dinner 4 15 14.83 3.02 Female No Sun Dinner 2 > head(tips[order(tips$sex, decreasing = TRUE), ], 4) # total_bill tip sex smoker day time size 2 10.34 1.66 Male No Sun Dinner 3 3 21.01 3.50 Male No Sun Dinner 3 4 23.68 3.31 Male No Sun Dinner 2 6 25.29 4.71 Male No Sun Dinner 4 > head(tips[order(tips$sex, tips$tip), ], 4) # total_bill tip sex smoker day time size 68 3.07 1.00 Female Yes Sat Dinner 1 93 5.75 1.00 Female Yes Fri Dinner 2 112 7.25 1.00 Female No Sat Dinner 1 1 16.99 1.01 Female No Sun Dinner 2
  • 22. data.frame > (tip <- data.frame(date = sample(seq(as.Date("2011-11-09"), by = "day", len = 4)), + total_bill = sample(1:4 * 10), + tip = sample(1:4))) date total_bill tip 1 2011-11-10 30 4 2 2011-11-12 40 2 3 2011-11-11 10 1 4 2011-11-09 20 3 > # > tip <- tip[order(tip$date), ] > transform(tip, total_bill = cumsum(total_bill), tip = cumsum(tip)) date total_bill tip 4 2011-11-09 20 3 1 2011-11-10 50 7 3 2011-11-11 60 8 2 2011-11-12 100 10
  • 23. > head(tips[c("tip", "total_bill", "sex", "size", "time", "day", "smoker")]) tip total_bill sex size time day smoker 1 10 16.99 Female 2 Dinner Sun No 2 10 10.34 Male 3 Dinner Sun No 3 10 21.01 Male 3 Dinner Sun No 4 10 23.68 Male 2 Dinner Sun No 5 10 24.59 Female 4 Dinner Sun No 6 10 25.29 Male 4 Dinner Sun No
  • 25. • • table • xtabs • aggregate • by
  • 26. > args(colSums) function (x, na.rm = FALSE, dims = 1L) NULL > colSums(subset(tips, select = c(total_bill, tip)), na.rm = TRUE) total_bill tip 4827.77 731.58 > args(colMeans) function (x, na.rm = FALSE, dims = 1L) NULL > colMeans(subset(tips, select = c(total_bill, tip)), na.rm = TRUE) total_bill tip 19.785943 2.998279 > # apply colSums > apply(subset(tips, select = c(total_bill, tip)), 2, sum, na.rm = TRUE) total_bill tip 4827.77 731.58
  • 27. table > args(table) function (..., exclude = if (useNA == "no") c(NA, NaN), useNA = c("no", "ifany", "always"), dnn = list.names(...), deparse.level = 1) NULL > table(subset(tips, select = c(sex, smoker))) smoker sex No Yes Female 54 33 Male 97 60 > # 4 > table(subset(tips, select = c(sex, smoker, day, size))) , , day = Fri, size = 1 smoker sex No Yes Female 0 0 Male 0 1
  • 28. table > args(addmargins) function (A, margin = seq_along(dim(A)), FUN = sum, quiet = FALSE) NULL > # > addmargins(table(subset(tips, select = c(sex, smoker)))) smoker sex No Yes Sum Female 54 33 87 Male 97 60 157 Sum 151 93 244 > # > args(prop.table) function (x, margin = NULL) NULL > prop.table(table(subset(tips, select = c(sex, smoker)))) smoker sex No Yes Female 0.2213115 0.1352459 Male 0.3975410 0.2459016
  • 29. xtabs > args(xtabs) function (formula = ~., data = parent.frame(), subset, sparse = FALSE, na.action, exclude = c(NA, NaN), drop.unused.levels = FALSE) NULL > # > xtabs(~ sex + smoker, tips) smoker sex No Yes Female 54 33 Male 97 60 > # > xtabs(cbind(total_bill, tip) ~ sex + smoker, tips) , , = total_bill smoker sex No Yes Female 977.68 593.27 Male 1919.75 1337.07
  • 30. aggregate > args(aggregate.data.frame) function (x, by, FUN, ..., simplify = TRUE) NULL > # FUN 1 > aggregate(tips[c("total_bill", "tip")], tips[c("sex", "day")], sum) sex day total_bill tip 1 Female Fri 127.31 25.03 2 Male Fri 198.57 26.93 3 Female Sat 551.05 78.45 4 Male Sat 1227.35 181.95 5 Female Sun 357.70 60.61 6 Male Sun 1269.46 186.78 7 Female Thur 534.89 82.42 8 Male Thur 561.44 89.41 > # formula > aggregate(cbind(total_bill, tip) ~ sex + day, tips, sum) sex day total_bill tip 1 Female Fri 127.31 25.03
  • 31. by > args(by) function (data, INDICES, FUN, ..., simplify = TRUE) NULL > # aggregate FUN OK > (ret <- by(tips[c("total_bill", "tip")], tips[c("sex", "day")], range)) sex: Female day: Fri [1] 1.00 22.75 ------------------------------------------------------------ sex: Male day: Fri [1] 1.50 40.17 > # data.frame > cbind(expand.grid(dimnames(ret)), do.call(rbind, ret)) sex day 1 2 1 Female Fri 1.00 22.75 2 Male Fri 1.50 40.17
  • 33. reshape • merge
  • 34. reshape > args(reshape) function (data, varying = NULL, v.names = NULL, timevar = "time", idvar = "id", ids = 1L:NROW(data), times = seq_along(varying[[1L]]), drop = NULL, direction, new.row.names = NULL, sep = ".", split = if (sep == "") { list(regexp = "[A-Za-z][0-9]", include = TRUE) } else { list(regexp = sep, include = FALSE, fixed = TRUE) }) NULL > head(reshape(tips, idvar = c("sex", "smoker", "time", "size"), + timevar = "day", drop = "total_bill", direction = "wide")) sex smoker time size tip.Sun tip.Sat tip.Thur tip.Fri 1 Female No Dinner 2 1.01 2.75 3 3.25 2 Male No Dinner 3 1.66 3.35 NA NA 4 Male No Dinner 2 3.31 4.08 NA 3.50 5 Female No Dinner 4 3.61 2.45 NA NA 6 Male No Dinner 4 4.71 7.58 NA NA 17 Female No Dinner 3 1.67 3.07 NA NA
  • 35. reshape > # idvar timevar > (a <- data.frame(a = c(1:3, 1), b = c(1:3, 1), c = 1:4)) a b c 1 1 1 1 2 2 2 2 3 3 3 3 4 1 1 4 > reshape(a, idvar = "a", timevar = "b", direction = "wide") a c.1 c.2 c.3 1 1 1 NA NA 2 2 NA 2 NA 3 3 NA NA 3
  • 36. merge > # > (user.type <- data.frame(sex = rep(c("Male", "Female"), each = 2), + smoker = c("Yes", "No"), + type = LETTERS[1:4])) sex smoker type 1 Male Yes A 2 Male No B 3 Female Yes C 4 Female No D > args(merge.data.frame) function (x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), incomparables = NULL, ...) NULL > merge(tips, user.type, by = c("sex", "smoker"), sort = FALSE)[54:55, ] sex smoker total_bill tip day time size type 54 Female No 10.65 1.50 Thur Lunch 2 D 55 Male No 10.27 1.71 Sun Dinner 2 B
  • 38. • • R • reshape2 • melt • cast •
  • 39. Excel
  • 40. R > acast(melt(tips, id.var = c("sex", "smoker", "day"), measure.var = "tip"), + sex + smoker ~ day, sum, margins = TRUE) Fri Sat Sun Thur (all) Female_No 6.25 35.42 46.61 61.49 149.77 Female_Yes 18.78 43.03 14.00 18.93 94.74 Female_(all) 25.03 78.45 60.61 80.42 244.51 Male_No 5.00 104.21 133.96 58.83 302.00 Male_Yes 21.93 77.74 52.82 30.58 183.07 Male_(all) 26.93 181.95 186.78 89.41 485.07 (all)_(all) 51.96 260.40 247.39 169.83 729.58 reshape2
  • 41. reshape2 melt cast melt id > head(tipsm <- melt(tips, measure.vars = c("total_bill", "tip"))) sex smoker day time size variable value 1 Female No Sun Dinner 2 total_bill 16.99 2 Male No Sun Dinner 3 total_bill 10.34 3 Male No Sun Dinner 3 total_bill 21.01 4 Male No Sun Dinner 2 total_bill 23.68 5 Female No Sun Dinner 4 total_bill 24.59 6 Male No Sun Dinner 4 total_bill 25.29 > levels(tipsm$variable) [1] "total_bill" "tip"
  • 42. melt > args(melt.data.frame) function (data, id.vars, measure.vars, variable_name = "variable", na.rm = !preserve.na, preserve.na = TRUE, ...) NULL > # factor id > head(melt(tips), 1) Using sex, smoker, day, time as id variables sex smoker day time variable value 1 Female No Sun Dinner total_bill 16.99 > # id measure > head(melt(tips, id.vars = c("sex", "smoker", "day", "time", "size")), 1) sex smoker day time size variable value 1 Female No Sun Dinner 2 total_bill 16.99 > # id measure > head(melt(tips, id.vars = c("sex", "smoker", "day", "time", "size"), + measure.vars = "tip"), 1) sex smoker day time size variable value 1 Female No Sun Dinner 2 tip 1.01
  • 43. cast formula fun.aggregate > args(acast) # array acast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL > args(dcast) # data.frame dcast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL formula ... . acast hoge ~ fuga ~ piyo ※dcast 1 hoge ~ fuga + piyo
  • 44. > tipsm <- melt(tips, measure.vars = c("total_bill", "tip")) > acast(tipsm, sex ~ smoker, length) No Yes Female 108 64 Male 194 120 > # > acast(tipsm, smoker ~ sex, length) Female Male No 108 194 Yes 64 120 > # > acast(tipsm, sex ~ smoker, length, margins = TRUE) No Yes (all) Female 108 64 172 Male 194 120 314 (all) 302 184 486
  • 45. > # size > acast(tipsm, smoker ~ sex + size, length) Female_1 Female_2 Female_3 Female_4 Female_5 Female_6 Male_1 Male_2 Male_3 No 4 66 18 14 2 4 0 114 34 Yes 2 48 10 4 0 0 2 82 14 Male_4 Male_5 Male_6 No 38 4 4 Yes 18 4 0 > # 3 > acast(tipsm, smoker ~ sex ~ size, length) , , 1 Female Male No 4 0 Yes 2 2
  • 46. > # sum (total_bill and tip combined) > acast(tipsm, sex ~ day, sum) Fri Sat Sun Thur Female 152.34 629.5 418.31 617.31 Male 225.50 1409.3 1456.24 650.85 > # sum of total_bill and tip separately > acast(tipsm, sex + variable ~ day, sum) Fri Sat Sun Thur Female_total_bill 127.31 551.05 357.70 534.89 Female_tip 25.03 78.45 60.61 82.42 Male_total_bill 198.57 1227.35 1269.46 561.44 Male_tip 26.93 181.95 186.78 89.41 > # sum of tip only > acast(tipsm, sex ~ day, sum, subset = .(variable == "tip")) Fri Sat Sun Thur Female 25.03 78.45 60.61 82.42 Male 26.93 181.95 186.78 89.41
  • 48. reshape2 aggregate table xtabs