Rデータフレーム自由自在

R
Tsukuba.R #9 (2011/11/12)
@a_bicky

• Takeshi Arabiki 1

‣ Twitter: @a_bicky
‣ Hatena: id:a_bicky

•
R

•
http://d.hatena.ne.jp/a_bicky/

• Takeshi Arabiki 1

‣ Twitter: @a_bicky
‣ Hatena: id:a_bicky

•
R SciPy

•
http://d.hatena.ne.jp/a_bicky/

Osaka.R #4 Tokyo.R #16

http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336

•
• R 8 ,9

•
•
•
•

http://www.amazon.co.jp/gp/product/4431712186

reshape2
> install.packages("reshape2")
> library(reshape2)
> head(tips) #
total_bill tip sex smoker day time size
1 16.99 1.01 Female No Sun Dinner 2
2 10.34 1.66 Male No Sun Dinner 3

tips

total_bill:
tip:
sex: Male, Female
smoker: Yes, No
day: Thur, Fri, Sat, Sun
time: Lunch, Dinner
size:

•
•
• subset
• cbind, [, $, [[
• transform, within
•
• subset
• cbind, [, $, [[
• transform, within
•
• order
•

> class(tips)
[1] "data.frame"
> mode(tips) # a data.frame is a list
[1] "list"
> head(tips[["total_bill"]]) # list
[1] 16.99 10.34 21.01 23.68 24.59 25.29
> head(tips$total_bill) #
[1] 16.99 10.34 21.01 23.68 24.59 25.29
> head(tips["total_bill"]) # data.frame
total_bill
1 16.99
2 10.34
3 21.01
4 23.68
5 24.59
6 25.29

> head(tips[c("total_bill", "tip")]) #
total_bill tip
1 16.99 1.01
2 10.34 1.66
3 21.01 3.50
4 23.68 3.31
5 24.59 3.61
6 25.29 4.71
> head(tips[[c("total_bill", "tip")]]) #
Error in .subset2(x, i, exact = exact) : subscript out of bounds
> tips[[c(1, 2)]] # same as tips[[1]][[2]]
[1] 10.34

> tips[1:2, 1:2] #
total_bill tip
1 16.99 1.01
2 10.34 1.66
> tips[1:2, c("total_bill", "tip")] #
total_bill tip
1 16.99 1.01
2 10.34 1.66
> head(tips[-(1:2), -(1:2)]) #
sex smoker day time size
3 Male No Sun Dinner 3
5 Female No Sun Dinner 4

subset

> args(subset.data.frame)
function (x, subset, select, drop = FALSE, ...)
NULL
> (tips.vip <- subset(tips, total_bill > 30 & size == 2))
84 32.68 5.00 Male Yes Thur Lunch 2
174 31.85 3.18 Male Yes Sun Dinner 2
238 32.83 1.17 Male Yes Sat Dinner 2
> levels(tips.vip$smoker) #
[1] "No" "Yes"
> levels(droplevels(tips.vip)$smoker) #
[1] "Yes"

cbind, [, $, [[

> head(cbind(tips, type = ifelse(tips$tip < 2, " ", " ")), 3)
total_bill tip sex smoker day time size type
> tips$type <- ifelse(tips$tip < 2, " ", " ")
> head(tips, 3)
> data(tips) #

transform, within

> args(transform.data.frame)
function (`_data`, ...)
NULL
> head(transform(tips, type = ifelse(tips$tip < 2, " ", " ")), 3)
> args(within.data.frame)
function (data, expr, ...)
NULL
> head(within(tips, { type <- c() # within
+ type[tip < 2] <- " "
+ type[tip >= 2] <- " " }), 3)

subset

> # subset
> head(subset(tips, select = c(tip, sex, smoker)), 1)
tip sex smoker
1 1.01 Female No
> head(subset(tips, select = 2:4), 1)
tip sex smoker
1 1.01 Female No
> head(subset(tips, select = -c(total_bill, size, time, day)), 1)
tip sex smoker
1 1.01 Female No
> head(subset(tips, select = -c(1, 5:7)), 1)
tip sex smoker
1 1.01 Female No
> head(subset(tips, select = c(tip:smoker)), 1)
tip sex smoker
1 1.01 Female No
> head(subset(tips, select = -c(total_bill, day:size)), 1)
tip sex smoker
1 1.01 Female No

[, $, [[

> # delete columns by assigning NULL
> tips$size <- NULL
> head(tips, 3)
total_bill tip sex smoker day time
1 16.99 1.01 Female No Sun Dinner
2 10.34 1.66 Male No Sun Dinner
3 21.01 3.50 Male No Sun Dinner
> tips[["time"]] <- NULL
> head(tips, 3)
total_bill tip sex smoker day
1 16.99 1.01 Female No Sun
2 10.34 1.66 Male No Sun
3 21.01 3.50 Male No Sun
> tips["day"] <- NULL; tips[1] <- NULL
> head(tips, 3)
tip sex smoker
1 1.01 Female No
2 1.66 Male No
3 3.50 Male No
> data(tips)

transform, within

> # NULL
> head(transform(tips, total_bill = NULL, size = NULL, time = NULL, day = NULL), 3)
tip sex smoker
1 1.01 Female No
2 1.66 Male No
3 3.50 Male No
> # rm
> head(within(tips, rm(total_bill, size, time, day)), 3)
tip sex smoker
1 1.01 Female No
2 1.66 Male No
3 3.50 Male No

> head(transform(tips, tip = 10), 3)
1 16.99 10 Female No Sun Dinner 2
2 10.34 10 Male No Sun Dinner 3
> head(within(tips, tip <- 10), 3)
> tips$tip <- 10
> head(tips, 3)
> data(tips)

order

> head(tips[order(tips$sex), ], 4) #
> head(tips[order(tips$sex, decreasing = TRUE), ], 4) #
> head(tips[order(tips$sex, tips$tip), ], 4) #
68 3.07 1.00 Female Yes Sat Dinner 1
93 5.75 1.00 Female Yes Fri Dinner 2
112 7.25 1.00 Female No Sat Dinner 1

data.frame

> (tip <- data.frame(date = sample(seq(as.Date("2011-11-09"), by = "day", len = 4)),
+ total_bill = sample(1:4 * 10),
+ tip = sample(1:4)))
date total_bill tip
1 2011-11-10 30 4
2 2011-11-12 40 2
3 2011-11-11 10 1
4 2011-11-09 20 3
> #
> tip <- tip[order(tip$date), ]
> transform(tip, total_bill = cumsum(total_bill), tip = cumsum(tip))
date total_bill tip
4 2011-11-09 20 3
1 2011-11-10 50 7
3 2011-11-11 60 8
2 2011-11-12 100 10

> head(tips[c("tip", "total_bill", "sex", "size", "time", "day", "smoker")])
tip total_bill sex size time day smoker
1 10 16.99 Female 2 Dinner Sun No
2 10 10.34 Male 3 Dinner Sun No
5 10 24.59 Female 4 Dinner Sun No

•
• table
• xtabs
• aggregate
• by

> args(colSums)
function (x, na.rm = FALSE, dims = 1L)
NULL
> colSums(subset(tips, select = c(total_bill, tip)), na.rm = TRUE)
total_bill tip
4827.77 731.58
> args(colMeans)
function (x, na.rm = FALSE, dims = 1L)
NULL
> colMeans(subset(tips, select = c(total_bill, tip)), na.rm = TRUE)
total_bill tip
19.785943 2.998279
> # apply with sum gives the same result as colSums
> apply(subset(tips, select = c(total_bill, tip)), 2, sum, na.rm = TRUE)
total_bill tip
4827.77 731.58

table

> args(table)
function (..., exclude = if (useNA == "no") c(NA, NaN), useNA = c("no",
"ifany", "always"), dnn = list.names(...), deparse.level = 1)
NULL
> table(subset(tips, select = c(sex, smoker)))
smoker
sex No Yes
Female 54 33
Male 97 60
> # 4
> table(subset(tips, select = c(sex, smoker, day, size)))
, , day = Fri, size = 1

smoker
sex No Yes
Female 0 0
Male 0 1

table

> args(addmargins)
function (A, margin = seq_along(dim(A)), FUN = sum, quiet = FALSE)
NULL
> #
> addmargins(table(subset(tips, select = c(sex, smoker))))
smoker
sex No Yes Sum
Female 54 33 87
Male 97 60 157
Sum 151 93 244
> #
> args(prop.table)
function (x, margin = NULL)
NULL
> prop.table(table(subset(tips, select = c(sex, smoker))))
smoker
sex No Yes
Female 0.2213115 0.1352459
Male 0.3975410 0.2459016

xtabs

> args(xtabs)
function (formula = ~., data = parent.frame(), subset, sparse = FALSE,
na.action, exclude = c(NA, NaN), drop.unused.levels = FALSE)
NULL
> #
> xtabs(~ sex + smoker, tips)
smoker
sex No Yes
Female 54 33
Male 97 60
> #
> xtabs(cbind(total_bill, tip) ~ sex + smoker, tips)
, , = total_bill

smoker
sex No Yes
Female 977.68 593.27
Male 1919.75 1337.07

aggregate

> args(aggregate.data.frame)
function (x, by, FUN, ..., simplify = TRUE)
NULL
> # FUN 1
> aggregate(tips[c("total_bill", "tip")], tips[c("sex", "day")], sum)
sex day total_bill tip
1 Female Fri 127.31 25.03
2 Male Fri 198.57 26.93
3 Female Sat 551.05 78.45
4 Male Sat 1227.35 181.95
5 Female Sun 357.70 60.61
6 Male Sun 1269.46 186.78
7 Female Thur 534.89 82.42
8 Male Thur 561.44 89.41
> # formula
> aggregate(cbind(total_bill, tip) ~ sex + day, tips, sum)
sex day total_bill tip
1 Female Fri 127.31 25.03

by

> args(by)
function (data, INDICES, FUN, ..., simplify = TRUE)
NULL
> # aggregate FUN OK
> (ret <- by(tips[c("total_bill", "tip")], tips[c("sex", "day")], range))
sex: Female
day: Fri
[1] 1.00 22.75
------------------------------------------------------------
sex: Male
day: Fri
[1] 1.50 40.17

> # data.frame
> cbind(expand.grid(dimnames(ret)), do.call(rbind, ret))
sex day 1 2
1 Female Fri 1.00 22.75
2 Male Fri 1.50 40.17

• reshape
• merge

reshape

> args(reshape)
function (data, varying = NULL, v.names = NULL, timevar = "time",
idvar = "id", ids = 1L:NROW(data), times = seq_along(varying[[1L]]),
drop = NULL, direction, new.row.names = NULL, sep = ".",
split = if (sep == "") {
list(regexp = "[A-Za-z][0-9]", include = TRUE)
} else {
list(regexp = sep, include = FALSE, fixed = TRUE)
})
NULL
> head(reshape(tips, idvar = c("sex", "smoker", "time", "size"),
+ timevar = "day", drop = "total_bill", direction = "wide"))
sex smoker time size tip.Sun tip.Sat tip.Thur tip.Fri
1 Female No Dinner 2 1.01 2.75 3 3.25
2 Male No Dinner 3 1.66 3.35 NA NA
4 Male No Dinner 2 3.31 4.08 NA 3.50
5 Female No Dinner 4 3.61 2.45 NA NA
6 Male No Dinner 4 4.71 7.58 NA NA
17 Female No Dinner 3 1.67 3.07 NA NA

reshape

> # when an idvar/timevar combination is duplicated, only the first matching row is kept
> (a <- data.frame(a = c(1:3, 1), b = c(1:3, 1), c = 1:4))
a b c
1 1 1 1
2 2 2 2
3 3 3 3
4 1 1 4
> reshape(a, idvar = "a", timevar = "b", direction = "wide")
a c.1 c.2 c.3
1 1 1 NA NA
2 2 NA 2 NA
3 3 NA NA 3

merge

> #
> (user.type <- data.frame(sex = rep(c("Male", "Female"), each = 2),
+ smoker = c("Yes", "No"),
+ type = LETTERS[1:4]))
sex smoker type
1 Male Yes A
2 Male No B
3 Female Yes C
4 Female No D
> args(merge.data.frame)
function (x, y, by = intersect(names(x), names(y)), by.x = by,
by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE,
suffixes = c(".x", ".y"), incomparables = NULL, ...)
NULL
> merge(tips, user.type, by = c("sex", "smoker"), sort = FALSE)[54:55, ]
sex smoker total_bill tip day time size type
54 Female No 10.65 1.50 Thur Lunch 2 D
55 Male No 10.27 1.71 Sun Dinner 2 B

•
• R
• reshape2
• melt
• cast
•

R

> acast(melt(tips, id.var = c("sex", "smoker", "day"), measure.var = "tip"),
+ sex + smoker ~ day, sum, margins = TRUE)
Fri Sat Sun Thur (all)
Female_No 6.25 35.42 46.61 61.49 149.77
Female_Yes 18.78 43.03 14.00 18.93 94.74
Female_(all) 25.03 78.45 60.61 80.42 244.51
Male_No 5.00 104.21 133.96 58.83 302.00
Male_Yes 21.93 77.74 52.82 30.58 183.07
Male_(all) 26.93 181.95 186.78 89.41 485.07
(all)_(all) 51.96 260.40 247.39 169.83 729.58

reshape2

reshape2

melt cast

melt
id
> head(tipsm <- melt(tips, measure.vars = c("total_bill", "tip")))
sex smoker day time size variable value
1 Female No Sun Dinner 2 total_bill 16.99
2 Male No Sun Dinner 3 total_bill 10.34
> levels(tipsm$variable)
[1] "total_bill" "tip"

melt
> args(melt.data.frame)
function (data, id.vars, measure.vars, variable_name = "variable",
na.rm = !preserve.na, preserve.na = TRUE, ...)
NULL
> # by default, factor columns are used as id variables
> head(melt(tips), 1)
Using sex, smoker, day, time as id variables
sex smoker day time variable value
1 Female No Sun Dinner total_bill 16.99
> # specify only id variables; the remaining columns become measure variables
> head(melt(tips, id.vars = c("sex", "smoker", "day", "time", "size")), 1)
> # specify both id and measure variables
> head(melt(tips, id.vars = c("sex", "smoker", "day", "time", "size"),
+ measure.vars = "tip"), 1)
1 Female No Sun Dinner 2 tip 1.01

cast
formula fun.aggregate
> args(acast) # array acast
function (data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data))
NULL
> args(dcast) # data.frame dcast
function (data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data))
NULL

formula
...
.
acast hoge ~ fuga ~ piyo
※dcast 1 hoge ~ fuga + piyo

> tipsm <- melt(tips, measure.vars = c("total_bill", "tip"))
> acast(tipsm, sex ~ smoker, length)
No Yes
Female 108 64
Male 194 120
> #
> acast(tipsm, smoker ~ sex, length)
Female Male
No 108 194
Yes 64 120
> #
> acast(tipsm, sex ~ smoker, length, margins = TRUE)
No Yes (all)
Female 108 64 172
Male 194 120 314
(all) 302 184 486

> # size
> acast(tipsm, smoker ~ sex + size, length)
Female_1 Female_2 Female_3 Female_4 Female_5 Female_6 Male_1 Male_2 Male_3
No 4 66 18 14 2 4 0 114 34
Yes 2 48 10 4 0 0 2 82 14
Male_4 Male_5 Male_6
No 38 4 4
Yes 18 4 0
> # 3
> acast(tipsm, smoker ~ sex ~ size, length)
, , 1

Female Male
No 4 0
Yes 2 2

> # sum (total_bill and tip combined)
> acast(tipsm, sex ~ day, sum)
Fri Sat Sun Thur
Female 152.34 629.5 418.31 617.31
Male 225.50 1409.3 1456.24 650.85
> # sum of total_bill and tip separately
> acast(tipsm, sex + variable ~ day, sum)
Fri Sat Sun Thur
Female_total_bill 127.31 551.05 357.70 534.89
Female_tip 25.03 78.45 60.61 82.42
Male_total_bill 198.57 1227.35 1269.46 561.44
Male_tip 26.93 181.95 186.78 89.41
> # tip sum
> acast(tipsm, sex ~ day, sum, subset = .(variable == "tip"))
Fri Sat Sun Thur
Female 25.03 78.45 60.61 82.42
Male 26.93 181.95 186.78 89.41

reshape2 aggregate table xtabs

Rデータフレーム自由自在

Rデータフレーム自由自在

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Viewers also liked

Viewers also liked (13)

More from Takeshi Arabiki

More from Takeshi Arabiki (16)

Recently uploaded

Recently uploaded (20)

Rデータフレーム自由自在