RではじめるTwitter解析

R       Twitter
    R      2011 (2011/11/26)
                  @a_bicky
• Takeshi Arabiki
    ‣

    ‣ Twitter &          : @a_bicky & id:a_bicky

•
                                R

•
                  http://d.hatena.ne.jp/a_bicky/
R
           Osaka.R #4                               Tokyo.R #16                               Tsukuba.R #9




http://www.slideshare.net/abicky/twitterr   http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090
※
RではじめるTwitter解析
Twitter
Mentionmapp
Mentionmapp
Mentionmapp
http://twilog.org/   http://twitraq.userlocal.jp/




http://whotwi.com/
                                 http://tweetstats.com/
http://twilog.org/   http://twitraq.userlocal.jp/




    R

http://whotwi.com/
                                 http://tweetstats.com/
Twitter


•
•             reshape2
•               ggplot2
•
Twitter


•
•             reshape2
•               ggplot2
•
RではじめるTwitter解析
twitteR
      twitteR
> library(twitteR) # twitteR
> #                          (twitteR 0.99.15     )
> Sys.setlocale("LC_TIME", "C")
[1] "C"
> # @a_bicky         3,200          RT
> statuses <- userTimeline("a_bicky", n = 3200)
status
> #             R5
> ls.str(statuses[[1]])
created : POSIXct[1:1], format: "2011-11-23 22:16:24"
favorited : logi FALSE           ↑            UTC
id : chr "139467359571296256"
initFields : Formal class 'refMethodDef' [package "methods"]
with 5 slots
initialize : Formal class 'refMethodDef' [package "methods"]
with 5 slots
replyToSID : chr(0)
replyToSN : chr(0)
replyToUID : chr(0)
screenName : chr "a_bicky"     ! Twitter
statusSource : chr "<a href="http://sites.google.com/site/
yorufukurou/" rel="nofollow">YoruFukurou</a>"
text : chr "                                               "
truncated :   logi FALSE             ↑
> statusDF <- twListToDF(statuses)
> str(statusDF, vec.len = 1)
'data.frame':	 3159 obs. of 10 variables:
 $ text        : chr "
         " ...                     ↑

 $ favorited   : logi FALSE ...
 $ replyToSN   : logi NA ...
 $ created     : POSIXct, format: "2011-11-23 22:16:24" ...
 $ truncated   : logi FALSE ...      ↑           UTC
 $ replyToSID : logi NA ...
 $ id          : chr "139467359571296256" ...
 $ replyToUID : logi NA ...
 $ statusSource: chr "<a href="http://sites.google.com/
site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ...
 $ screenName : chr "a_bicky" ...
> wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
> statusDF <- within(statusDF, {
+     attr(created, "tzone") <- "Asia/Tokyo" # JST
+     statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1",
statusSource)) # strip the HTML anchor, keeping only the client name
+     date <- factor(format(created, "%Y-%m-%d")) #
+     hour <- NULL; month <- NULL; year <- NULL; wday <- NULL
+     with(as.POSIXlt(created), {
+         hour <<- factor(hour)         #
+         month <<- factor(mon + 1)     #
+         year <<- factor(year + 1900) #
+         wday <<- factor((wday + 6) %% 7, labels = wday.abb) #
+     })
+     textLength <- nchar(text) #
+     #        , URL,
+     cleanText <- removeSpecialStr(text)
+     cleanTextLength <- nchar(cleanText) # URL
+ })
> #                  Twitter
> topSources <- names(head(sort(table(statusDF$statusSource),
decreasing = TRUE), 5))
> statusDF <- within(statusDF, {
+     statusSource <- as.character(statusSource)
+     statusSource[!statusSource %in% topSources] <- "other"
+     #
+     statusSource <- factor(statusSource, levels = names(sort(table
(statusSource), dec = TRUE)))
+ })
Twitter


•
•             reshape2
•               ggplot2
•
reshape2
RではじめるTwitter解析
Excel




9   11   ”Twitter for iPhone”, ”YoruFukurou”
    Sat Mon 12         23
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000
reshape2
> library(reshape2)
> acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"),
+             measure.vars = c("textLength")),
+       month + statusSource ~ wday, mean,
+       subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone")
+         & month %in% 9:11 & hour %in% 12:23
+         & wday %in% c("Mon", "Sat", "Sun")))
                       Mon      Sat      Sun
9_YoruFukurou           43 42.13333 54.76471
9_Twitter for iPhone    16 27.70000 20.50000
10_YoruFukurou          61 41.70175 56.98333
10_Twitter for iPhone NaN 27.00000 24.50000
11_YoruFukurou          35 41.08197 57.32609
11_Twitter for iPhone NaN       NaN 32.00000




         R
reshape2                                 melt
  melt                                    cast
   melt
cast
> mstatus <- melt(statusDF,
+    id.vars = c("statusSource", "wday", "year", "month", "hour", "date"),
+    measure.vars = c("textLength", "cleanTextLength"))
> mstatus[3157:3162, ]
      statusSource wday year month hour       date        variable value
3157           web Sun 2011      3   20 2011-03-13      textLength    72
3158           web Sun 2011      3   16 2011-03-13      textLength    24
3159           web Sun 2011      3   14 2011-03-13      textLength    82
3160 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    87
3161 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    14
3162 YoruFukurou Wed 2011       11    1 2011-11-23 cleanTextLength    21



              id
reshape2                                    cast
      cast
formula                                     fun.aggregate
> args(acast) #         array                       acast
function (data, formula, fun.aggregate    = NULL, ..., margins = NULL,
     subset = NULL, fill = NULL, drop =   TRUE, value_var = guess_value(data))
NULL
> args(dcast) #         data.frame                          dcast
function (data, formula, fun.aggregate    = NULL, ..., margins = NULL,
     subset = NULL, fill = NULL, drop =   TRUE, value_var = guess_value(data))
NULL


formula
...
.
acast     hoge ~ fuga ~ piyo
※dcast       1                            hoge ~ fuga + piyo
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
                                    ↑            cleanTextLength
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
>
> #
> acast(mstatus,   . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed   Thu Fri Sat Sun
[1,] 408 360 258   294 334 801 704
> #
> acast(mstatus,   hour ~ wday, length, subset = .(variable ==
"textLength"))
> #
> acast(mstatus, . ~ wday, length, subset = .(variable == "textLength"))
     Mon Tue Wed Thu Fri Sat Sun
[1,] 408 360 258 294 334 801 704
> #
> acast(mstatus, hour ~ wday, length, subset = .(variable ==
"textLength"))
   Mon Tue Wed Thu Fri Sat Sun
0   65 69 26 46 46 49 40
1   48 19 11 15 27 44 37
2   31 24    6 16 17 23 17
3   27 19    4 11 14 17 10
4    4 15    1   7   4   5   7
5    5 11    1   4   3   4   5
6    4 14    3   6   9   8   1
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
, , 3

    Mon Tue Wed Thu Fri Sat Sun
0     3   4   1   0   1   6   4
1     0   1   3   0   0   0   1
> #
> #
> acast(mstatus, hour ~ wday + month, length, subset = .(variable ==
"textLength"))
   Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4
0      3     4    13     3     1    10     7     15      9     4     2
1      0     0     1     0     1     9    16     12      9     1     0
2      2     0     0     0     2     7     6      7      7     2     0


> # 3
> acast(mstatus, hour ~ wday ~ month, length, subset = .(variable ==
"textLength"))
, , 3          3

    Mon Tue Wed Thu Fri Sat Sun
0     3   4   1   0   1   6   4
1     0   1   3   0   0   0   1
RではじめるTwitter解析
RではじめるTwitter解析
RではじめるTwitter解析
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
         statusSource                 NA
1         YoruFukurou 47.51462, 32.57973
2                 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5              Hatena 80.00000, 25.94212
6               other 52.58621, 33.12180
>
Twitter
                                               reshape2    1


> #
> dcast(mstatus, statusSource ~ .,
+       function(x) list(c(mean = mean(x), sd = sd(x))),
+       fill = list(c(mean = NaN, sd = NA)), ←
+       subset = .(variable == "textLength"))
         statusSource                 NA
1         YoruFukurou 47.51462, 32.57973
2                 web 57.02720, 36.33534
3 Twitter for iPhone 33.42342, 23.06466
4 Twitter for Android 28.49048, 20.08457
5              Hatena 80.00000, 25.94212
6               other 52.58621, 33.12180
>
>   #                     t
>   pc <- unlist(subset(statusDF,
+                       statusSource %in% c("YoruFukurou", "web"),
+                       textLength))
>   sp <- unlist(subset(statusDF,
+                       grepl("(iPhone|Android)", statusSource),
+                       textLength))
>   t.test(sp, pc, var.equal = FALSE)

	       Welch Two Sample t-test
                                        !!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.85334 -15.46645
sample estimates:
mean of x mean of y
 31.83945 49.49935
>   #                     t
>   pc <- unlist(subset(statusDF,
+                       statusSource %in% c("YoruFukurou", "web"),
+                       textLength))
>   sp <- unlist(subset(statusDF,
+                       grepl("(iPhone|Android)", statusSource),
+                       textLength))
>   t.test(sp, pc, var.equal = FALSE)

	       Welch Two Sample t-test
                                        !!
data: sp and pc
t = -15.7921, df = 1588.246, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.85334 -15.46645
sample estimates:
mean of x mean of y
 31.83945 49.49935
> extractScreenNames <- function(text, strict = TRUE) {
+     if (strict) {
+         # Extract only what Twitter itself would treat as a screen_name
+         regex <- "(?:(?<!\\w)([@＠])((?>\\w+))(?![@＠])|[\\s\\S])"
+     } else {
+         # Also matches the "@example" part of e.g. hoge@example.com
+         regex <- "(?:([@＠])(\\w+)|[\\s\\S])"
+     }
+     screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE)
+     unique(unlist(strsplit(substring(screenNames, 2), "[@＠]")))
+ }
> screenNames <- unlist(lapply(statusDF$text, extractScreenNames))
> head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10
screenNames
        naopr     __gfx__ hirota_inoue     mandy_44    ask_a_lie
          105          85           51           47           40
    ken_nishi      nokuno      yokkuns   JinJin0613 kanon19_rie
           39          39           33           20           20
Twitter


•
•             reshape2
•               ggplot2
•
ggplot2
ggplot2
plot(statusDF$wday, col = "blue")
                                                                ggplot2




                                qplot(wday, data = statusDF, fill = I("blue"),
                                      alpha = I(0.7), xlab = "", ylab = "")
ggplot2




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
ggplot2
                qplot(wday, data = statusDF, facets = ~ statusSource,
                      fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
ggplot2
                qplot(wday, data = statusDF, facets = ~ statusSource,
                      fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")




qplot(wday, data = statusDF, fill = statusSource,
      xlab = "", ylab = "")
qplot
      ggplot2
> args(qplot)
function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins =
FALSE,
     geom = "auto", stat = list(NULL), position = list(NULL),
     xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
     xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
     asp = NA)
NULL
qplot   geom
       geom

area:
bar:
histogram:
line:
point:
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin",
        fill = statusSource, xlab = "", ylab = "", binwidth = 1)
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(wday, data = statusDF, geom = "bar", stat = "bin",
        fill = statusSource, xlab = "", ylab = "")
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin",
        colour = statusSource, xlab = "", ylab = "", binwidth = 1)
qplot                                geom
       geom

area:
bar:
histogram:
line:
point:




  qplot(wday, data = statusDF, geom = "point", stat = "bin",
        colour = statusSource, xlab = "", ylab = "")
qplot            position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "dodge", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "fill", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "jitter", xlab = "", ylab = "")
qplot                              position
   position       geom


dodge   :
fill    :     1
jitter :
stack   :




  qplot(wday, data = statusDF, fill = statusSource,
        position = "stack", xlab = "", ylab = "")
qplot                           facets
    facets      geom
~           :
        1 ~       2:         1,         2
※reshape2              1 ~        2 +   3
qplot                                  facets
     facets       geom
~           :
         1 ~         2:            1,            2
※reshape2                 1 ~           2 +      3




    qplot(wday, data = statusDF, xlab = "", ylab = "",
          facets = ~ statusSource)
qplot                                  facets
     facets       geom
~           :
         1 ~         2:            1,            2
※reshape2                 1 ~           2 +      3




    qplot(wday, data = statusDF, xlab = "", ylab = "",
          facets = month ~ statusSource)
qplot
alpha               :
colour (color) :
fill                :
linetype            :
size                :



colour, fill, linetype           statusSource
                        fill = I("blue")        I   (AsIs)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        alpha = as.integer(wday))
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        colour = statusSource)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        fill = statusSource)
qplot
alpha          :
colour (color) :
fill           :
linetype       :
size           :




  qplot(wday, data = statusDF, xlab = "", ylab = "",
        linetype = statusSource, colour = statusSource)
RではじめるTwitter解析
RではじめるTwitter解析
whotwi




         http://whotwi.com/
whotwi




         http://whotwi.com/
whotwi
>   #         Twitter
>   #       melt     cast               xtabs
> cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
> head(cnt, 3)
  hour wday statusSource Freq
1    0 Mon YoruFukurou     48
2    1 Mon YoruFukurou     38
3    2 Mon YoruFukurou     25
whotwi
>   #           Twitter
>   #         melt     cast               xtabs
>   cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF))
>   head(cnt, 3)
    hour wday statusSource Freq
1      0 Mon YoruFukurou     48
2      1 Mon YoruFukurou     38
3      2 Mon YoruFukurou     25
>   freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) {
+      #
+      freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE)
[1]])
+      cbind(df[1, c("hour", "wday")], freqSource)
+ })
> freqSources <- do.call(rbind, freqSources)
> head(freqSources, 3)
  hour wday freqSource
1     0 Mon YoruFukurou
2     1 Mon YoruFukurou
3     2 Mon YoruFukurou
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+            geom = "point", colour = freqSource, size = Freq)
> p #             print(p)
whotwi
> #
> cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt))
> head(cntSum, 3)
  hour wday Freq
1    0 Mon    65
2    1 Mon    48
3    2 Mon    31
> #
> data <- merge(cntSum, freqSources)
> #
> data$wday <- factor(data$wday, levels = rev(levels(data$wday)))
> #
> data$Freq <- log2(data$Freq)
> p <- qplot(hour, wday, data = data, xlab = "", ylab = "",
+            geom = "point", colour = freqSource, size = Freq)
> p #             print(p)
whotwi
whotwi
whotwi
> # whotwi theme
> theme_whotwi <- function() {
+     opts( #
+          panel.background = theme_rect(fill = NA, colour = NA),
+           #
+          legend.key = theme_rect(fill = NA, colour = NA),
+           #
+          axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2
whotwi
> # whotwi theme
> theme_whotwi <- function() {
+     opts( #
+          panel.background = theme_rect(fill = NA, colour = NA),
+           #
+          legend.key = theme_rect(fill = NA, colour = NA),
+           #
+          axis.ticks = theme_segment(colour = NA))
+ }
> p2 <- p + theme_whotwi() + scale_size(legend = FALSE) +
scale_colour_hue(name = "")
> p2
whotwi
whotwi
whotwi
whotwi




         PC
whotwi



PC



         PC
Twitter


•
•             reshape2
•               ggplot2
•
RではじめるTwitter解析
RではじめるTwitter解析
TweetSentiments
TweetSentiments

R
1. RMeCab

2.

3.
RMeCab
    MeCab                      R

> library(RMeCab)
> (docDF(data.frame("                    "), column = 1, type = 1))
number of extracted terms = 5
now making a data frame. wait a while!

     TERM POS1   POS2 Row1
1                      1
2                       1
3                       1
4                       2
5                       2
http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html
          :               :        :1
      :           :           :0.999995
      :               :           :0.999979
          :           :           :0.999979
              :               :         :0.999645
      :               :            :0.999486
      :           :           :0.999314
...
> #
> pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/
pn_ja.dic",
+                     sep = ":",
+                     col.names = c("term", "kana", "pos", "value"),
+                     colClasses = c("character", "character", "factor",
"numeric"),
+                     fileEncoding = "Shift_JIS")
> #
> #
> pndic2 <- aggregate(value ~ term + pos, pndic, mean)
> # pndic
> pos <- unique(pndic2$pos)
> tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos)
number of extracted terms = 7164
now making a data frame. wait a while!

> tweetDF[2900:2904, 1:5]
         TERM   POS1 POS2 Row1 Row2
2900                      0    0
2901                         0       0
2902                     0       0
2903                         0       0
2904                         0       0
> # pndic
> tweetDF <- subset(tweetDF, TERM %in% pndic2$term)
> #
> tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c
("term", "pos"))
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> #
> score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value)
> #
> sum(score > 0)
[1] 117
> #
> sum(score < 0)
[1] 2765
> #
> sum(score == 0)
[1] 277
> table(ifelse(pndic$value > 0, "positive",
+              ifelse(pndic$value == 0, "neutral", "negative")))

negative   neutral positive
   49983        20     5122
> table(ifelse(pndic$value > 0, "positive",
+              ifelse(pndic$value == 0, "neutral", "negative")))

negative   neutral positive
   49983        20     5122
> m <- mean(score)
> #
> tweetType <- factor(ifelse(score > m, "positive",
+                     ifelse(score == m, "neutral", "negative")),
+                     levels = c("positive", "neutral", "negative"))
> table(tweetType)
tweetType
positive neutral negative
    1912        0     1247
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
> statusDF$tweetType <- droplevels(tweetType)
> #
> qplot(month, data = statusDF,
+       geom = "bar", fill = tweetType, position = "fill")
RではじめるTwitter解析
RではじめるTwitter解析
twitteR
•                                 RJSONIO
•
•             ID   status ID
• fav               favorited   TRUE
• truncated   TRUE
• DM
• status
  character   factor
twitteR
•                                 RJSONIO
•
•             ID   status ID
• fav               favorited   TRUE
• truncated   TRUE
• DM
• status
  character   factor
OAuth   ”   ”   twitteR   -
RではじめるTwitter解析
• twitteR
• reshape2       R


• ggplot2


• RMeCab     R
• twitteR
• reshape2       R


• ggplot2


• RMeCab     R

• PC
•
RではじめるTwitter解析
RではじめるTwitter解析
https://github.com/abicky/rjpusers2011_abicky
status
> statuses[[1]]$text
[1] "                                    "
> statuses[[1]]$getText() #
[1] "                                    "
> #
> statuses[[1]]$text <- "                    "
> statuses[[1]]$getText()
[1] "                                "
> statuses[[1]]$setText("ggrks") #
> statuses[[1]]$getText()
[1] "ggrks"
> #
> statuses[[1]]$getCreated()
[1] "2011-11-23 22:16:24 UTC"
removeSpecialStr

# Strip screen names, hashtags and URLs (in that order) from `text`.
#
# text: character vector of tweet bodies.
# Returns a character vector of the same length with the three kinds of
# special strings removed by the sibling helpers, each called with its
# default arguments.
removeSpecialStr <- function(text) {
    withoutNames <- removeScreenName(text)
    withoutTags <- removeHashTag(withoutNames)
    removeURL(withoutTags)
}
removeScreenName

# Remove Twitter screen names (@name) from `text`.
#
# text:   character vector of tweet bodies.
# strict: if TRUE (default), only remove an "@name" that is not preceded by
#         a word character and not immediately followed by another "@", so
#         that e.g. the "@example" in "hoge@example.com" is left alone.
#         If FALSE, remove every "@" followed by word characters.
# Returns a character vector of the same length as `text`.
removeScreenName <- function(text, strict = TRUE) {
    # "\uFF20" is the full-width commercial at; it is accepted alongside
    # the ASCII "@".
    if (strict) {
        # The atomic group (?>...) keeps the engine from giving back part of
        # the name in order to satisfy the trailing negative look-ahead.
        regex <- "(?<!\\w)[@\uFF20](?>\\w+)(?![@\uFF20])"
    } else {
        regex <- "[@\uFF20]\\w+"
    }
    gsub(regex, "", text, perl = TRUE)
}
removeURL

# Remove http/https URLs from `text`.
#
# text:   character vector of tweet bodies.
# strict: if TRUE (default), only remove a URL that has a dotted host name
#         and is not glued to a preceding word character or one of
#         - . # @ = ! ' " /   (so e.g. href=http://... is left alone).
#         If FALSE, remove anything that starts with http:// or https://
#         followed by URL-ish characters.
# Returns a character vector of the same length as `text`.
removeURL <- function(text, strict = TRUE) {
    if (strict) {
        regex <- paste0(
            "(?<![-.\\w#@=!'\"/])",                    # left boundary
            "https?://",                               # scheme
            # NOTE(review): the user-info part is greedy and may span
            # whitespace; kept as in the original pattern.
            "(?:[^:]+:.+@)?",                          # optional user:pass@
            "(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+",  # dot-terminated labels
            "[A-Za-z]+",                               # top-level domain
            "(?:/[-\\w#%=+,.?!&~]*)*"                  # path / query segments
        )
    } else {
        regex <- "https?://[-\\w#%=+,.?!&~/]+"
    }
    gsub(regex, "", text, perl = TRUE)
}
removeHashTag

# Remove hashtags (#tag) from `text`.
#
# text:   character vector of tweet bodies.
# strict: if TRUE (default), a hashtag must start at the beginning of the
#         string or right after a delimiter, and a purely numeric tag such
#         as "#1" is kept. If FALSE, every "#" followed by non-delimiter
#         characters is removed.
# Returns a character vector of the same length as `text`.
removeHashTag <- function(text, strict = TRUE) {
    # Characters (regex class body) that terminate a hashtag: whitespace,
    # comma, period, U+3000..U+3002, U+FF01 and U+FF1F.
    delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F"
    # Non-ASCII characters allowed inside a hashtag.
    # cf. http://nobu666.com/2011/07/13/914.html
    validJa <- paste0(
        "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC",
        "\u3400-\uD7A3\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A",
        "\uFF66-\uFF9E"
    )
    # "\uFF03" is the full-width number sign, accepted alongside ASCII "#".
    if (strict) {
        # Group 1 is the leading delimiter and group 2 a numeric-only tag;
        # both are written back, so only real hashtags disappear.
        # NOTE(review): the numeric-only look-ahead checks ASCII word
        # characters only, so "#123" followed directly by a Japanese
        # character is kept - confirm whether that is intended.
        regex <- sprintf(
            "(^|[%s])(?:([#\uFF03](?>[0-9]+)(?!\\w))|[#\uFF03][\\w%s]+)",
            delimiters, validJa
        )
        replacement <- "\\1\\2"
    } else {
        # This pattern has no capture groups, so replace with "" rather
        # than with backreferences to groups that do not exist.
        regex <- sprintf("[#\uFF03][^%s]+", delimiters)
        replacement <- ""
    }
    gsub(regex, replacement, text, perl = TRUE)
}
1 of 131

Recommended

twitteRで快適Rライフ! by
twitteRで快適Rライフ!twitteRで快適Rライフ!
twitteRで快適Rライフ!Takeshi Arabiki
8.4K views49 slides
Coding Horrors by
Coding HorrorsCoding Horrors
Coding HorrorsMark Baker
339 views70 slides
ZeroMQ Is The Answer: DPC 11 Version by
ZeroMQ Is The Answer: DPC 11 VersionZeroMQ Is The Answer: DPC 11 Version
ZeroMQ Is The Answer: DPC 11 VersionIan Barber
1.7K views50 slides
How to stand on the shoulders of giants by
How to stand on the shoulders of giantsHow to stand on the shoulders of giants
How to stand on the shoulders of giantsIan Barber
3.6K views32 slides
Palestra sobre Collections com Python by
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Pythonpugpe
1.5K views48 slides
ZeroMQ: Messaging Made Simple by
ZeroMQ: Messaging Made SimpleZeroMQ: Messaging Made Simple
ZeroMQ: Messaging Made SimpleIan Barber
4.6K views36 slides

More Related Content

What's hot

Is Haskell an acceptable Perl? by
Is Haskell an acceptable Perl?Is Haskell an acceptable Perl?
Is Haskell an acceptable Perl?osfameron
1.6K views87 slides
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPython by
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPythonByterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPythonakaptur
2K views40 slides
Clustering com numpy e cython by
Clustering com numpy e cythonClustering com numpy e cython
Clustering com numpy e cythonAnderson Dantas
1.4K views15 slides
Data monsters probablistic data structures by
Data monsters probablistic data structuresData monsters probablistic data structures
Data monsters probablistic data structuresGreenM
28 views30 slides
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha... by
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...akaptur
1.3K views48 slides
dplyr and torrents from cpasbien by
dplyr and torrents from cpasbiendplyr and torrents from cpasbien
dplyr and torrents from cpasbienRomain Francois
2.8K views32 slides

What's hot(20)

Is Haskell an acceptable Perl? by osfameron
Is Haskell an acceptable Perl?Is Haskell an acceptable Perl?
Is Haskell an acceptable Perl?
osfameron1.6K views
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPython by akaptur
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPythonByterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
Byterun, a Python bytecode interpreter - Allison Kaptur at NYCPython
akaptur2K views
Clustering com numpy e cython by Anderson Dantas
Clustering com numpy e cythonClustering com numpy e cython
Clustering com numpy e cython
Anderson Dantas1.4K views
Data monsters probablistic data structures by GreenM
Data monsters probablistic data structuresData monsters probablistic data structures
Data monsters probablistic data structures
GreenM28 views
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha... by akaptur
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
Allison Kaptur: Bytes in the Machine: Inside the CPython interpreter, PyGotha...
akaptur1.3K views
dplyr and torrents from cpasbien by Romain Francois
dplyr and torrents from cpasbiendplyr and torrents from cpasbien
dplyr and torrents from cpasbien
Romain Francois2.8K views
The groovy puzzlers (as Presented at JavaOne 2014) by GroovyPuzzlers
The groovy puzzlers (as Presented at JavaOne 2014)The groovy puzzlers (as Presented at JavaOne 2014)
The groovy puzzlers (as Presented at JavaOne 2014)
GroovyPuzzlers595 views
M12 random forest-part01 by Raman Kannan
M12 random forest-part01M12 random forest-part01
M12 random forest-part01
Raman Kannan106 views
Bytes in the Machine: Inside the CPython interpreter by akaptur
Bytes in the Machine: Inside the CPython interpreterBytes in the Machine: Inside the CPython interpreter
Bytes in the Machine: Inside the CPython interpreter
akaptur1.8K views
M09-Cross validating-naive-bayes by Raman Kannan
M09-Cross validating-naive-bayesM09-Cross validating-naive-bayes
M09-Cross validating-naive-bayes
Raman Kannan143 views
第二讲 Python基礎 by juzihua1102
第二讲 Python基礎第二讲 Python基礎
第二讲 Python基礎
juzihua1102213 views
第二讲 预备-Python基礎 by anzhong70
第二讲 预备-Python基礎第二讲 预备-Python基礎
第二讲 预备-Python基礎
anzhong70113 views
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!... by akaptur
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!..."A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
"A 1,500 line (!!) switch statement powers your Python!" - Allison Kaptur, !!...
akaptur3K views
(gentle (introduction Clojure)) by Guy Taylor
(gentle (introduction Clojure))(gentle (introduction Clojure))
(gentle (introduction Clojure))
Guy Taylor943 views
An Elephant of a Different Colour: Hack by Vic Metcalfe
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: Hack
Vic Metcalfe2.1K views
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling by Plotly
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
Plotly748 views
Top 10 php classic traps by Damien Seguy
Top 10 php classic trapsTop 10 php classic traps
Top 10 php classic traps
Damien Seguy515 views
MongoUK - PHP Development by Boxed Ice
MongoUK - PHP DevelopmentMongoUK - PHP Development
MongoUK - PHP Development
Boxed Ice931 views

Viewers also liked

Rによるテキストマイニングの一例 by
Rによるテキストマイニングの一例Rによるテキストマイニングの一例
Rによるテキストマイニングの一例LINE Corp.
6K views18 slides
TwitterのデータをRであれこれ by
TwitterのデータをRであれこれTwitterのデータをRであれこれ
TwitterのデータをRであれこれTakeshi Arabiki
13.5K views103 slides
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~ by
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~antibayesian 俺がS式だ
10.1K views20 slides
RでTwitterテキストマイニング by
RでTwitterテキストマイニングRでTwitterテキストマイニング
RでTwitterテキストマイニングYudai Shinbo
4.8K views43 slides
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜 by
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜Keiichiro Ono
9.3K views151 slides
さくっとはじめるテキストマイニング(R言語)  スタートアップ編 by
さくっとはじめるテキストマイニング(R言語)  スタートアップ編さくっとはじめるテキストマイニング(R言語)  スタートアップ編
さくっとはじめるテキストマイニング(R言語)  スタートアップ編Yutaka Shimada
26.9K views57 slides

Viewers also liked(16)

Rによるテキストマイニングの一例 by LINE Corp.
Rによるテキストマイニングの一例Rによるテキストマイニングの一例
Rによるテキストマイニングの一例
LINE Corp.6K views
TwitterのデータをRであれこれ by Takeshi Arabiki
TwitterのデータをRであれこれTwitterのデータをRであれこれ
TwitterのデータをRであれこれ
Takeshi Arabiki13.5K views
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~ by antibayesian 俺がS式だ
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
あんちべのすべらない話~俺のツイートがこんなにウケないはずがない~
RでTwitterテキストマイニング by Yudai Shinbo
RでTwitterテキストマイニングRでTwitterテキストマイニング
RでTwitterテキストマイニング
Yudai Shinbo4.8K views
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜 by Keiichiro Ono
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
「数字を見せろ」から「コードを見せろ」へ 〜過程の透明性を確保したデータ可視化を目指す〜
Keiichiro Ono9.3K views
さくっとはじめるテキストマイニング(R言語)  スタートアップ編 by Yutaka Shimada
さくっとはじめるテキストマイニング(R言語)  スタートアップ編さくっとはじめるテキストマイニング(R言語)  スタートアップ編
さくっとはじめるテキストマイニング(R言語)  スタートアップ編
Yutaka Shimada26.9K views
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~ by Yusuke Fukasawa
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
動画タグネットワーク分析を用いた ニコニコ動画における萌芽文化発見の試み ~”ゆっくり関連タグ”を例として~
Yusuke Fukasawa3.4K views
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析 by Yusuke Fukasawa
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
経済的寄与度を重視した橋梁管理手法の有効性に関するマルチエージェントシミュレーションによる分析
Yusuke Fukasawa623 views
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた by Yusuke Fukasawa
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
【ニコニコ動画】"マリオメーカー問題"のもたらした影響をネットワーク分析してみた
Yusuke Fukasawa4.9K views
料理レシピサービスにおける検索語の意味変化に関する分析 by Yusuke Fukasawa
料理レシピサービスにおける検索語の意味変化に関する分析料理レシピサービスにおける検索語の意味変化に関する分析
料理レシピサービスにおける検索語の意味変化に関する分析
Yusuke Fukasawa2.1K views
DeNAの報告書を可視化して雰囲気をつかむ by Yusuke Fukasawa
DeNAの報告書を可視化して雰囲気をつかむDeNAの報告書を可視化して雰囲気をつかむ
DeNAの報告書を可視化して雰囲気をつかむ
Yusuke Fukasawa1.4K views
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握 by Yusuke Fukasawa
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
ニコニコ動画における関連動画情報を用いたカテゴリ特徴の把握
Yusuke Fukasawa2K views
巨大な表を高速に扱うData.table について by Haruka Ozaki
巨大な表を高速に扱うData.table について巨大な表を高速に扱うData.table について
巨大な表を高速に扱うData.table について
Haruka Ozaki8.4K views
data.tableパッケージで大規模データをサクッと処理する by Shintaro Fukushima
data.tableパッケージで大規模データをサクッと処理するdata.tableパッケージで大規模データをサクッと処理する
data.tableパッケージで大規模データをサクッと処理する
Shintaro Fukushima18.3K views
LDA等のトピックモデル by Mathieu Bertin
LDA等のトピックモデルLDA等のトピックモデル
LDA等のトピックモデル
Mathieu Bertin12.5K views
トピックモデルの話 by kogecoo
トピックモデルの話トピックモデルの話
トピックモデルの話
kogecoo 28.4K views

Similar to RではじめるTwitter解析

Just in time (series) - KairosDB by
Just in time (series) - KairosDBJust in time (series) - KairosDB
Just in time (series) - KairosDBVictor Anjos
1.4K views26 slides
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」 by
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」Ken'ichi Matsui
2.2K views69 slides
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014) by
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)Grace Yang
1.1K views38 slides
[1062BPY12001] Data analysis with R / week 2 by
[1062BPY12001] Data analysis with R / week 2[1062BPY12001] Data analysis with R / week 2
[1062BPY12001] Data analysis with R / week 2Kevin Chun-Hsien Hsu
158 views33 slides
Beyond PHP - it's not (just) about the code by
Beyond PHP - it's not (just) about the codeBeyond PHP - it's not (just) about the code
Beyond PHP - it's not (just) about the codeWim Godden
9.1K views56 slides
Datamining r 1st by
Datamining r 1stDatamining r 1st
Datamining r 1stsesejun
400 views15 slides

Similar to RではじめるTwitter解析(20)

Just in time (series) - KairosDB by Victor Anjos
Just in time (series) - KairosDBJust in time (series) - KairosDB
Just in time (series) - KairosDB
Victor Anjos1.4K views
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」 by Ken'ichi Matsui
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
Ken'ichi Matsui2.2K views
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014) by Grace Yang
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Win-Win Search: Dual-Agent Stochastic Game in Session Search (SIGIR 2014)
Grace Yang1.1K views
Beyond PHP - it's not (just) about the code by Wim Godden
Beyond PHP - it's not (just) about the codeBeyond PHP - it's not (just) about the code
Beyond PHP - it's not (just) about the code
Wim Godden9.1K views
Datamining r 1st by sesejun
Datamining r 1stDatamining r 1st
Datamining r 1st
sesejun400 views
Functional programming in Swift by John Pham
Functional programming in SwiftFunctional programming in Swift
Functional programming in Swift
John Pham146 views
PRE: Datamining 2nd R by sesejun
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd R
sesejun396 views
Datamining R 1st by sesejun
Datamining R 1stDatamining R 1st
Datamining R 1st
sesejun417 views
Refactoring to Macros with Clojure by Dmitry Buzdin
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
Dmitry Buzdin3.5K views
Gotcha! Ruby things that will come back to bite you. by David Tollmyr
Gotcha! Ruby things that will come back to bite you.Gotcha! Ruby things that will come back to bite you.
Gotcha! Ruby things that will come back to bite you.
David Tollmyr1K views
Text Mining of Twitter in Data Mining by Meghaj Mallick
Text Mining of Twitter in Data MiningText Mining of Twitter in Data Mining
Text Mining of Twitter in Data Mining
Meghaj Mallick67 views
R Programming: Export/Output Data In R by Rsquared Academy
R Programming: Export/Output Data In RR Programming: Export/Output Data In R
R Programming: Export/Output Data In R
Rsquared Academy1.7K views
第6回 関数とフロー制御 by Wataru Shito
第6回 関数とフロー制御第6回 関数とフロー制御
第6回 関数とフロー制御
Wataru Shito383 views

More from Takeshi Arabiki

開発の心得 by
開発の心得開発の心得
開発の心得Takeshi Arabiki
5K views18 slides
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜 by
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜Takeshi Arabiki
10.9K views31 slides
Introduction to Japanese Morphological Analysis by
Introduction to Japanese Morphological AnalysisIntroduction to Japanese Morphological Analysis
Introduction to Japanese Morphological AnalysisTakeshi Arabiki
16.8K views25 slides
R による文書分類入門 by
R による文書分類入門R による文書分類入門
R による文書分類入門Takeshi Arabiki
30.7K views59 slides
Rのデータ構造とメモリ管理 by
Rのデータ構造とメモリ管理Rのデータ構造とメモリ管理
Rのデータ構造とメモリ管理Takeshi Arabiki
15.2K views22 slides
HTML5 Canvas で学ぶアフィン変換 by
HTML5 Canvas で学ぶアフィン変換HTML5 Canvas で学ぶアフィン変換
HTML5 Canvas で学ぶアフィン変換Takeshi Arabiki
8.9K views29 slides

More from Takeshi Arabiki(15)

クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜 by Takeshi Arabiki
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
クックパッド特売情報 における自然言語処理 〜固有表現抽出を利用した検索システム〜
Takeshi Arabiki10.9K views
Introduction to Japanese Morphological Analysis by Takeshi Arabiki
Introduction to Japanese Morphological AnalysisIntroduction to Japanese Morphological Analysis
Introduction to Japanese Morphological Analysis
Takeshi Arabiki16.8K views
R による文書分類入門 by Takeshi Arabiki
R による文書分類入門R による文書分類入門
R による文書分類入門
Takeshi Arabiki30.7K views
Rのデータ構造とメモリ管理 by Takeshi Arabiki
Rのデータ構造とメモリ管理Rのデータ構造とメモリ管理
Rのデータ構造とメモリ管理
Takeshi Arabiki15.2K views
HTML5 Canvas で学ぶアフィン変換 by Takeshi Arabiki
HTML5 Canvas で学ぶアフィン変換HTML5 Canvas で学ぶアフィン変換
HTML5 Canvas で学ぶアフィン変換
Takeshi Arabiki8.9K views
Introduction to Favmemo for Immature Engineers by Takeshi Arabiki
Introduction to Favmemo for Immature EngineersIntroduction to Favmemo for Immature Engineers
Introduction to Favmemo for Immature Engineers
Takeshi Arabiki2.9K views
Rのスコープとフレームと環境と by Takeshi Arabiki
Rのスコープとフレームと環境とRのスコープとフレームと環境と
Rのスコープとフレームと環境と
Takeshi Arabiki3.2K views
Rデータフレーム自由自在 by Takeshi Arabiki
Rデータフレーム自由自在Rデータフレーム自由自在
Rデータフレーム自由自在
Takeshi Arabiki24.3K views
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜 by Takeshi Arabiki
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
文字列カーネルによる辞書なしツイート分類 〜文字列カーネル入門〜
Takeshi Arabiki9.5K views
はじめてのまっぷりでゅ〜す by Takeshi Arabiki
はじめてのまっぷりでゅ〜すはじめてのまっぷりでゅ〜す
はじめてのまっぷりでゅ〜す
Takeshi Arabiki2.3K views
Twitterのデータを取得する準備 by Takeshi Arabiki
Twitterのデータを取得する準備Twitterのデータを取得する準備
Twitterのデータを取得する準備
Takeshi Arabiki2.8K views

Recently uploaded

virtual reality.pptx by
virtual reality.pptxvirtual reality.pptx
virtual reality.pptxG036GaikwadSnehal
14 views15 slides
TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f... by
TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f...TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f...
TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f...TrustArc
11 views29 slides
Piloting & Scaling Successfully With Microsoft Viva by
Piloting & Scaling Successfully With Microsoft VivaPiloting & Scaling Successfully With Microsoft Viva
Piloting & Scaling Successfully With Microsoft VivaRichard Harbridge
12 views160 slides
Democratising digital commerce in India-Report by
Democratising digital commerce in India-ReportDemocratising digital commerce in India-Report
Democratising digital commerce in India-ReportKapil Khandelwal (KK)
18 views161 slides
PRODUCT LISTING.pptx by
PRODUCT LISTING.pptxPRODUCT LISTING.pptx
PRODUCT LISTING.pptxangelicacueva6
14 views1 slide
Design Driven Network Assurance by
Design Driven Network AssuranceDesign Driven Network Assurance
Design Driven Network AssuranceNetwork Automation Forum
15 views42 slides

Recently uploaded(20)

TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f... by TrustArc
TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f...TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f...
TrustArc Webinar - Managing Online Tracking Technology Vendors_ A Checklist f...
TrustArc11 views
Piloting & Scaling Successfully With Microsoft Viva by Richard Harbridge
Piloting & Scaling Successfully With Microsoft VivaPiloting & Scaling Successfully With Microsoft Viva
Piloting & Scaling Successfully With Microsoft Viva
Automating a World-Class Technology Conference; Behind the Scenes of CiscoLive by Network Automation Forum
Automating a World-Class Technology Conference; Behind the Scenes of CiscoLiveAutomating a World-Class Technology Conference; Behind the Scenes of CiscoLive
Automating a World-Class Technology Conference; Behind the Scenes of CiscoLive
"Running students' code in isolation. The hard way", Yurii Holiuk by Fwdays
"Running students' code in isolation. The hard way", Yurii Holiuk "Running students' code in isolation. The hard way", Yurii Holiuk
"Running students' code in isolation. The hard way", Yurii Holiuk
Fwdays17 views
HTTP headers that make your website go faster - devs.gent November 2023 by Thijs Feryn
HTTP headers that make your website go faster - devs.gent November 2023HTTP headers that make your website go faster - devs.gent November 2023
HTTP headers that make your website go faster - devs.gent November 2023
Thijs Feryn22 views
STKI Israeli Market Study 2023 corrected forecast 2023_24 v3.pdf by Dr. Jimmy Schwarzkopf
STKI Israeli Market Study 2023   corrected forecast 2023_24 v3.pdfSTKI Israeli Market Study 2023   corrected forecast 2023_24 v3.pdf
STKI Israeli Market Study 2023 corrected forecast 2023_24 v3.pdf
【USB韌體設計課程】精選講義節錄-USB的列舉過程_艾鍗學院 by IttrainingIttraining
【USB韌體設計課程】精選講義節錄-USB的列舉過程_艾鍗學院【USB韌體設計課程】精選講義節錄-USB的列舉過程_艾鍗學院
【USB韌體設計課程】精選講義節錄-USB的列舉過程_艾鍗學院
Five Things You SHOULD Know About Postman by Postman
Five Things You SHOULD Know About PostmanFive Things You SHOULD Know About Postman
Five Things You SHOULD Know About Postman
Postman36 views

RではじめるTwitter解析

  • 1. R Twitter R 2011 (2011/11/26) @a_bicky
  • 2. • Takeshi Arabiki ‣ ‣ Twitter & : @a_bicky & id:a_bicky • R • http://d.hatena.ne.jp/a_bicky/
  • 3. R Osaka.R #4 Tokyo.R #16 Tsukuba.R #9 http://www.slideshare.net/abicky/twitterr http://www.slideshare.net/abicky/r-9034336 http://www.slideshare.net/abicky/r-10128090
  • 4.
  • 10. http://twilog.org/ http://twitraq.userlocal.jp/ http://whotwi.com/ http://tweetstats.com/
  • 11. http://twilog.org/ http://twitraq.userlocal.jp/ R http://whotwi.com/ http://tweetstats.com/
  • 12. Twitter • • reshape2 • ggplot2 •
  • 13. Twitter • • reshape2 • ggplot2 •
  • 15. twitteR twitteR > library(twitteR) # twitteR > # (twitteR 0.99.15 ) > Sys.setlocale("LC_TIME", "C") [1] "C" > # @a_bicky 3,200 RT > statuses <- userTimeline("a_bicky", n = 3200)
  • 16. status > # R5 > ls.str(statuses[[1]]) created : POSIXct[1:1], format: "2011-11-23 22:16:24" favorited : logi FALSE ↑ UTC id : chr "139467359571296256" initFields : Formal class 'refMethodDef' [package "methods"] with 5 slots initialize : Formal class 'refMethodDef' [package "methods"] with 5 slots replyToSID : chr(0) replyToSN : chr(0) replyToUID : chr(0) screenName : chr "a_bicky" ! Twitter statusSource : chr "<a href="http://sites.google.com/site/ yorufukurou/" rel="nofollow">YoruFukurou</a>" text : chr " " truncated : logi FALSE ↑
  • 17. > statusDF <- twListToDF(statuses) > str(statusDF, vec.len = 1) 'data.frame': 3159 obs. of 10 variables: $ text : chr " " ... ↑ $ favorited : logi FALSE ... $ replyToSN : logi NA ... $ created : POSIXct, format: "2011-11-23 22:16:24" ... $ truncated : logi FALSE ... ↑ UTC $ replyToSID : logi NA ... $ id : chr "139467359571296256" ... $ replyToUID : logi NA ... $ statusSource: chr "<a href="http://sites.google.com/ site/yorufukurou/" rel="nofollow">YoruFukurou</a>" ... $ screenName : chr "a_bicky" ...
  • 18. > wday.abb <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") > statusDF <- within(statusDF, { + attr(created, "tzone") <- "Asia/Tokyo" # JST + statusSource <- factor(gsub("<a .*?>(.*?)</a>", "\\1", statusSource)) # HTML + date <- factor(format(created, "%Y-%m-%d")) # + hour <- NULL; month <- NULL; year <- NULL; wday <- NULL + with(as.POSIXlt(created), { + hour <<- factor(hour) # + month <<- factor(mon + 1) # + year <<- factor(year + 1900) # + wday <<- factor((wday + 6) %% 7, labels = wday.abb) # + }) + textLength <- nchar(text) # + # , URL, + cleanText <- removeSpecialStr(text) + cleanTextLength <- nchar(cleanText) # URL + })
  • 19. > # Twitter > topSources <- names(head(sort(table(statusDF$statusSource), decreasing = TRUE), 5)) > statusDF <- within(statusDF, { + statusSource <- as.character(statusSource) + statusSource[!statusSource %in% topSources] <- "other" + # + statusSource <- factor(statusSource, levels = names(sort(table (statusSource), dec = TRUE))) + })
  • 20. Twitter • • reshape2 • ggplot2 •
  • 23. Excel 9 11 ”Twitter for iPhone”, ”YoruFukurou” Sat Mon 12 23
  • 24. reshape2 > library(reshape2) > acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000
  • 25. reshape2 > library(reshape2) > acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000
  • 26. reshape2 > library(reshape2) > acast(melt(statusDF, id.vars = c("statusSource", "wday", "month", "hour"), + measure.vars = c("textLength")), + month + statusSource ~ wday, mean, + subset = .(statusSource %in% c("YoruFukurou", "Twitter for iPhone") + & month %in% 9:11 & hour %in% 12:23 + & wday %in% c("Mon", "Sat", "Sun"))) Mon Sat Sun 9_YoruFukurou 43 42.13333 54.76471 9_Twitter for iPhone 16 27.70000 20.50000 10_YoruFukurou 61 41.70175 56.98333 10_Twitter for iPhone NaN 27.00000 24.50000 11_YoruFukurou 35 41.08197 57.32609 11_Twitter for iPhone NaN NaN 32.00000 R
  • 27. reshape2 melt melt cast melt cast > mstatus <- melt(statusDF, + id.vars = c("statusSource", "wday", "year", "month", "hour", "date"), + measure.vars = c("textLength", "cleanTextLength")) > mstatus[3157:3162, ] statusSource wday year month hour date variable value 3157 web Sun 2011 3 20 2011-03-13 textLength 72 3158 web Sun 2011 3 16 2011-03-13 textLength 24 3159 web Sun 2011 3 14 2011-03-13 textLength 82 3160 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 87 3161 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 14 3162 YoruFukurou Wed 2011 11 1 2011-11-23 cleanTextLength 21 id
  • 28. reshape2 cast cast formula fun.aggregate > args(acast) # array acast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL > args(dcast) # data.frame dcast function (data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value_var = guess_value(data)) NULL formula ... . acast hoge ~ fuga ~ piyo ※dcast 1 hoge ~ fuga + piyo
  • 29. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) ↑ cleanTextLength
  • 30. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 >
  • 31. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 >
  • 32. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 > # > acast(mstatus, hour ~ wday, length, subset = .(variable == "textLength"))
  • 33. > # > acast(mstatus, . ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun [1,] 408 360 258 294 334 801 704 > # > acast(mstatus, hour ~ wday, length, subset = .(variable == "textLength")) Mon Tue Wed Thu Fri Sat Sun 0 65 69 26 46 46 49 40 1 48 19 11 15 27 44 37 2 31 24 6 16 17 23 17 3 27 19 4 11 14 17 10 4 4 15 1 7 4 5 7 5 5 11 1 4 3 4 5 6 4 14 3 6 9 8 1
  • 34. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength"))
  • 35. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0
  • 36. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength"))
  • 37. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength")) , , 3 Mon Tue Wed Thu Fri Sat Sun 0 3 4 1 0 1 6 4 1 0 1 3 0 0 0 1
  • 38. > # > # > acast(mstatus, hour ~ wday + month, length, subset = .(variable == "textLength")) Mon_3 Mon_4 Mon_5 Mon_6 Mon_7 Mon_8 Mon_9 Mon_10 Mon_11 Tue_3 Tue_4 0 3 4 13 3 1 10 7 15 9 4 2 1 0 0 1 0 1 9 16 12 9 1 0 2 2 0 0 0 2 7 6 7 7 2 0 > # 3 > acast(mstatus, hour ~ wday ~ month, length, subset = .(variable == "textLength")) , , 3 3 Mon Tue Wed Thu Fri Sat Sun 0 3 4 1 0 1 6 4 1 0 1 3 0 0 0 1
  • 42. Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength"))
  • 43. Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength")) statusSource NA 1 YoruFukurou 47.51462, 32.57973 2 web 57.02720, 36.33534 3 Twitter for iPhone 33.42342, 23.06466 4 Twitter for Android 28.49048, 20.08457 5 Hatena 80.00000, 25.94212 6 other 52.58621, 33.12180 >
  • 44. Twitter reshape2 1 > # > dcast(mstatus, statusSource ~ ., + function(x) list(c(mean = mean(x), sd = sd(x))), + fill = list(c(mean = NaN, sd = NA)), ← + subset = .(variable == "textLength")) statusSource NA 1 YoruFukurou 47.51462, 32.57973 2 web 57.02720, 36.33534 3 Twitter for iPhone 33.42342, 23.06466 4 Twitter for Android 28.49048, 20.08457 5 Hatena 80.00000, 25.94212 6 other 52.58621, 33.12180 >
  • 45. > # t > pc <- unlist(subset(statusDF, + statusSource %in% c("YoruFukurou", "web"), + textLength)) > sp <- unlist(subset(statusDF, + grepl("(iPhone|Android)", statusSource), + textLength)) > t.test(sp, pc, var.equal = FALSE) Welch Two Sample t-test !! data: sp and pc t = -15.7921, df = 1588.246, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -19.85334 -15.46645 sample estimates: mean of x mean of y 31.83945 49.49935
  • 46. > # t > pc <- unlist(subset(statusDF, + statusSource %in% c("YoruFukurou", "web"), + textLength)) > sp <- unlist(subset(statusDF, + grepl("(iPhone|Android)", statusSource), + textLength)) > t.test(sp, pc, var.equal = FALSE) Welch Two Sample t-test !! data: sp and pc t = -15.7921, df = 1588.246, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -19.85334 -15.46645 sample estimates: mean of x mean of y 31.83945 49.49935
  • 47. > extractScreenNames <- function(text, strict = TRUE) { + if (strict) { + # Twitter screen_name + regex <- "(?:(?<!\\w)([@ ])((?>\\w+))(?![@ ])|[\\s\\S])" + } else { + # hoge@example.com + regex <- "(?:([@ ])(\\w+)|[\\s\\S])" + } + screenNames <- gsub(regex, "\\1\\2", text, perl = TRUE) + unique(unlist(strsplit(substring(screenNames, 2), "[@ ]"))) + } > screenNames <- unlist(lapply(statusDF$text, extractScreenNames)) > head(sort(table(screenNames), decreasing = TRUE), 10) # Top 10 screenNames naopr __gfx__ hirota_inoue mandy_44 ask_a_lie 105 85 51 47 40 ken_nishi nokuno yokkuns JinJin0613 kanon19_rie 39 39 33 20 20
  • 48. Twitter • • reshape2 • ggplot2 •
  • 50. ggplot2 plot(statusDF$wday, col = "blue") ggplot2 qplot(wday, data = statusDF, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "")
  • 51. ggplot2 qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 52. ggplot2 qplot(wday, data = statusDF, facets = ~ statusSource, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "") qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 53. ggplot2 qplot(wday, data = statusDF, facets = ~ statusSource, fill = I("blue"), alpha = I(0.7), xlab = "", ylab = "") qplot(wday, data = statusDF, fill = statusSource, xlab = "", ylab = "")
  • 54. qplot ggplot2 > args(qplot) function (x, y = NULL, z = NULL, ..., data, facets = . ~ ., margins = FALSE, geom = "auto", stat = list(NULL), position = list(NULL), xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL, xlab = deparse(substitute(x)), ylab = deparse(substitute(y)), asp = NA) NULL
  • 55. qplot geom geom area: bar: histogram: line: point:
  • 56. qplot geom geom area: bar: histogram: line: point: qplot(as.integer(wday), data = statusDF, geom = "area", stat = "bin", fill = statusSource, xlab = "", ylab = "", binwidth = 1)
  • 57. qplot geom geom area: bar: histogram: line: point: qplot(wday, data = statusDF, geom = "bar", stat = "bin", fill = statusSource, xlab = "", ylab = "")
  • 58. qplot geom geom area: bar: histogram: line: point: qplot(as.integer(wday), data = statusDF, geom = "line", stat = "bin", colour = statusSource, xlab = "", ylab = "", binwidth = 1)
  • 59. qplot geom geom area: bar: histogram: line: point: qplot(wday, data = statusDF, geom = "point", stat = "bin", colour = statusSource, xlab = "", ylab = "")
  • 60. qplot position position geom dodge : fill : 1 jitter : stack :
  • 61. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "dodge", xlab = "", ylab = "")
  • 62. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "fill", xlab = "", ylab = "")
  • 63. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "jitter", xlab = "", ylab = "")
  • 64. qplot position position geom dodge : fill : 1 jitter : stack : qplot(wday, data = statusDF, fill = statusSource, position = "stack", xlab = "", ylab = "")
  • 65. qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3
  • 66. qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3 qplot(wday, data = statusDF, xlab = "", ylab = "", facets = ~ statusSource)
  • 67. qplot facets facets geom ~ : 1 ~ 2: 1, 2 ※reshape2 1 ~ 2 + 3 qplot(wday, data = statusDF, xlab = "", ylab = "", facets = month ~ statusSource)
  • 68. qplot alpha : colour (color) : fill : linetype : size : colour, fill, linetype statusSource fill = I("blue") I (AsIs)
  • 69. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", alpha = as.integer(wday))
  • 70. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", colour = statusSource)
  • 71. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", fill = statusSource)
  • 72. qplot alpha : colour (color) : fill : linetype : size : qplot(wday, data = statusDF, xlab = "", ylab = "", linetype = statusSource, colour = statusSource)
  • 75. whotwi http://whotwi.com/
  • 76. whotwi http://whotwi.com/
  • 77. whotwi > # Twitter > # melt cast xtabs > cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF)) > head(cnt, 3) hour wday statusSource Freq 1 0 Mon YoruFukurou 48 2 1 Mon YoruFukurou 38 3 2 Mon YoruFukurou 25
  • 78. whotwi > # Twitter > # melt cast xtabs > cnt <- as.data.frame(xtabs(~ hour + wday + statusSource, statusDF)) > head(cnt, 3) hour wday statusSource Freq 1 0 Mon YoruFukurou 48 2 1 Mon YoruFukurou 38 3 2 Mon YoruFukurou 25 > freqSources <- by(cnt, cnt[c("hour", "wday")], function(df) { + # + freqSource <- with(df, statusSource[order(Freq, decreasing = TRUE) [1]]) + cbind(df[1, c("hour", "wday")], freqSource) + }) > freqSources <- do.call(rbind, freqSources) > head(freqSources, 3) hour wday freqSource 1 0 Mon YoruFukurou 2 1 Mon YoruFukurou 3 2 Mon YoruFukurou
  • 79. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31
  • 80. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq)
  • 81. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq) > p <- qplot(hour, wday, data = data, xlab = "", ylab = "", + geom = "point", colour = freqSource, size = Freq) > p # print(p)
  • 82. whotwi > # > cntSum <- as.data.frame(xtabs(Freq ~ hour + wday, cnt)) > head(cntSum, 3) hour wday Freq 1 0 Mon 65 2 1 Mon 48 3 2 Mon 31 > # > data <- merge(cntSum, freqSources) > # > data$wday <- factor(data$wday, levels = rev(levels(data$wday))) > # > data$Freq <- log2(data$Freq) > p <- qplot(hour, wday, data = data, xlab = "", ylab = "", + geom = "point", colour = freqSource, size = Freq) > p # print(p)
  • 85. whotwi > # whotwi theme > theme_whotwi <- function() { + opts( # + panel.background = theme_rect(fill = NA, colour = NA), + # + legend.key = theme_rect(fill = NA, colour = NA), + # + axis.ticks = theme_segment(colour = NA)) + } > p2 <- p + theme_whotwi() + scale_size(legend = FALSE) + scale_colour_hue(name = "") > p2
  • 86. whotwi > # whotwi theme > theme_whotwi <- function() { + opts( # + panel.background = theme_rect(fill = NA, colour = NA), + # + legend.key = theme_rect(fill = NA, colour = NA), + # + axis.ticks = theme_segment(colour = NA)) + } > p2 <- p + theme_whotwi() + scale_size(legend = FALSE) + scale_colour_hue(name = "") > p2
  • 90. whotwi PC
  • 91. whotwi PC PC
  • 92. Twitter • • reshape2 • ggplot2 •
  • 98. RMeCab MeCab R > library(RMeCab) > (docDF(data.frame(" "), column = 1, type = 1)) number of extracted terms = 5 now making a data frame. wait a while! TERM POS1 POS2 Row1 1 1 2 1 3 1 4 2 5 2
  • 99. http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html : : :1 : : :0.999995 : : :0.999979 : : :0.999979 : : :0.999645 : : :0.999486 : : :0.999314 ...
  • 100. > # > pndic <- read.table("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic", + sep = ":", + col.names = c("term", "kana", "pos", "value"), + colClasses = c("character", "character", "factor", "numeric"), + fileEncoding = "Shift_JIS") > # > # > pndic2 <- aggregate(value ~ term + pos, pndic, mean)
  • 101. > # pndic > pos <- unique(pndic2$pos) > tweetDF <- docDF(statusDF, column = "cleanText", type = 1, pos = pos) number of extracted terms = 7164 now making a data frame. wait a while! > tweetDF[2900:2904, 1:5] TERM POS1 POS2 Row1 Row2 2900 0 0 2901 0 0 2902 0 0 2903 0 0 2904 0 0 > # pndic > tweetDF <- subset(tweetDF, TERM %in% pndic2$term) > # > tweetDF <- merge(tweetDF, pndic2, by.x = c("TERM", "POS1"), by.y = c ("term", "pos"))
  • 102. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117
  • 103. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117
  • 104. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765
  • 105. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765
  • 106. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 107. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 108. > # > score <- colSums(tweetDF[4:(ncol(tweetDF) - 1)] * tweetDF$value) > # > sum(score > 0) [1] 117 > # > sum(score < 0) [1] 2765 > # > sum(score == 0) [1] 277
  • 109. > table(ifelse(pndic$value > 0, "positive", + ifelse(pndic$value == 0, "neutral", "negative"))) negative neutral positive 49983 20 5122
  • 110. > table(ifelse(pndic$value > 0, "positive", + ifelse(pndic$value == 0, "neutral", "negative"))) negative neutral positive 49983 20 5122
  • 111. > m <- mean(score) > # > tweetType <- factor(ifelse(score > m, "positive", + ifelse(score == m, "neutral", "negative")), + levels = c("positive", "neutral", "negative")) > table(tweetType) tweetType positive neutral negative 1912 0 1247
  • 112. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 113. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 114. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 115. > statusDF$tweetType <- droplevels(tweetType) > # > qplot(month, data = statusDF, + geom = "bar", fill = tweetType, position = "fill")
  • 118. twitteR • RJSONIO • • ID status ID • fav favorited TRUE • truncated TRUE • DM • status character factor
  • 119. twitteR • RJSONIO • • ID status ID • fav favorited TRUE • truncated TRUE • DM • status character factor
  • 120. OAuth ” ” twitteR -
  • 122. • twitteR • reshape2 R • ggplot2 • RMeCab R
  • 123. • twitteR • reshape2 R • ggplot2 • RMeCab R • PC •
  • 127. status > statuses[[1]]$text [1] " " > statuses[[1]]$getText() # [1] " " > # > statuses[[1]]$text <- " " > statuses[[1]]$getText() [1] " " > statuses[[1]]$setText("ggrks") # > statuses[[1]]$getText() [1] "ggrks" > # > statuses[[1]]$getCreated() [1] "2011-11-23 22:16:24 UTC"
  • 128. removeSpecialStr removeSpecialStr <- function(text) { removeURL(removeHashTag(removeScreenName(text))) }
  • 129. removeScreenName removeScreenName <- function(text, strict = TRUE) { if (strict) { regex <- "(?<!\\w)[@＠](?>\\w+)(?![@＠])" } else { regex <- "[@＠]\\w+" } gsub(regex, "", text, perl = TRUE) }
  • 130. removeURL removeURL <- function(text, strict = TRUE) { if (strict) { regex <- "(?<![-.\\w#@=!'\"/])https?://(?:[^:]+:.+@)?(?:[0-9A-Za-z][-0-9A-Za-z]*(?<!-)\\.)+[A-Za-z]+(?:/[-\\w#%=+,.?!&~]*)*" } else { regex <- "https?://[-\\w#%=+,.?!&~/]+" } gsub(regex, "", text, perl = TRUE) }
  • 131. removeHashTag removeHashTag <- function(text, strict = TRUE) { delimiters <- "\\s,.\u3000-\u3002\uFF01\uFF1F" # cf. http://nobu666.com/2011/07/13/914.html validJa <- "\u3041-\u3094\u3099-\u309C\u30A1-\u30FA\u30FC\u3400-\uD7A3\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFF9E" if (strict) { regex <- sprintf("(^|[%s])(?:([#＃](?>[0-9]+)(?!\\w))|[#＃][\\w%s]+)", delimiters, validJa, validJa) } else { regex <- sprintf("[#＃][^%s]+", delimiters) } gsub(regex, "\\1\\2", text, perl = TRUE) }