> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " "
前回の神会




> head(talks_en)
# A tibble: 6 x 2
title_en transcript_en
<chr> <chr>
1 Fake videos of real people — and how to … Look at these images. Now…
2 How to build synthetic DNA and send it a… "Alright, let me tell you…
3 Technology that knows what you're feeling "What happens when techno…
4 How to get empowered, not overpowered, b… "After 13.8 billion years…
5 Where joy hides and how to find it "It's 2008, and I'm just …
6 ""You Found Me"" (Cello music starts) You …
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
# A tibble: 9,840 x 2
title_en word
<chr> <chr>
1 Fake videos of real people — and how to spot them look
2 Fake videos of real people — and how to spot them at
3 Fake videos of real people — and how to spot them these
4 Fake videos of real people — and how to spot them images
5 Fake videos of real people — and how to spot them now
6 Fake videos of real people — and how to spot them tell
7 Fake videos of real people — and how to spot them me
8 Fake videos of real people — and how to spot them which
9 Fake videos of real people — and how to spot them obama
10 Fake videos of real people — and how to spot them here
> talks_ja %>% head()
# A tibble: 6 x 2
title_ja transcript_ja
<chr> <chr>
1 … …
2 DNA … …
3 …
4 AI AI … 138 …
5 2008 …
6 You Found Me … …
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
からの罠
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
> mecab_result <- talks_ja %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 1
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
> mecab_result <- talks_ja %>%
+ as.data.frame() %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 2551
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
$ : Named chr [1:2903] " " " " " " " " ...
..- attr(*, "names")= chr [1:2903] " " " " " " " " ...
$ : Named chr [1:2208] " " " " " " " " ...
..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
> # tibble 1 tibble
> class(talks_ja[, "transcript_ja"])
[1] "tbl_df" "tbl" "data.frame"
> # 1
> length(talks_ja[, "transcript_ja"])
[1] 1
> # data.frame 1
> class(as.data.frame(talks_ja)[, "transcript_ja"])
[1] "character"
> #
> length(as.data.frame(talks_ja)[, "transcript_ja"])
[1] 2551
> tokens_ja <- purrr::pmap_df(list(nv = mecab_result,
+ title = talks_ja$title_ja),
+ function(nv, title){
+ tibble(title = title,
+ word = nv,
+ hinshi = names(nv))
+ })
> tokens_ja
# A tibble: 6,483,469 x 3
title word hinshi
<chr> <chr> <chr>
1
2
3
4
5
6
# ... with 6,483,463 more rows
> bigram_en <- talks_en %>% select(title_en, transcript_en) %>%
+ tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams",
n = 2)
> head(bigram_en)
# A tibble: 6 x 2
title_en bigram
<chr> <chr>
1 ""(Nothing But) Flowers" with string quartet" music here
2 ""(Nothing But) Flowers" with string quartet" here we
3 ""(Nothing But) Flowers" with string quartet" we stand
4 ""(Nothing But) Flowers" with string quartet" stand like
5 ""(Nothing But) Flowers" with string quartet" like an
6 ""(Nothing But) Flowers" with string quartet" an adam
> bigram_ja <- talks_ja %>%
+ as.data.frame() %>%
+ docDF(col = "transcript_ja", type=1, N = 2)
number of extracted terms = 898167
now making a data frame. wait a while!
> bigram_ja.bk %>%
+ select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7,
Row8, Row9) %>%
+ head()
TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9
1 !-( - 0 0 0 0 0 0 0 0 0
2 !-7 - 0 0 0 0 0 0 0 0 0
3 !-Google - 0 0 0 0 0 0 0 0 0
4 !-Little - 0 0 0 0 0 0 0 0 0
5 !-Time - 0 0 0 0 0 0 0 0 0
6 !-Toonchi - 0 0 0 0 0 0 0 0 0
> bigram_ja <- tokens_ja %>%
+ group_by(title) %>%
+ rename(word1 = word,
+ hinshi1 = hinshi) %>%
+ mutate(word2 = lead(word1),
+ hinshi2 = lead(hinshi1)) %>%
+ ungroup() %>%
+ filter(!is.na(word2)) %>%
+ select(title, word1, word2, hinshi1, hinshi2)
> bigram_ja
# A tibble: 6,480,920 x 5
title word1 word2 hinshi1 hinshi2
<chr> <chr> <chr> <chr> <chr>
1
2
3
4
5
6
7
8
# ... with 6,480,912 more rows




{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

  • 2.
    > me $name [1] "Takashi Kitano" $twitter [1] "@kashitan" $work_in [1] " "
  • 3.
  • 5.
  • 7.
    > head(talks_en) # A tibble: 6 x 2 title_en transcript_en <chr> <chr> 1 Fake videos of real people — and how to … Look at these images. Now… 2 How to build synthetic DNA and send it a… "Alright, let me tell you… 3 Technology that knows what you're feeling "What happens when techno… 4 How to get empowered, not overpowered, b… "After 13.8 billion years… 5 Where joy hides and how to find it "It's 2008, and I'm just … 6 ""You Found Me"" (Cello music starts) You …
  • 8.
    > talks_en %>% tidytext::unnest_tokens(word, transcript_en)
  • 9.
    > talks_en %>%tidytext::unnest_tokens(word, transcript_en) # A tibble: 9,840 x 2 title_en word <chr> <chr> 1 Fake videos of real people — and how to spot them look 2 Fake videos of real people — and how to spot them at 3 Fake videos of real people — and how to spot them these 4 Fake videos of real people — and how to spot them images 5 Fake videos of real people — and how to spot them now 6 Fake videos of real people — and how to spot them tell 7 Fake videos of real people — and how to spot them me 8 Fake videos of real people — and how to spot them which 9 Fake videos of real people — and how to spot them obama 10 Fake videos of real people — and how to spot them here
  • 11.
    > talks_ja %>%head() # A tibble: 6 x 2 title_ja transcript_ja <chr> <chr> 1 … … 2 DNA … … 3 … 4 AI AI … 138 … 5 2008 … 6 You Found Me … …
  • 12.
    > talks_ja %>%tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 13.
  • 14.
    > talks_ja %>%tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 18.
    > mecab_result <-talks_ja %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 1 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
  • 19.
    > mecab_result <-talks_ja %>% + as.data.frame() %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 2551 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ... $ : Named chr [1:2903] " " " " " " " " ... ..- attr(*, "names")= chr [1:2903] " " " " " " " " ... $ : Named chr [1:2208] " " " " " " " " ... ..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
  • 20.
    > # tibble 1 tibble > class(talks_ja[, "transcript_ja"]) [1] "tbl_df" "tbl" "data.frame" > # 1 > length(talks_ja[, "transcript_ja"]) [1] 1 > # data.frame 1 > class(as.data.frame(talks_ja)[, "transcript_ja"]) [1] "character" > # > length(as.data.frame(talks_ja)[, "transcript_ja"]) [1] 2551
  • 21.
    > tokens_ja <-purrr::pmap_df(list(nv = mecab_result, + title = talks_ja$title_ja), + function(nv, title){ + tibble(title = title, + word = nv, + hinshi = names(nv)) + })
  • 22.
    > tokens_ja # A tibble: 6,483,469 x 3 title word hinshi <chr> <chr> <chr> 1 2 3 4 5 6 # ... with 6,483,463 more rows
  • 24.
    > bigram_en <-talks_en %>% select(title_en, transcript_en) %>% + tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams", n = 2) > head(bigram_en) # A tibble: 6 x 2 title_en bigram <chr> <chr> 1 ""(Nothing But) Flowers" with string quartet" music here 2 ""(Nothing But) Flowers" with string quartet" here we 3 ""(Nothing But) Flowers" with string quartet" we stand 4 ""(Nothing But) Flowers" with string quartet" stand like 5 ""(Nothing But) Flowers" with string quartet" like an 6 ""(Nothing But) Flowers" with string quartet" an adam
  • 26.
    > bigram_ja <-talks_ja %>% + as.data.frame() %>% + docDF(col = "transcript_ja", type=1, N = 2) number of extracted terms = 898167 now making a data frame. wait a while!
  • 27.
    > bigram_ja.bk %>% + select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7, Row8, Row9) %>% + head() TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9 1 !-( - 0 0 0 0 0 0 0 0 0 2 !-7 - 0 0 0 0 0 0 0 0 0 3 !-Google - 0 0 0 0 0 0 0 0 0 4 !-Little - 0 0 0 0 0 0 0 0 0 5 !-Time - 0 0 0 0 0 0 0 0 0 6 !-Toonchi - 0 0 0 0 0 0 0 0 0
  • 29.
    > bigram_ja <-tokens_ja %>% + group_by(title) %>% + rename(word1 = word, + hinshi1 = hinshi) %>% + mutate(word2 = lead(word1), + hinshi2 = lead(hinshi1)) %>% + ungroup() %>% + filter(!is.na(word2)) %>% + select(title, word1, word2, hinshi1, hinshi2)
  • 30.
    > bigram_ja # A tibble: 6,480,920 x 5 title word1 word2 hinshi1 hinshi2 <chr> <chr> <chr> <chr> <chr> 1 2 3 4 5 6 7 8 # ... with 6,480,912 more rows
  • 32.