{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " "

> head(talks_en)
# A tibble: 6 x 2
title_en transcript_en
<chr> <chr>
1 Fake videos of real people — and how to … Look at these images. Now…
2 How to build synthetic DNA and send it a… "Alright, let me tell you…
3 Technology that knows what you're feeling "What happens when techno…
4 How to get empowered, not overpowered, b… "After 13.8 billion years…
5 Where joy hides and how to find it "It's 2008, and I'm just …
6 ""You Found Me"" (Cello music starts) You …

> talks_en %>% tidytext::unnest_tokens(word, transcript_en)

> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
# A tibble: 9,840 x 2
title_en word
<chr> <chr>
1 Fake videos of real people — and how to spot them look
2 Fake videos of real people — and how to spot them at
3 Fake videos of real people — and how to spot them these
4 Fake videos of real people — and how to spot them images
5 Fake videos of real people — and how to spot them now
6 Fake videos of real people — and how to spot them tell
7 Fake videos of real people — and how to spot them me
8 Fake videos of real people — and how to spot them which
9 Fake videos of real people — and how to spot them obama
10 Fake videos of real people — and how to spot them here

> talks_ja %>% head()
# A tibble: 6 x 2
title_ja transcript_ja
<chr> <chr>
1 … …
2 DNA … …
3 …
4 AI AI … 138 …
5 2008 …
6 You Found Me … …

> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows

> mecab_result <- talks_ja %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 1
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...

> mecab_result <- talks_ja %>%
+ as.data.frame() %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 2551
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
$ : Named chr [1:2903] " " " " " " " " ...
..- attr(*, "names")= chr [1:2903] " " " " " " " " ...
$ : Named chr [1:2208] " " " " " " " " ...
..- attr(*, "names")= chr [1:2208] " " " " " " " " ...

> # tibble 1 tibble
> class(talks_ja[, "transcript_ja"])
[1] "tbl_df" "tbl" "data.frame"
> # 1
> length(talks_ja[, "transcript_ja"])
[1] 1
> # data.frame 1
> class(as.data.frame(talks_ja)[, "transcript_ja"])
[1] "character"
> #
> length(as.data.frame(talks_ja)[, "transcript_ja"])
[1] 2551

> tokens_ja <- purrr::pmap_df(list(nv = mecab_result,
+ title = talks_ja$title_ja),
+ function(nv, title){
+ tibble(title = title,
+ word = nv,
+ hinshi = names(nv))
+ })

> tokens_ja
# A tibble: 6,483,469 x 3
title word hinshi
<chr> <chr> <chr>
1
2
3
4
5
6

> bigram_en <- talks_en %>% select(title_en, transcript_en) %>%
+ tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams",
n = 2)
> head(bigram_en)
# A tibble: 6 x 2
title_en bigram
<chr> <chr>
1 ""(Nothing But) Flowers" with string quartet" music here
2 ""(Nothing But) Flowers" with string quartet" here we
3 ""(Nothing But) Flowers" with string quartet" we stand
4 ""(Nothing But) Flowers" with string quartet" stand like
5 ""(Nothing But) Flowers" with string quartet" like an
6 ""(Nothing But) Flowers" with string quartet" an adam

> bigram_ja <- talks_ja %>%
+ as.data.frame() %>%
+ docDF(col = "transcript_ja", type=1, N = 2)
number of extracted terms = 898167
now making a data frame. wait a while!

> bigram_ja.bk %>%
+ select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7,
Row8, Row9) %>%
+ head()
TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9
1 !-( - 0 0 0 0 0 0 0 0 0
2 !-7 - 0 0 0 0 0 0 0 0 0
3 !-Google - 0 0 0 0 0 0 0 0 0
4 !-Little - 0 0 0 0 0 0 0 0 0
5 !-Time - 0 0 0 0 0 0 0 0 0
6 !-Toonchi - 0 0 0 0 0 0 0 0 0

> bigram_ja <- tokens_ja %>%
+ group_by(title) %>%
+ rename(word1 = word,
+ hinshi1 = hinshi) %>%
+ mutate(word2 = lead(word1),
+ hinshi2 = lead(hinshi1)) %>%
+ ungroup() %>%
+ filter(!is.na(word2)) %>%
+ select(title, word1, word2, hinshi1, hinshi2)

> bigram_ja
# A tibble: 6,480,920 x 5
title word1 word2 hinshi1 hinshi2
<chr> <chr> <chr> <chr> <chr>
1
2
3
4
5
6
7
8

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

More Related Content

What's hot

More from Takashi Kitano

Recently uploaded

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析