SlideShare a Scribd company logo
1 of 33
Download to read offline
> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " "
前回の神会




> head(talks_en)
# A tibble: 6 x 2
title_en transcript_en
<chr> <chr>
1 Fake videos of real people — and how to … Look at these images. Now…
2 How to build synthetic DNA and send it a… "Alright, let me tell you…
3 Technology that knows what you're feeling "What happens when techno…
4 How to get empowered, not overpowered, b… "After 13.8 billion years…
5 Where joy hides and how to find it "It's 2008, and I'm just …
6 ""You Found Me"" (Cello music starts) You …
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
# A tibble: 9,840 x 2
title_en word
<chr> <chr>
1 Fake videos of real people — and how to spot them look
2 Fake videos of real people — and how to spot them at
3 Fake videos of real people — and how to spot them these
4 Fake videos of real people — and how to spot them images
5 Fake videos of real people — and how to spot them now
6 Fake videos of real people — and how to spot them tell
7 Fake videos of real people — and how to spot them me
8 Fake videos of real people — and how to spot them which
9 Fake videos of real people — and how to spot them obama
10 Fake videos of real people — and how to spot them here
> talks_ja %>% head()
# A tibble: 6 x 2
title_ja transcript_ja
<chr> <chr>
1 … …
2 DNA … …
3 …
4 AI AI … 138 …
5 2008 …
6 You Found Me … …
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
からの罠
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
> mecab_result <- talks_ja %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 1
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
> mecab_result <- talks_ja %>%
+ as.data.frame() %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 2551
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
$ : Named chr [1:2903] " " " " " " " " ...
..- attr(*, "names")= chr [1:2903] " " " " " " " " ...
$ : Named chr [1:2208] " " " " " " " " ...
..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
> # Extracting a single column from a tibble with [, "col"] still returns a tibble
> class(talks_ja[, "transcript_ja"])
[1] "tbl_df" "tbl" "data.frame"
> # length() of a one-column tibble is 1 (the number of columns, not rows)
> length(talks_ja[, "transcript_ja"])
[1] 1
> # Extracting a single column from a data.frame with [, "col"] returns a plain vector
> class(as.data.frame(talks_ja)[, "transcript_ja"])
[1] "character"
> # length() of the extracted character vector is the number of rows
> length(as.data.frame(talks_ja)[, "transcript_ja"])
[1] 2551
> tokens_ja <- purrr::pmap_df(list(nv = mecab_result,
+ title = talks_ja$title_ja),
+ function(nv, title){
+ tibble(title = title,
+ word = nv,
+ hinshi = names(nv))
+ })
> tokens_ja
# A tibble: 6,483,469 x 3
title word hinshi
<chr> <chr> <chr>
1
2
3
4
5
6
# ... with 6,483,463 more rows
> bigram_en <- talks_en %>% select(title_en, transcript_en) %>%
+ tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams", n = 2)
> head(bigram_en)
# A tibble: 6 x 2
title_en bigram
<chr> <chr>
1 ""(Nothing But) Flowers" with string quartet" music here
2 ""(Nothing But) Flowers" with string quartet" here we
3 ""(Nothing But) Flowers" with string quartet" we stand
4 ""(Nothing But) Flowers" with string quartet" stand like
5 ""(Nothing But) Flowers" with string quartet" like an
6 ""(Nothing But) Flowers" with string quartet" an adam
> bigram_ja <- talks_ja %>%
+ as.data.frame() %>%
+ docDF(col = "transcript_ja", type=1, N = 2)
number of extracted terms = 898167
now making a data frame. wait a while!
> bigram_ja.bk %>%
+ select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7, Row8, Row9) %>%
+ head()
TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9
1 !-( - 0 0 0 0 0 0 0 0 0
2 !-7 - 0 0 0 0 0 0 0 0 0
3 !-Google - 0 0 0 0 0 0 0 0 0
4 !-Little - 0 0 0 0 0 0 0 0 0
5 !-Time - 0 0 0 0 0 0 0 0 0
6 !-Toonchi - 0 0 0 0 0 0 0 0 0
> bigram_ja <- tokens_ja %>%
+ group_by(title) %>%
+ rename(word1 = word,
+ hinshi1 = hinshi) %>%
+ mutate(word2 = lead(word1),
+ hinshi2 = lead(hinshi1)) %>%
+ ungroup() %>%
+ filter(!is.na(word2)) %>%
+ select(title, word1, word2, hinshi1, hinshi2)
> bigram_ja
# A tibble: 6,480,920 x 5
title word1 word2 hinshi1 hinshi2
<chr> <chr> <chr> <chr> <chr>
1
2
3
4
5
6
7
8
# ... with 6,480,912 more rows




{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

More Related Content

What's hot

変分推論法(変分ベイズ法)(PRML第10章)
変分推論法(変分ベイズ法)(PRML第10章)変分推論法(変分ベイズ法)(PRML第10章)
変分推論法(変分ベイズ法)(PRML第10章)
Takao Yamanaka
 
相関と因果について考える:統計的因果推論、その(不)可能性の中心
相関と因果について考える:統計的因果推論、その(不)可能性の中心相関と因果について考える:統計的因果推論、その(不)可能性の中心
相関と因果について考える:統計的因果推論、その(不)可能性の中心
takehikoihayashi
 

What's hot (20)

構造方程式モデルによる因果推論: 因果構造探索に関する最近の発展
構造方程式モデルによる因果推論: 因果構造探索に関する最近の発展構造方程式モデルによる因果推論: 因果構造探索に関する最近の発展
構造方程式モデルによる因果推論: 因果構造探索に関する最近の発展
 
ベイズファクターとモデル選択
ベイズファクターとモデル選択ベイズファクターとモデル選択
ベイズファクターとモデル選択
 
変分推論法(変分ベイズ法)(PRML第10章)
変分推論法(変分ベイズ法)(PRML第10章)変分推論法(変分ベイズ法)(PRML第10章)
変分推論法(変分ベイズ法)(PRML第10章)
 
forestFloorパッケージを使ったrandomForestの感度分析
forestFloorパッケージを使ったrandomForestの感度分析forestFloorパッケージを使ったrandomForestの感度分析
forestFloorパッケージを使ったrandomForestの感度分析
 
『バックドア基準の入門』@統数研研究集会
『バックドア基準の入門』@統数研研究集会『バックドア基準の入門』@統数研研究集会
『バックドア基準の入門』@統数研研究集会
 
Stan超初心者入門
Stan超初心者入門Stan超初心者入門
Stan超初心者入門
 
データサイエンス概論第一 5 時系列データの解析
データサイエンス概論第一 5 時系列データの解析データサイエンス概論第一 5 時系列データの解析
データサイエンス概論第一 5 時系列データの解析
 
Rによるベイジアンネットワーク入門
Rによるベイジアンネットワーク入門Rによるベイジアンネットワーク入門
Rによるベイジアンネットワーク入門
 
相関と因果について考える:統計的因果推論、その(不)可能性の中心
相関と因果について考える:統計的因果推論、その(不)可能性の中心相関と因果について考える:統計的因果推論、その(不)可能性の中心
相関と因果について考える:統計的因果推論、その(不)可能性の中心
 
統計学基礎
統計学基礎統計学基礎
統計学基礎
 
ようやく分かった!最尤推定とベイズ推定
ようやく分かった!最尤推定とベイズ推定ようやく分かった!最尤推定とベイズ推定
ようやく分かった!最尤推定とベイズ推定
 
R seminar on igraph
R seminar on igraphR seminar on igraph
R seminar on igraph
 
Rで因子分析 商用ソフトで実行できない因子分析のあれこれ
Rで因子分析 商用ソフトで実行できない因子分析のあれこれRで因子分析 商用ソフトで実行できない因子分析のあれこれ
Rで因子分析 商用ソフトで実行できない因子分析のあれこれ
 
因果推論の奥へ: "What works" meets "why it works"
因果推論の奥へ: "What works" meets "why it works"因果推論の奥へ: "What works" meets "why it works"
因果推論の奥へ: "What works" meets "why it works"
 
Rの高速化
Rの高速化Rの高速化
Rの高速化
 
データサイエンス概論第一=2-1 データ間の距離と類似度
データサイエンス概論第一=2-1 データ間の距離と類似度データサイエンス概論第一=2-1 データ間の距離と類似度
データサイエンス概論第一=2-1 データ間の距離と類似度
 
計量経済学と 機械学習の交差点入り口 (公開用)
計量経済学と 機械学習の交差点入り口 (公開用)計量経済学と 機械学習の交差点入り口 (公開用)
計量経済学と 機械学習の交差点入り口 (公開用)
 
幾何を使った統計のはなし
幾何を使った統計のはなし幾何を使った統計のはなし
幾何を使った統計のはなし
 
因果探索: 基本から最近の発展までを概説
因果探索: 基本から最近の発展までを概説因果探索: 基本から最近の発展までを概説
因果探索: 基本から最近の発展までを概説
 
基礎からのベイズ統計学 輪読会資料 第4章 メトロポリス・ヘイスティングス法
基礎からのベイズ統計学 輪読会資料 第4章 メトロポリス・ヘイスティングス法基礎からのベイズ統計学 輪読会資料 第4章 メトロポリス・ヘイスティングス法
基礎からのベイズ統計学 輪読会資料 第4章 メトロポリス・ヘイスティングス法
 

Similar to {tidytext}と{RMeCab}によるモダンな日本語テキスト分析

Derrubando mitos em Python
Derrubando mitos em PythonDerrubando mitos em Python
Derrubando mitos em Python
Denis Costa
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
pugpe
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdf
annikasarees
 

Similar to {tidytext}と{RMeCab}によるモダンな日本語テキスト分析 (20)

PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
 
pa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processingpa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processing
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出す
 
Pre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to Elixir
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 Minutes
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την Python
 
Derrubando mitos em Python
Derrubando mitos em PythonDerrubando mitos em Python
Derrubando mitos em Python
 
Elixir
ElixirElixir
Elixir
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadies
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
 
R programming language
R programming languageR programming language
R programming language
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdf
 
Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1
 
Python 1
Python 1Python 1
Python 1
 
Py ohio
Py ohioPy ohio
Py ohio
 
Helvetia
HelvetiaHelvetia
Helvetia
 
Easy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraEasy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtra
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析
 
Delete statement in PHP
Delete statement in PHPDelete statement in PHP
Delete statement in PHP
 

More from Takashi Kitano

好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
Takashi Kitano
 
Google's r style guideのすゝめ
Google's r style guideのすゝめGoogle's r style guideのすゝめ
Google's r style guideのすゝめ
Takashi Kitano
 

More from Takashi Kitano (12)

好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
 
{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips
 
20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門
 
mxnetで頑張る深層学習
mxnetで頑張る深層学習mxnetで頑張る深層学習
mxnetで頑張る深層学習
 
可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜
 
Rによるウイスキー分析
Rによるウイスキー分析Rによるウイスキー分析
Rによるウイスキー分析
 
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
 
20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor
 
lubridateパッケージ入門
lubridateパッケージ入門lubridateパッケージ入門
lubridateパッケージ入門
 
20150329 tokyo r47
20150329 tokyo r4720150329 tokyo r47
20150329 tokyo r47
 
20140920 tokyo r43
20140920 tokyo r4320140920 tokyo r43
20140920 tokyo r43
 
Google's r style guideのすゝめ
Google's r style guideのすゝめGoogle's r style guideのすゝめ
Google's r style guideのすゝめ
 

Recently uploaded

如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
wsppdmt
 
Abortion pills in Jeddah |+966572737505 | get cytotec
Abortion pills in Jeddah |+966572737505 | get cytotecAbortion pills in Jeddah |+966572737505 | get cytotec
Abortion pills in Jeddah |+966572737505 | get cytotec
Abortion pills in Riyadh +966572737505 get cytotec
 
Huawei Ransomware Protection Storage Solution Technical Overview Presentation...
Huawei Ransomware Protection Storage Solution Technical Overview Presentation...Huawei Ransomware Protection Storage Solution Technical Overview Presentation...
Huawei Ransomware Protection Storage Solution Technical Overview Presentation...
LuisMiguelPaz5
 
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Klinik kandungan
 
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
acoha1
 
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi ArabiaIn Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
ahmedjiabur940
 
Abortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotec
Abortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotecAbortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotec
Abortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotec
Abortion pills in Riyadh +966572737505 get cytotec
 
Simplify hybrid data integration at an enterprise scale. Integrate all your d...
Simplify hybrid data integration at an enterprise scale. Integrate all your d...Simplify hybrid data integration at an enterprise scale. Integrate all your d...
Simplify hybrid data integration at an enterprise scale. Integrate all your d...
varanasisatyanvesh
 
Reconciling Conflicting Data Curation Actions: Transparency Through Argument...
Reconciling Conflicting Data Curation Actions:  Transparency Through Argument...Reconciling Conflicting Data Curation Actions:  Transparency Through Argument...
Reconciling Conflicting Data Curation Actions: Transparency Through Argument...
Bertram Ludäscher
 
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
zifhagzkk
 
obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...
obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...
obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...
yulianti213969
 

Recently uploaded (20)

如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
 
Abortion pills in Jeddah |+966572737505 | get cytotec
Abortion pills in Jeddah |+966572737505 | get cytotecAbortion pills in Jeddah |+966572737505 | get cytotec
Abortion pills in Jeddah |+966572737505 | get cytotec
 
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTSDBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
DBMS UNIT 5 46 CONTAINS NOTES FOR THE STUDENTS
 
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
Identify Customer Segments to Create Customer Offers for Each Segment - Appli...
 
Huawei Ransomware Protection Storage Solution Technical Overview Presentation...
Huawei Ransomware Protection Storage Solution Technical Overview Presentation...Huawei Ransomware Protection Storage Solution Technical Overview Presentation...
Huawei Ransomware Protection Storage Solution Technical Overview Presentation...
 
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
 
DAA Assignment Solution.pdf is the best1
DAA Assignment Solution.pdf is the best1DAA Assignment Solution.pdf is the best1
DAA Assignment Solution.pdf is the best1
 
SCI8-Q4-MOD11.pdfwrwujrrjfaajerjrajrrarj
SCI8-Q4-MOD11.pdfwrwujrrjfaajerjrajrrarjSCI8-Q4-MOD11.pdfwrwujrrjfaajerjrajrrarj
SCI8-Q4-MOD11.pdfwrwujrrjfaajerjrajrrarj
 
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
如何办理(WashU毕业证书)圣路易斯华盛顿大学毕业证成绩单本科硕士学位证留信学历认证
 
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi ArabiaIn Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
 
Abortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotec
Abortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotecAbortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotec
Abortion pills in Riyadh Saudi Arabia (+966572737505 buy cytotec
 
Predictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting TechniquesPredictive Precipitation: Advanced Rain Forecasting Techniques
Predictive Precipitation: Advanced Rain Forecasting Techniques
 
Simplify hybrid data integration at an enterprise scale. Integrate all your d...
Simplify hybrid data integration at an enterprise scale. Integrate all your d...Simplify hybrid data integration at an enterprise scale. Integrate all your d...
Simplify hybrid data integration at an enterprise scale. Integrate all your d...
 
Reconciling Conflicting Data Curation Actions: Transparency Through Argument...
Reconciling Conflicting Data Curation Actions:  Transparency Through Argument...Reconciling Conflicting Data Curation Actions:  Transparency Through Argument...
Reconciling Conflicting Data Curation Actions: Transparency Through Argument...
 
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
如何办理(Dalhousie毕业证书)达尔豪斯大学毕业证成绩单留信学历认证
 
obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...
obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...
obat aborsi Tarakan wa 081336238223 jual obat aborsi cytotec asli di Tarakan9...
 
SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...
SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...
SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...
 
jll-asia-pacific-capital-tracker-1q24.pdf
jll-asia-pacific-capital-tracker-1q24.pdfjll-asia-pacific-capital-tracker-1q24.pdf
jll-asia-pacific-capital-tracker-1q24.pdf
 
Digital Transformation Playbook by Graham Ware
Digital Transformation Playbook by Graham WareDigital Transformation Playbook by Graham Ware
Digital Transformation Playbook by Graham Ware
 
Ranking and Scoring Exercises for Research
Ranking and Scoring Exercises for ResearchRanking and Scoring Exercises for Research
Ranking and Scoring Exercises for Research
 

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

  • 1.
  • 2. > me $name [1] "Takashi Kitano" $twitter [1] "@kashitan" $work_in [1] " "
  • 4.
  • 6.
  • 7. > head(talks_en) # A tibble: 6 x 2 title_en transcript_en <chr> <chr> 1 Fake videos of real people — and how to … Look at these images. Now… 2 How to build synthetic DNA and send it a… "Alright, let me tell you… 3 Technology that knows what you're feeling "What happens when techno… 4 How to get empowered, not overpowered, b… "After 13.8 billion years… 5 Where joy hides and how to find it "It's 2008, and I'm just … 6 ""You Found Me"" (Cello music starts) You …
  • 8. > talks_en %>% tidytext::unnest_tokens(word, transcript_en)
  • 9. > talks_en %>% tidytext::unnest_tokens(word, transcript_en) # A tibble: 9,840 x 2 title_en word <chr> <chr> 1 Fake videos of real people — and how to spot them look 2 Fake videos of real people — and how to spot them at 3 Fake videos of real people — and how to spot them these 4 Fake videos of real people — and how to spot them images 5 Fake videos of real people — and how to spot them now 6 Fake videos of real people — and how to spot them tell 7 Fake videos of real people — and how to spot them me 8 Fake videos of real people — and how to spot them which 9 Fake videos of real people — and how to spot them obama 10 Fake videos of real people — and how to spot them here
  • 10.
  • 11. > talks_ja %>% head() # A tibble: 6 x 2 title_ja transcript_ja <chr> <chr> 1 … … 2 DNA … … 3 … 4 AI AI … 138 … 5 2008 … 6 You Found Me … …
  • 12. > talks_ja %>% tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 14. > talks_ja %>% tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 15.
  • 16.
  • 17.
  • 18. > mecab_result <- talks_ja %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 1 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
  • 19. > mecab_result <- talks_ja %>% + as.data.frame() %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 2551 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ... $ : Named chr [1:2903] " " " " " " " " ... ..- attr(*, "names")= chr [1:2903] " " " " " " " " ... $ : Named chr [1:2208] " " " " " " " " ... ..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
  • 20. > # tibble 1 tibble > class(talks_ja[, "transcript_ja"]) [1] "tbl_df" "tbl" "data.frame" > # 1 > length(talks_ja[, "transcript_ja"]) [1] 1 > # data.frame 1 > class(as.data.frame(talks_ja)[, "transcript_ja"]) [1] "character" > # > length(as.data.frame(talks_ja)[, "transcript_ja"]) [1] 2551
  • 21. > tokens_ja <- purrr::pmap_df(list(nv = mecab_result, + title = talks_ja$title_ja), + function(nv, title){ + tibble(title = title, + word = nv, + hinshi = names(nv)) + })
  • 22. > tokens_ja # A tibble: 6,483,469 x 3 title word hinshi <chr> <chr> <chr> 1 2 3 4 5 6 # ... with 6,483,463 more rows
  • 23.
  • 24. > bigram_en <- talks_en %>% select(title_en, transcript_en) %>% + tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams", n = 2) > head(bigram_en) # A tibble: 6 x 2 title_en bigram <chr> <chr> 1 ""(Nothing But) Flowers" with string quartet" music here 2 ""(Nothing But) Flowers" with string quartet" here we 3 ""(Nothing But) Flowers" with string quartet" we stand 4 ""(Nothing But) Flowers" with string quartet" stand like 5 ""(Nothing But) Flowers" with string quartet" like an 6 ""(Nothing But) Flowers" with string quartet" an adam
  • 25.
  • 26. > bigram_ja <- talks_ja %>% + as.data.frame() %>% + docDF(col = "transcript_ja", type=1, N = 2) number of extracted terms = 898167 now making a data frame. wait a while!
  • 27. > bigram_ja.bk %>% + select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7, Row8, Row9) %>% + head() TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9 1 !-( - 0 0 0 0 0 0 0 0 0 2 !-7 - 0 0 0 0 0 0 0 0 0 3 !-Google - 0 0 0 0 0 0 0 0 0 4 !-Little - 0 0 0 0 0 0 0 0 0 5 !-Time - 0 0 0 0 0 0 0 0 0 6 !-Toonchi - 0 0 0 0 0 0 0 0 0
  • 28.
  • 29. > bigram_ja <- tokens_ja %>% + group_by(title) %>% + rename(word1 = word, + hinshi1 = hinshi) %>% + mutate(word2 = lead(word1), + hinshi2 = lead(hinshi1)) %>% + ungroup() %>% + filter(!is.na(word2)) %>% + select(title, word1, word2, hinshi1, hinshi2)
  • 30. > bigram_ja # A tibble: 6,480,920 x 5 title word1 word2 hinshi1 hinshi2 <chr> <chr> <chr> <chr> <chr> 1 2 3 4 5 6 7 8 # ... with 6,480,912 more rows
  • 31.