●

●
●
●
●
○
○
○
○
●
●
●
●
●
●
●
●
…
●
●
●
●
●
CREATE EXTERNAL TABLE emails (
mid STRUCT<ts: TIMESTAMP, value: STRING>,
dateLong STRUCT<ts: TIMESTAMP, value: BIGINT>,
fr...
SELECT
fromStr.value AS fromStr,
count(1) AS count
FROM emails
GROUP BY fromStr.value
ORDER BY count DESC
LIMIT 10;
SELECT
fromStr.value AS fromStr,
trim(splitToStr) AS toStr,
count(1) AS count
FROM emails
LATERAL VIEW
explode(split(toStr...
●
●
●
○
○
Emails Table

Sentiment

User Emails

Producer
SELECT
((year(datelong.ts)-1999)*52+weekofyear(datelong.ts))
AS weeknum,
avg(sentiment.value) AS avgsentiment,
stddev(sent...
SELECT
lword AS word,
sum(sentiment) AS totalsentiment
FROM (
SELECT
mid.value AS mid,
lower(word) AS lword,
sentiment.val...
●
●
●

●
●
●
●
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Exploring the Enron Email Dataset with Kiji and Hive
Upcoming SlideShare
Loading in …5
×

Exploring the Enron Email Dataset with Kiji and Hive

1,729 views

Published on

Talk given at September 2013 SF Hadoop Users Group by Lee Sheng
http://www.meetup.com/hadoopsf/events/136499862/

Published in: Technology, Business
0 Comments
0 Likes
Statistics
Notes
  • Be the first to comment

  • Be the first to like this

No Downloads
Views
Total views
1,729
On SlideShare
0
From Embeds
0
Number of Embeds
31
Actions
Shares
0
Downloads
51
Comments
0
Likes
0
Embeds 0
No embeds

No notes for slide

Exploring the Enron Email Dataset with Kiji and Hive

  1. 1. ● ● ● ●
  2. 2. ● ○ ○ ○ ○ ● ● ●
  3. 3. ● ● ● ● ●
  4. 4.
  5. 5. ● ● ● ● ●
  6. 6. CREATE EXTERNAL TABLE emails ( mid STRUCT<ts: TIMESTAMP, value: STRING>, dateLong STRUCT<ts: TIMESTAMP, value: BIGINT>, fromStr STRUCT<ts: TIMESTAMP, value: STRING>, toStr STRUCT<ts: TIMESTAMP, value: STRING>, subject STRUCT<ts: TIMESTAMP, value: STRING>, body STRUCT<ts: TIMESTAMP, value: STRING>, ) STORED BY 'org.kiji.hive.KijiTableStorageHandler' WITH SERDEPROPERTIES ( 'kiji.columns' = 'info:mid[0],info:date[0],info:from[0],info:to[0],' + 'info:subject[0],info:body[0]' ) TBLPROPERTIES ( 'kiji.table.uri' = ' kiji://.env/enron_email/emails ' );
  7. 7. SELECT fromStr.value AS fromStr, count(1) AS count FROM emails GROUP BY fromStr.value ORDER BY count DESC LIMIT 10;
  8. 8. SELECT fromStr.value AS fromStr, trim(splitToStr) AS toStr, count(1) AS count FROM emails LATERAL VIEW explode(split(toStr.value,',')) tos AS splitToStr GROUP BY fromStr.value,trim(splitToStr) ORDER BY count DESC LIMIT 10;
  9. 9. ● ● ● ○ ○
  10. 10. Emails Table Sentiment User Emails Producer
  11. 11. SELECT ((year(datelong.ts)-1999)*52+weekofyear(datelong.ts)) AS weeknum, avg(sentiment.value) AS avgsentiment, stddev(sentiment.value) AS stddevsentiment, count(1) AS nummessages FROM emails WHERE regexp_replace(fromStr.value,".*@","")=="enron.com" GROUP BY ((year(datelong.ts)-1999)*52+weekofyear(datelong.ts));
  12. 12. SELECT lword AS word, sum(sentiment) AS totalsentiment FROM ( SELECT mid.value AS mid, lower(word) AS lword, sentiment.value AS sentiment FROM emails LATERAL VIEW explode(sentences(body.value)[0]) wds AS word WHERE regexp_replace(fromStr.value,".*@","")=="enron.com" ) subquery GROUP BY lword ORDER BY totalsentiment ASC;
  13. 13. ● ● ● ●
  14. 14. ● ● ●

×