KTH_Detail day_화성에서 온 개발자 금성에서 온 기획자 시리즈_5차_데이터분석_조범석_20120613

데이터를 통해 보는 우리와 세상과의 관계

푸딩 조범석

반쪽을 찾습니다
형 쌍둥이자리 태양인
개발자 푸딩랩
커피 핸드드립 년차 활자중독
자학개그

관계를 형성하기까지

관계에 대한 기준 수립

데이터 수집

데이터 해석

결론

그것을 알아내기 위해선 무엇이 필요한가

정형 데이터

숫자
비정형 데이터

텍스트 이미지 영상 등

투표소
지역 개표소
박후보 표
안후보 표
투표소 중앙 선관위
박후보 표
안후보 표

투표소
지역 개표소
박후보 표
안후보 표
투표소

package hadoopwordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

public static class WordTokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>
{

private final static IntWritable one = new IntWritable(1);
private Text word = new Text();

/**
* map() gets a key, value, and context (which we'll ignore for the moment).
* key - seems to be "bytes from the beginning of the file"
* value - the current line; we are being fed one line at a time from the
* input file
*
* here's what the key and value look like if i print them out with the first
* println statement below:
*
* [map] key: (0), value: ([Weekly Compilation of Presidential Documents])
* [map] key: (47), value: (From the 2002 Presidential Documents Online via GPO Access [frwais.access.gpo.gov])
* [map] key: (130), value: ([DOCID:pd04fe02_txt-11] )
* [map] key: (179), value: ()
* [map] key: (180), value: ([Page 133-139])
*
* in the tokenizer loop, each token is a "word" from the current line, so the first token from
* the first line is "Weekly", then "Compilation", and so on. as a result, the output from the loop
* over the first line looks like this:
*
* [map] key: (0), value: ([Weekly Compilation of Presidential Documents])
* [map, in loop] token: ([Weekly)
* [map, in loop] token: (Compilation)
* [map, in loop] token: (of)
* [map, in loop] token: (Presidential)
* [map, in loop] token: (Documents])
*
*/
public void map(Object key,
! ! Text value,
! ! Context context)
throws IOException, InterruptedException
{
//System.err.println(String.format("[map] key: (%s), value: (%s)", key, value));
// break each sentence into words, using the punctuation characters shown
StringTokenizer tokenizer = new StringTokenizer(value.toString(), " tnrf,.:;?![]'");
while (tokenizer.hasMoreTokens())
{
// make the words lowercase so words like "an" and "An" are counted as one word
String s = tokenizer.nextToken().toLowerCase().trim();
System.err.println(String.format("[map, in loop] token: (%s)", s));

word.set(s);
context.write(word, one);
}
}
}

/**
* this is the reducer class.
* some magic happens before the data gets to us. the key and values data looks like this:
*
* [reduce] key: (Afghan), value: (1)
* [reduce] key: (Afghanistan), value: (1, 1, 1, 1, 1, 1, 1)
* [reduce] key: (Afghanistan,), value: (1, 1, 1)
* [reduce] key: (Africa), value: (1, 1)
* [reduce] key: (Al), value: (1)
*
* there are also many '0' values in the data:
*
* [reduce] key: (while), value: (0)
* [reduce] key: (who), value: (0)
* ...
*
* note that the input to this function is sorted, so it begins with numbers,
* like "000", then starts with "a", "about", and so on, after the numbers are printed.
*
*/
public static class WordOccurrenceReducer
extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable occurrencesOfWord = new IntWritable();

public void reduce(Text key,
! ! Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException
{
// debug output
//printKeyAndValues(key, values);
// the actual reducer work
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
occurrencesOfWord.set(sum);
// this writes the word and the count, like this: ("Africa", 2)
context.write(key, occurrencesOfWord);
// my debug output
System.err.println(String.format("[reduce] word: (%s), count: (%d)", key, occurrencesOfWord.get()));
}

// a little method to print debug output
private void printKeyAndValues(Text key, Iterable<IntWritable> values)
{
StringBuilder sb = new StringBuilder();
for (IntWritable val : values)
{
sb.append(val.get() + ", ");
}
System.err.println(String.format("[reduce] key: (%s), value: (%s)", key, sb.toString()));
}
}

/**
* the "driver" class. it sets everything up, then gets it started.
*/

public static void main(String[] args)
throws Exception
{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2)
{
System.err.println("Usage: wordcount <inputFile> <outputDir>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordTokenizerMapper.class);
job.setCombinerClass(WordOccurrenceReducer.class);
job.setReducerClass(WordOccurrenceReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

문 잘 짜야 빨리 처리 되듯
함수도 잘 짜야 합니다
그러기 위해선 어떻게 분석할지 잘 알아야 해요

물론 요즘에는 좀 더 쉽게 작성할 수 있게 도와주는
도구들이 나오고 있습니다

유사 사용자 군집화

20~30

유사 행위 군집화

A

A G
Y

통계적 추론
가설 및 검증
기술 통계
탐색적 자료 분석

개발실 내부에 서비스 복제본 공개
로그 데이터 수집 시스템 구축 중
서비스 분석 진행 중

아임 사용자 푸시 광고

단골집 제외
기존 이벤트 장소 제외
사용자별 자주 가는 카테고리 및 장소 추출

추출된 사용자에게 추천 푸시 광고 집행


차 차 차
강남 한정식 청주 술집 꽃씨 이벤트

실험군 % % %

대조군 % % %
체크인 기준

꽃
김춘수

내가 그의 이름을 불러 주기 전에는
그는 다만 하나의 몸짓에 지나지 않았다

내가 그의 이름을 불렀을 때
그는 나에게로 와서 꽃이 되었다

내가 그의 이름을 불러준 것처럼
나의 이 빛깔과 향기에 알맞는
누가 나의 이름을 불러다오

그에게로 가서 나도
그의 꽃이 되고 싶다

우리들은 모두 무엇이 되고 싶다
너는 나에게 나는 너에게
잊혀지지 않는 하나의 눈짓이 되고 싶다

KTH_Detail day_화성에서 온 개발자 금성에서 온 기획자 시리즈_5차_데이터분석_조범석_20120613

More Related Content

What's hot

More from KTH, 케이티하이텔

KTH_Detail day_화성에서 온 개발자 금성에서 온 기획자 시리즈_5차_데이터분석_조범석_20120613