SlideShare a Scribd company logo
1 of 28
Download to read offline
BASICS OF DATA
 MUNGING IN R

      &
DATA MUNGING
OUR GOAL
•


•
    –
    –
    –
    –
    –
SOURCES:
LET’S GET DATA
DATA LOOKS LIKE THIS
TEXT INTO R
       > read.table()
•      > read.csv()

       # Read a table
       > url <- 'http://robjhyndman.com/tsdldata/roberts/beards.dat'

       > read.table(url, header=FALSE,skip=4)
          V1
       1 20
•      2 24
       3 10
       4 21
       5 28
       … …

       # Read a CSV file
       > Y <- read.csv(filename, header=F)
PLAIN TEXT
DATA LOOKS LIKE THIS
MARKUP INTO R
                                                        •
> library(XML)
> url2 <- 'http://www.faa.gov/data_research/passengers_cargo/unruly_passengers/'
> X <- readHTMLTable(url2, header=T, stringsAsFactors=FALSE)[[1]]
> X
   Year                   Total
1 1995                      146
2 1996                      184
3 1997                      235
4 1998                      200
5 1999                      226
6 2000                      227
7 2001                      300
8 2002                      306
9 2003                      302
10 2004                     330
11 2005                     226
12 2006                     156
13 2007                     176
14 2008                     134
15 2009                     176
16 2010                     148
17 2011                     131
18 2012 12 as of April 10, 2012
HTML
FROM OTHER LANGUAGES
> library(xlsx)
> library(foreign)
                      •
                      •
# SAS
> read.xport(file)

# Stata
> read.dta(file)      •
# SPSS
> read.spss(file)         –
                          –
# Matlab
> read.octave(file)
                          –
                          –
# minitab
> read.mtp(file)
                          –
                          –
                          –
CUSTOM PACKAGES
              >   library(quantmod)
•             >   library(twitteR)
              >   library(RNYTimes)
              >   library(RClimate)


              > getSymbols("GOOG")
              [1] "GOOG"

•             >
              searchTwitter('#ilovestatistics'
    –         ,n=10)[[2]]

              [1]"Statistics: the best kind of
              homework #ilovestatistics #nerd
              #shouldhavebeenastatistician
    –         #gradschoolproblems"
BUILDING AND
EXPLORING DATA
TYPES OF DATA
            > a <- c(1,2,3,4)
            > b <- matrix(c(1,2,3,4), nrow=2)
            > c <- list("a"="fred", "b"="bill")
            > d <- data.frame(b)

    –       > a # VECTOR
    –       [1] 1 2 3 4

•           > b # MATRIX
                 [,1] [,2]
            [1,]    1    3
            [2,]    2    4

            > c # LIST (Note the key-value structure)
•           $a
            [1] "fred"
            $b
            [1] "bill"
    –
            > d # DATA FRAME
              X1 X2
            1 1 3
            2 2 4
CONVERSION AND COERCION
              > as.data.frame(a)
•               a
              1 1
    –         2 2
              3 3
              4 4
              > data.frame(a)
                a
    –         1 1
              2 2
              3 3
        •     4 4

              > as.matrix(a, nrow=2) # WATCH OUT
        •          [,1]
              [1,]    1
              [2,]    2
    –         [3,]    3
              [4,]    4
              > matrix(a, nrow=2)    # THIS INSTEAD!
                   [,1] [,2]
              [1,]    1    3
              [2,]    2    4
VARIABLE INTERROGATION
                 > Y <- runif(200)
•                > str(Y)
                 num [1:200] 0.5053 0.3564 0.0359 0.7377
                 0.0302 ...

                 > head(Y) # GIVE ME THE FIRST 6
                 [1] 0.50525553 0.35636648 0.03589792
•                0.73766891 0.03020607 0.50628327

    –            > tail(Y) # GIVE ME THE LAST 6
                 [1] 0.6612501 0.9930194 0.8392855
    –            0.5459498 0.2587155 0.3704778

    –            > dim(Y) # NOPE! HE IS A VECTOR
                 NULL
    –
                 > length(Y)
                 [1] 200
INDEXES



A[m,n]

A[ ,n]
USING INDEXES
    > b
         [,1] [,2]
    [1,]    1    3
    [2,]    2    4

    > b[,1]   # ALL ROWS, FIRST COLUMN
    [1] 1 2

    > b[2,]   # SECOND ROW, ALL COLUMNS
    [1] 2 4

    > head(unemp)
       rank     region Aug. 2012 Sept. 201 change
    14   14    alabama       8.5       8.3   -0.2
    15   14     alaska       7.7       7.5   -0.2
    27   27    arizona       8.3       8.2   -0.1
    16   14   arkansas       7.3       7.1   -0.2
    2     2 california      10.6      10.2   -0.4
    17   14   colorado       8.2       8.0   -0.2

    > b[-1,] # ALL ROWS EXCEPT 1
    [1] 2 4

    > b>3
          [,1]  [,2]
    [1,] FALSE FALSE
    [2,] FALSE  TRUE

    > unemp[unemp[4]>10,]
       rank       region Aug. 2012 Sept. 201 change
    2     2   california      10.6      10.2   -0.4
    11    6       nevada      12.1      11.8   -0.3
    24   14 rhode island      10.7      10.5   -0.2

    > GOOG[GOOG[,6]>768.00,6]
               GOOG.Adjusted
    2012-10-04        768.05
USE NAMES INSTEAD: $
                                           > X <- 1
•                                          > X$name <- "Fred"
                                           Warning message:
                                           In X$name <- "Fred" : Coercing LHS
                                           to a list
                                           > X$occupation <- "Doctor"
                                           > X$age <- 21
     –
> name <- c("Fred", "Bill")                > X
> occupation <- c("Doctor", "Dancer")      [[1]]
> people <- data.frame(name, occupation)   [1] 1

> people
                                           $name
  name     occupation
                                           [1] "Fred"
1 Fred         Doctor
2 Bill         Dancer
                                           $occupation
> people$age <- 35                         [1] "Doctor"
> people
  name      occupation age                 $age
1 Fred          Doctor 35                  [1] 21
2 Bill          Dancer 35

people[people$name=="Fred",]$age=40
                                           > X$name == X[2]
MORE STRUCTURE



cbind(a,b)
rbind(a,b)
TRANSFORMING DATA
SPEAK LIKE A NATIVE
•                  > mymatrix <-
                   matrix(rep(seq(2,6,by=2), 3),
                   ncol = 3)
    –
        •          > mymatrix
                        [,1] [,2] [,3]
    –              [1,]    2    2    2
        •          [2,]    4    4    4
                   [3,]    6    6    6
    –
        •          > apply(mymatrix, 1, sum)
                   [1] 6 12 18
    –
        •          > apply(mymatrix, 2, sum)
                   [1] 12 12 12
LAPPLY
•        > lapply(mymatrix[,1],sum)
         [[1]]
         [1] 2

         [[2]]
•        [1] 4

         [[3]]
         [1] 6
    –
    –    > sapply(mymatrix[,1],sum)
    –    [1] 2 4 6
LONG TO WIDE
Language Skill Users
1        R High     10
2        R   Med     10
3        R   Low     10
4      SAS High       1
5      SAS   Med    25
6      SAS   Low     20

Language Users.High Users.Med Users.Low
1        R         10        10        10
4      SAS          1        25        20
RESHAPE GYMNASTICS
                                         > df <-
•                                        data.frame(c("R","R","R","SAS","SAS","SAS"),
                                         c("High","Med","Low","High","Med","Low"),
                                         c(10,10,10,1,25,20)); colnames(df) <-
       –                                 c("Language","Skill","Users")

                                         > df
                                           Language Skill Users
                                         1        R  High    10
                                         2        R   Med    10
                                         3        R   Low    10
                                         4      SAS  High     1
                                         5      SAS   Med    25
                                         6      SAS   Low    20

                                         > reshape(df, idvar="Language",
                                         timevar="Skill", direction="wide")
                                           Language Users.High Users.Med Users.Low
                                         1        R         10        10        10
                                         4      SAS          1        25        20

    > reshape(df2, direction="long")
             Language Skill Users.High
    R.High          R  High         10
    SAS.High      SAS  High          1
    R.Med           R   Med         10
    SAS.Med       SAS   Med         25
    R.Low           R   Low         10
    SAS.Low       SAS   Low         20

    > df3[order(df3$Language),]
NEW VARIABLES IN-PLACE
                 > head(mtcars)[1]
                                    mpg
                 Mazda RX4         21.0
                 Mazda RX4 Wag     21.0
                 Datsun 710        22.8
                 Hornet 4 Drive    21.4
                 Hornet Sportabout 18.7
                 Valiant           18.1
    –
                 > head(with(mtcars, mpg*10)) # NEW VECTOR
•                [1] 210 210 228 214 187 181


    –            > head(transform(mtcars,
                 electricdreams=mpg*10))[c(1,12)]
                                    mpg electricdreams
    –            Mazda RX4         21.0            210
                 Mazda RX4 Wag     21.0            210
    –            Datsun 710        22.8            228
                 Hornet 4 Drive    21.4            214
                 Hornet Sportabout 18.7            187
                 Valiant           18.1            181
MASH UP




> head(unemp)
      region rank Aug. 2012 Sept. 201 change DEV state_code State Abbreviation
1    alabama   14       8.5       8.3   -0.2 0.4         01                 AL
2     alaska   14       7.7       7.5   -0.2 -0.4        02                 AK
3    arizona   27       8.3       8.2   -0.1 0.3         04                 AZ
4   arkansas   14       7.3       7.1   -0.2 -0.8        05                 AR
5 california    2      10.6      10.2   -0.4 2.3         06                 CA
6   colorado   14       8.2       8.0   -0.2 0.1         08                 CO
THANKS
•
•

More Related Content

What's hot

Invertible-syntax 入門
Invertible-syntax 入門Invertible-syntax 入門
Invertible-syntax 入門Hiromi Ishii
 
Metaprogramming in Haskell
Metaprogramming in HaskellMetaprogramming in Haskell
Metaprogramming in HaskellHiromi Ishii
 
A deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsA deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsCheng Min Chi
 
CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...
CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...
CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...Ontico
 
Data profiling with Apache Calcite
Data profiling with Apache CalciteData profiling with Apache Calcite
Data profiling with Apache CalciteJulian Hyde
 
Drools5 Community Training Module 3 Drools Expert DRL Syntax
Drools5 Community Training Module 3 Drools Expert DRL SyntaxDrools5 Community Training Module 3 Drools Expert DRL Syntax
Drools5 Community Training Module 3 Drools Expert DRL SyntaxMauricio (Salaboy) Salatino
 
Django101 geodjango
Django101 geodjangoDjango101 geodjango
Django101 geodjangoCalvin Cheng
 
Saving Gaia with GeoDjango
Saving Gaia with GeoDjangoSaving Gaia with GeoDjango
Saving Gaia with GeoDjangoCalvin Cheng
 
Slick: Bringing Scala’s Powerful Features to Your Database Access
Slick: Bringing Scala’s Powerful Features to Your Database Access Slick: Bringing Scala’s Powerful Features to Your Database Access
Slick: Bringing Scala’s Powerful Features to Your Database Access Rebecca Grenier
 
Switching from java to groovy
Switching from java to groovySwitching from java to groovy
Switching from java to groovyPaul Woods
 
Nik Graf - Get started with Reason and ReasonReact
Nik Graf - Get started with Reason and ReasonReactNik Graf - Get started with Reason and ReasonReact
Nik Graf - Get started with Reason and ReasonReactOdessaJS Conf
 
Deep dive to PostgreSQL Indexes
Deep dive to PostgreSQL IndexesDeep dive to PostgreSQL Indexes
Deep dive to PostgreSQL IndexesIbrar Ahmed
 
関数潮流(Function Tendency)
関数潮流(Function Tendency)関数潮流(Function Tendency)
関数潮流(Function Tendency)riue
 
Introducción rápida a SQL
Introducción rápida a SQLIntroducción rápida a SQL
Introducción rápida a SQLCarlos Hernando
 
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...PROIDEA
 
TDC218SP | Trilha Kotlin - DSLs in a Kotlin Way
TDC218SP | Trilha Kotlin - DSLs in a Kotlin WayTDC218SP | Trilha Kotlin - DSLs in a Kotlin Way
TDC218SP | Trilha Kotlin - DSLs in a Kotlin Waytdc-globalcode
 
Drools5 Community Training HandsOn 1 Drools DRL Syntax
Drools5 Community Training HandsOn 1 Drools DRL SyntaxDrools5 Community Training HandsOn 1 Drools DRL Syntax
Drools5 Community Training HandsOn 1 Drools DRL SyntaxMauricio (Salaboy) Salatino
 

What's hot (20)

Codigos
CodigosCodigos
Codigos
 
Invertible-syntax 入門
Invertible-syntax 入門Invertible-syntax 入門
Invertible-syntax 入門
 
Metaprogramming in Haskell
Metaprogramming in HaskellMetaprogramming in Haskell
Metaprogramming in Haskell
 
A deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsA deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internals
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...
CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...
CREATE INDEX … USING VODKA. VODKA CONNECTING INDEXES, Олег Бартунов, Александ...
 
Data profiling with Apache Calcite
Data profiling with Apache CalciteData profiling with Apache Calcite
Data profiling with Apache Calcite
 
Drools5 Community Training Module 3 Drools Expert DRL Syntax
Drools5 Community Training Module 3 Drools Expert DRL SyntaxDrools5 Community Training Module 3 Drools Expert DRL Syntax
Drools5 Community Training Module 3 Drools Expert DRL Syntax
 
The PostgreSQL JSON Feature Tour
The PostgreSQL JSON Feature TourThe PostgreSQL JSON Feature Tour
The PostgreSQL JSON Feature Tour
 
Django101 geodjango
Django101 geodjangoDjango101 geodjango
Django101 geodjango
 
Saving Gaia with GeoDjango
Saving Gaia with GeoDjangoSaving Gaia with GeoDjango
Saving Gaia with GeoDjango
 
Slick: Bringing Scala’s Powerful Features to Your Database Access
Slick: Bringing Scala’s Powerful Features to Your Database Access Slick: Bringing Scala’s Powerful Features to Your Database Access
Slick: Bringing Scala’s Powerful Features to Your Database Access
 
Switching from java to groovy
Switching from java to groovySwitching from java to groovy
Switching from java to groovy
 
Nik Graf - Get started with Reason and ReasonReact
Nik Graf - Get started with Reason and ReasonReactNik Graf - Get started with Reason and ReasonReact
Nik Graf - Get started with Reason and ReasonReact
 
Deep dive to PostgreSQL Indexes
Deep dive to PostgreSQL IndexesDeep dive to PostgreSQL Indexes
Deep dive to PostgreSQL Indexes
 
関数潮流(Function Tendency)
関数潮流(Function Tendency)関数潮流(Function Tendency)
関数潮流(Function Tendency)
 
Introducción rápida a SQL
Introducción rápida a SQLIntroducción rápida a SQL
Introducción rápida a SQL
 
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
 
TDC218SP | Trilha Kotlin - DSLs in a Kotlin Way
TDC218SP | Trilha Kotlin - DSLs in a Kotlin WayTDC218SP | Trilha Kotlin - DSLs in a Kotlin Way
TDC218SP | Trilha Kotlin - DSLs in a Kotlin Way
 
Drools5 Community Training HandsOn 1 Drools DRL Syntax
Drools5 Community Training HandsOn 1 Drools DRL SyntaxDrools5 Community Training HandsOn 1 Drools DRL Syntax
Drools5 Community Training HandsOn 1 Drools DRL Syntax
 

Similar to Basics of data munging in R

PRE: Datamining 2nd R
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd Rsesejun
 
Datamining R 1st
Datamining R 1stDatamining R 1st
Datamining R 1stsesejun
 
Datamining r 1st
Datamining r 1stDatamining r 1st
Datamining r 1stsesejun
 
Introduction to R
Introduction to RIntroduction to R
Introduction to RStacy Irwin
 
Extending Operators in Perl with Operator::Util
Extending Operators in Perl with Operator::UtilExtending Operators in Perl with Operator::Util
Extending Operators in Perl with Operator::UtilNova Patch
 
جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲
جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲
جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲Mohammad Reza Kamalifard
 
R for Pirates. ESCCONF October 27, 2011
R for Pirates. ESCCONF October 27, 2011R for Pirates. ESCCONF October 27, 2011
R for Pirates. ESCCONF October 27, 2011Mandi Walls
 
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)Wataru Shito
 
useR!2010 matome
useR!2010 matomeuseR!2010 matome
useR!2010 matomeybenjo
 
Datamining r 4th
Datamining r 4thDatamining r 4th
Datamining r 4thsesejun
 
第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)
第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)
第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)Wataru Shito
 
Datamining R 4th
Datamining R 4thDatamining R 4th
Datamining R 4thsesejun
 
第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出しWataru Shito
 
R v01 rprogamming_basic01 (R 프로그래밍 기본)
R v01 rprogamming_basic01 (R 프로그래밍 기본)R v01 rprogamming_basic01 (R 프로그래밍 기본)
R v01 rprogamming_basic01 (R 프로그래밍 기본)BuskersBu
 
Getting started with R when analysing GitHub commits
Getting started with R when analysing GitHub commitsGetting started with R when analysing GitHub commits
Getting started with R when analysing GitHub commitsBarbara Fusinska
 

Similar to Basics of data munging in R (20)

PRE: Datamining 2nd R
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd R
 
Datamining R 1st
Datamining R 1stDatamining R 1st
Datamining R 1st
 
Datamining r 1st
Datamining r 1stDatamining r 1st
Datamining r 1st
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
R for you
R for youR for you
R for you
 
BA lab1.pptx
BA lab1.pptxBA lab1.pptx
BA lab1.pptx
 
Extending Operators in Perl with Operator::Util
Extending Operators in Perl with Operator::UtilExtending Operators in Perl with Operator::Util
Extending Operators in Perl with Operator::Util
 
جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲
جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲
جلسه سوم پایتون برای هکر های قانونی دوره مقدماتی پاییز ۹۲
 
R for Pirates. ESCCONF October 27, 2011
R for Pirates. ESCCONF October 27, 2011R for Pirates. ESCCONF October 27, 2011
R for Pirates. ESCCONF October 27, 2011
 
R programming language
R programming languageR programming language
R programming language
 
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
 
Dr Joshua Bishop, WWF Australia - Presentation - UNAA Vic Natural Capital Sem...
Dr Joshua Bishop, WWF Australia - Presentation - UNAA Vic Natural Capital Sem...Dr Joshua Bishop, WWF Australia - Presentation - UNAA Vic Natural Capital Sem...
Dr Joshua Bishop, WWF Australia - Presentation - UNAA Vic Natural Capital Sem...
 
useR!2010 matome
useR!2010 matomeuseR!2010 matome
useR!2010 matome
 
Datamining r 4th
Datamining r 4thDatamining r 4th
Datamining r 4th
 
第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)
第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)
第2回 基本演算,データ型の基礎,ベクトルの操作方法(解答付き)
 
Datamining R 4th
Datamining R 4thDatamining R 4th
Datamining R 4th
 
第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し
 
R
RR
R
 
R v01 rprogamming_basic01 (R 프로그래밍 기본)
R v01 rprogamming_basic01 (R 프로그래밍 기본)R v01 rprogamming_basic01 (R 프로그래밍 기본)
R v01 rprogamming_basic01 (R 프로그래밍 기본)
 
Getting started with R when analysing GitHub commits
Getting started with R when analysing GitHub commitsGetting started with R when analysing GitHub commits
Getting started with R when analysing GitHub commits
 

Recently uploaded

Maximizing Board Effectiveness 2024 Webinar.pptx
Maximizing Board Effectiveness 2024 Webinar.pptxMaximizing Board Effectiveness 2024 Webinar.pptx
Maximizing Board Effectiveness 2024 Webinar.pptxOnBoard
 
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024BookNet Canada
 
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024BookNet Canada
 
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j
 
How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?XfilesPro
 
My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationRidwan Fadjar
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationMichael W. Hawkins
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdfhans926745
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024Rafal Los
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking MenDelhi Call girls
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsEnterprise Knowledge
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxMalak Abu Hammad
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Allon Mureinik
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticscarlostorres15106
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...shyamraj55
 
Slack Application Development 101 Slides
Slack Application Development 101 SlidesSlack Application Development 101 Slides
Slack Application Development 101 Slidespraypatel2
 
SIEMENS: RAPUNZEL – A Tale About Knowledge Graph
SIEMENS: RAPUNZEL – A Tale About Knowledge GraphSIEMENS: RAPUNZEL – A Tale About Knowledge Graph
SIEMENS: RAPUNZEL – A Tale About Knowledge GraphNeo4j
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machinePadma Pradeep
 

Recently uploaded (20)

Maximizing Board Effectiveness 2024 Webinar.pptx
Maximizing Board Effectiveness 2024 Webinar.pptxMaximizing Board Effectiveness 2024 Webinar.pptx
Maximizing Board Effectiveness 2024 Webinar.pptx
 
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
 
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
 
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
 
How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?
 
My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 Presentation
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptx
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
 
Slack Application Development 101 Slides
Slack Application Development 101 SlidesSlack Application Development 101 Slides
Slack Application Development 101 Slides
 
SIEMENS: RAPUNZEL – A Tale About Knowledge Graph
SIEMENS: RAPUNZEL – A Tale About Knowledge GraphSIEMENS: RAPUNZEL – A Tale About Knowledge Graph
SIEMENS: RAPUNZEL – A Tale About Knowledge Graph
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machine
 

Basics of data munging in R

  • 1. BASICS OF DATA MUNGING IN R &
  • 3. OUR GOAL • • – – – – –
  • 6. TEXT INTO R > read.table() • > read.csv() # Read a table > url <- 'http://robjhyndman.com/tsdldata/roberts/beards.dat' > read.table(url, header=FALSE,skip=4) V1 1 20 • 2 24 3 10 4 21 5 28 … … # Read a CSV file > Y <- read.csv(filename, header=F)
  • 9. MARKUP INTO R • > library(XML) > url2 <- 'http://www.faa.gov/data_research/passengers_cargo/unruly_passengers/' > X <- readHTMLTable(url2, header=T, stringsAsFactors=FALSE)[[1]] > X Year Total • 1 1995 146 2 1996 184 3 1997 235 4 1998 200 5 1999 226 – 6 2000 227 7 2001 300 8 2002 306 9 2003 302 – 10 2004 330 11 2005 226 12 2006 156 – 13 2007 176 14 2008 134 – 15 2009 176 16 2010 148 17 2011 131 18 2012 12 as of April 10, 2012 –
  • 10. HTML
  • 11. FROM OTHER LANGUAGES > library(xlsx) > library(foreign) • • # SAS > read.xport(file) # Stata > read.dta(file) • # SPSS > read.spss(file) – – # Matlab > read.octave(file) – – # minitab > read.mtp(file) – – –
  • 12. CUSTOM PACKAGES > library(quantmod) • > library(twitteR) > library(RNYTimes) > library(RClimate) > getSymbols("GOOG") [1] "GOOG" • > searchTwitter('#ilovestatistics' – ,n=10)[[2]] [1]"Statistics: the best kind of homework #ilovestatistics #nerd #shouldhavebeenastatistician – #gradschoolproblems"
  • 14. TYPES OF DATA • > a <- c(1,2,3,4) > b <- matrix(c(1,2,3,4), nrow=2) – > c <- list("a"="fred", "b"="bill") – > d <- data.frame(b) – > a # VECTOR – [1] 1 2 3 4 • > b # MATRIX [,1] [,2] [1,] 1 3 [2,] 2 4 > c # LIST (Note the key-value structure) • $a [1] "fred" $b [1] "bill" – > d # DATA FRAME X1 X2 1 1 3 2 2 4
  • 15. CONVERSION AND COERCION > as.data.frame(a) • a 1 1 – 2 2 3 3 4 4 > data.frame(a) a – 1 1 2 2 3 3 • 4 4 > as.matrix(a, nrow=2) # WATCH OUT • [,1] [1,] 1 [2,] 2 – [3,] 3 [4,] 4 > matrix(a, nrow=2) # THIS INSTEAD! [,1] [,2] [1,] 1 3 [2,] 2 4
  • 16. VARIABLE INTERROGATION > Y <- runif(200) • > str(Y) num [1:200] 0.5053 0.3564 0.0359 0.7377 0.0302 ... > head(Y) # GIVE ME THE FIRST 6 [1] 0.50525553 0.35636648 0.03589792 • 0.73766891 0.03020607 0.50628327 – > tail(Y) # GIVE ME THE LAST 6 [1] 0.6612501 0.9930194 0.8392855 – 0.5459498 0.2587155 0.3704778 – > dim(Y) # NOPE! HE IS A VECTOR NULL – > length(Y) [1] 200
  • 18. USING INDEXES • > b [,1] [,2] • [1,] 1 3 • [2,] 2 4 > b[,1] # ALL ROWS, FIRST COLUMN • [1] 1 2 • > b[2,] # SECOND ROW, ALL COLUMNS [1] 2 4 > head(unemp) rank region Aug. 2012 Sept. 201 change > b[-1,] # ALL ROWS EXCEPT 1 14 14 alabama 8.5 8.3 -0.2 [1] 2 4 15 14 alaska 7.7 7.5 -0.2 27 27 arizona 8.3 8.2 -0.1 16 14 arkansas 7.3 7.1 -0.2 > b>3 2 2 california 10.6 10.2 -0.4 [,1] [,2] 17 14 colorado 8.2 8.0 -0.2 [1,] FALSE FALSE [2,] FALSE TRUE > unemp[unemp[4]>10,] rank region Aug. 2012 Sept. 201 change 2 2 california 10.6 10.2 -0.4 > GOOG[GOOG[,6]>768.00,6] 11 6 nevada 12.1 11.8 -0.3 GOOG.Adjusted 24 14 rhode island 10.7 10.5 -0.2 2012-10-04 768.05
  • 19. USE NAMES INSTEAD: $ > X <- 1 • > X$name <- "Fred" Warning message: In X$name <- "Fred" : Coercing LHS to a list > X$occupation <- "Doctor" > X$age <- 21 – > name <- c("Fred", "Bill") > X > occupation <- c("Doctor", "Dancer") [[1]] > people <- data.frame(name, occupation) [1] 1 > people $name name occupation [1] "Fred" 1 Fred Doctor 2 Bill Dancer $occupation > people$age <- 35 [1] "Doctor" > people name occupation age $age 1 Fred Doctor 35 [1] 21 2 Bill Dancer 35 people[people$name=="Fred",]$age=40 > X$name == X[2]
  • 22. SPEAK LIKE A NATIVE • > mymatrix <- matrix(rep(seq(2,6,by=2), 3), ncol = 3) – • > mymatrix [,1] [,2] [,3] – [1,] 2 2 2 • [2,] 4 4 4 [3,] 6 6 6 – • > apply(mymatrix, 1, sum) [1] 6 12 18 – • > apply(mymatrix, 2, sum) [1] 12 12 12
  • 23. LAPPLY • > lapply(mymatrix[,1],sum) [[1]] [1] 2 [[2]] • [1] 4 [[3]] [1] 6 – – > sapply(mymatrix[,1],sum) – [1] 2 4 6
  • 24. LONG TO WIDE Language Skill Users 1 R High 10 2 R Med 10 3 R Low 10 4 SAS High 1 5 SAS Med 25 6 SAS Low 20 Language Users.High Users.Med Users.Low 1 R 10 10 10 4 SAS 1 25 20
  • 25. RESHAPE GYMNASTICS > df <- • data.frame(c("R","R","R","SAS","SAS","SAS"), c("High","Med","Low","High","Med","Low"), c(10,10,10,1,25,20)); colnames(df) <- – c("Language","Skill","Users") > df Language Skill Users 1 R High 10 2 R Med 10 – 3 R Low 10 4 SAS High 1 • 5 SAS Med 25 6 SAS Low 20 > reshape(df, idvar="Language", timevar="Skill", direction="wide") Language Users.High Users.Med Users.Low 1 R 10 10 10 4 SAS 1 25 20 > reshape(df2, direction="long") Language Skill Users.High R.High R High 10 SAS.High SAS High 1 R.Med R Med 10 SAS.Med SAS Med 25 R.Low R Low 10 SAS.Low SAS Low 20 > df3[order(df3$Language),]
  • 26. NEW VARIABLES IN-PLACE > head(mtcars)[1] • Mazda RX4 mpg 21.0 Mazda RX4 Wag 21.0 Datsun 710 22.8 – Hornet 4 Drive 21.4 Hornet Sportabout 18.7 Valiant 18.1 – > head(with(mtcars, mpg*10)) # NEW VECTOR • [1] 210 210 228 214 187 181 – > head(transform(mtcars, electricdreams=mpg*10))[c(1,12)] mpg electricdreams – Mazda RX4 21.0 210 Mazda RX4 Wag 21.0 210 – Datsun 710 22.8 228 Hornet 4 Drive 21.4 214 Hornet Sportabout 18.7 187 Valiant 18.1 181
  • 27. MASH UP > head(unemp) region rank Aug. 2012 Sept. 201 change DEV state_code State Abbreviation 1 alabama 14 8.5 8.3 -0.2 0.4 01 AL 2 alaska 14 7.7 7.5 -0.2 -0.4 02 AK 3 arizona 27 8.3 8.2 -0.1 0.3 04 AZ 4 arkansas 14 7.3 7.1 -0.2 -0.8 05 AR 5 california 2 10.6 10.2 -0.4 2.3 06 CA 6 colorado 14 8.2 8.0 -0.2 0.1 08 CO