SlideShare a Scribd company logo
1 of 8
Download to read offline
Cleaning data
CCrause
Exploring the enhanced tidyr version 1.0.0
First, import the raw data. I used weather data. Upon first inspection I wanted to change a
couple of things:
• Rows are stored as variables: X1 - X31 represent days of the month
• Variable names are stored as rows. Max and mean temperature are variables and
should ideally be represented in their own column
• The first column called X is redundant. X1 - X31 already capture the data for every
day of the month, so I’ll remove it and change all column names to lower case
library(tidyverse)
# Read the raw weather data from the RDS file and preview its columns and types.
weather_raw <- readr::read_rds("../weather.rds")
glimpse(weather_raw)
## Observations: 286
## Variables: 35
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ...
## $ month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12...
## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat...
## $ X1 <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ...
## $ X2 <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ...
## $ X3 <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",...
## $ X4 <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ...
## $ X5 <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ...
## $ X6 <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",...
## $ X7 <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ...
## $ X8 <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "...
## $ X9 <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",...
## $ X10 <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",...
## $ X11 <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ...
## $ X12 <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ...
## $ X13 <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ...
## $ X14 <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ...
## $ X15 <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ...
## $ X16 <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ...
## $ X17 <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",...
## $ X18 <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ...
## $ X19 <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ...
## $ X20 <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ...
## $ X21 <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ...
## $ X22 <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ...
## $ X23 <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",...
## $ X24 <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",...
## $ X25 <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",...
## $ X26 <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ...
## $ X27 <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ...
## $ X28 <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ...
## $ X29 <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ...
## $ X30 <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "...
## $ X31 <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30...
# Build the working tibble: drop the X column and lower-case every column name.
weather_tbl <- weather_raw %>%
  as_tibble() %>%
  select(-X) %>%
  set_names(tolower(names(.)))
glimpse(weather_tbl)
## Observations: 286
## Variables: 34
## $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ...
## $ month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12...
## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat...
## $ x1 <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ...
## $ x2 <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ...
## $ x3 <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",...
## $ x4 <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ...
## $ x5 <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ...
## $ x6 <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",...
## $ x7 <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ...
## $ x8 <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "...
## $ x9 <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",...
## $ x10 <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",...
## $ x11 <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ...
## $ x12 <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ...
## $ x13 <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ...
## $ x14 <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ...
## $ x15 <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ...
## $ x16 <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ...
## $ x17 <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",...
## $ x18 <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ...
## $ x19 <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ...
## $ x20 <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ...
## $ x21 <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ...
## $ x22 <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ...
## $ x23 <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",...
## $ x24 <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",...
## $ x25 <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",...
## $ x26 <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ...
## $ x27 <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ...
## $ x28 <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ...
## $ x29 <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ...
## $ x30 <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "...
## $ x31 <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30...
Making wide datasets long
All the columns that start with the letter ‘X’ represent days of the month so there are really
just 2 variables: the day and then the measurement
# Stack every column whose name starts with "x" into a name/value pair:
# the former column name goes to `day`, the cell value to `measurement`.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement"
  )
## # A tibble: 8,866 x 5
## year month measure day measurement
## <int> <int> <chr> <chr> <chr>
## 1 2014 12 Max.TemperatureF x1 64
## 2 2014 12 Max.TemperatureF x2 42
## 3 2014 12 Max.TemperatureF x3 51
## 4 2014 12 Max.TemperatureF x4 43
## 5 2014 12 Max.TemperatureF x5 42
## 6 2014 12 Max.TemperatureF x6 45
## 7 2014 12 Max.TemperatureF x7 38
## 8 2014 12 Max.TemperatureF x8 29
## 9 2014 12 Max.TemperatureF x9 49
## 10 2014 12 Max.TemperatureF x10 48
## # ... with 8,856 more rows
Additional tweaks
This worked very well, but I don’t like the x in front of the day, because it forces the column
type to take on a character value. You can remove the ‘prefix’ and then convert the type
very easily by adding two additional arguments, namely names_prefix and names_ptypes. I
also dropped all the NA values.
# Reshape the day columns to long form with three refinements:
#   * values_drop_na = TRUE drops rows whose measurement is NA (spelled out,
#     because `T` is an ordinary variable that can be reassigned);
#   * names_prefix strips the leading "x" from each former column name;
#   * names_ptypes requests the resulting `day` column as integer.
# NOTE(review): this relies on tidyr 1.0.0 behaviour. Later tidyr releases
# reportedly only type-check via names_ptypes and convert with
# names_transform = list(day = as.integer) -- confirm against the installed
# version.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement",
    values_drop_na = TRUE,
    names_prefix = "x",
    names_ptypes = list(day = integer())
  )
## # A tibble: 8,046 x 5
## year month measure day measurement
## <int> <int> <chr> <int> <chr>
## 1 2014 12 Max.TemperatureF 1 64
## 2 2014 12 Max.TemperatureF 2 42
## 3 2014 12 Max.TemperatureF 3 51
## 4 2014 12 Max.TemperatureF 4 43
## 5 2014 12 Max.TemperatureF 5 42
## 6 2014 12 Max.TemperatureF 6 45
## 7 2014 12 Max.TemperatureF 7 38
## 8 2014 12 Max.TemperatureF 8 29
## 9 2014 12 Max.TemperatureF 9 49
## 10 2014 12 Max.TemperatureF 10 48
## # ... with 8,036 more rows
Making the dataset wider
The column “measure” contains different variables that would be better displayed in their
own columns! Enter pivot_wider: grab the new column names from the “measure” column
and the values from the “measurement” column.
# Step 1: stack the day columns into `day` / `measurement`, dropping NA
# measurements and stripping the "x" prefix so `day` can be an integer.
# Step 2: spread each distinct value of `measure` into its own column, filled
# from `measurement`.
# `TRUE` is written out because `T` is a reassignable variable, not a keyword.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement",
    values_drop_na = TRUE,
    names_prefix = "x",
    names_ptypes = list(day = integer())
  ) %>%
  pivot_wider(names_from = measure, values_from = measurement)
## # A tibble: 366 x 25
## year month day Max.TemperatureF Mean.Temperatur~ Min.TemperatureF
## <int> <int> <int> <chr> <chr> <chr>
## 1 2014 12 1 64 52 39
## 2 2014 12 2 42 38 33
## 3 2014 12 3 51 44 37
## 4 2014 12 4 43 37 30
## 5 2014 12 5 42 34 26
## 6 2014 12 6 45 42 38
## 7 2014 12 7 38 30 21
## 8 2014 12 8 29 24 18
## 9 2014 12 9 49 39 29
## 10 2014 12 10 48 43 38
## # ... with 356 more rows, and 19 more variables: Max.Dew.PointF <chr>,
## # MeanDew.PointF <chr>, Min.DewpointF <chr>, Max.Humidity <chr>,
## # Mean.Humidity <chr>, Min.Humidity <chr>,
## # Max.Sea.Level.PressureIn <chr>, Mean.Sea.Level.PressureIn <chr>,
## # Min.Sea.Level.PressureIn <chr>, Max.VisibilityMiles <chr>,
## # Mean.VisibilityMiles <chr>, Min.VisibilityMiles <chr>,
## # Max.Wind.SpeedMPH <chr>, Mean.Wind.SpeedMPH <chr>,
## # Max.Gust.SpeedMPH <chr>, PrecipitationIn <chr>, CloudCover <chr>,
## # Events <chr>, WindDirDegrees <chr>
Finishing touches
This is almost done! I would like to combine three columns namely year, month and day
into a column called date. Let’s use the unite function
# Long-then-wide reshape, followed by unite() to paste year, month and day
# into a single `date` column separated by "/".
# `TRUE` is written out because `T` is a reassignable variable, not a keyword.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement",
    values_drop_na = TRUE,
    names_prefix = "x",
    names_ptypes = list(day = integer())
  ) %>%
  pivot_wider(names_from = measure, values_from = measurement) %>%
  unite(date, year, month, day, sep = "/")
## # A tibble: 366 x 23
## date Max.TemperatureF Mean.Temperatur~ Min.TemperatureF Max.Dew.PointF
## <chr> <chr> <chr> <chr> <chr>
## 1 2014~ 64 52 39 46
## 2 2014~ 42 38 33 40
## 3 2014~ 51 44 37 49
## 4 2014~ 43 37 30 24
## 5 2014~ 42 34 26 37
## 6 2014~ 45 42 38 45
## 7 2014~ 38 30 21 36
## 8 2014~ 29 24 18 28
## 9 2014~ 49 39 29 49
## 10 2014~ 48 43 38 45
## # ... with 356 more rows, and 18 more variables: MeanDew.PointF <chr>,
## # Min.DewpointF <chr>, Max.Humidity <chr>, Mean.Humidity <chr>,
## # Min.Humidity <chr>, Max.Sea.Level.PressureIn <chr>,
## # Mean.Sea.Level.PressureIn <chr>, Min.Sea.Level.PressureIn <chr>,
## # Max.VisibilityMiles <chr>, Mean.VisibilityMiles <chr>,
## # Min.VisibilityMiles <chr>, Max.Wind.SpeedMPH <chr>,
## # Mean.Wind.SpeedMPH <chr>, Max.Gust.SpeedMPH <chr>,
## # PrecipitationIn <chr>, CloudCover <chr>, Events <chr>,
## # WindDirDegrees <chr>
Type conversions
Lastly I can see that a lot of columns contain numeric data, but are stored as text. Every
column except date and events should be converted to numeric.
# Same reshape + unite pipeline, piped into glimpse() to inspect the column
# types before converting them.
# `TRUE` is written out because `T` is a reassignable variable, not a keyword.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement",
    values_drop_na = TRUE,
    names_prefix = "x",
    names_ptypes = list(day = integer())
  ) %>%
  pivot_wider(names_from = measure, values_from = measurement) %>%
  unite(date, year, month, day, sep = "/") %>%
  glimpse()
## Observations: 366
## Variables: 23
## $ date <chr> "2014/12/1", "2014/12/2", "2014/12/3...
## $ Max.TemperatureF <chr> "64", "42", "51", "43", "42", "45", ...
## $ Mean.TemperatureF <chr> "52", "38", "44", "37", "34", "42", ...
## $ Min.TemperatureF <chr> "39", "33", "37", "30", "26", "38", ...
## $ Max.Dew.PointF <chr> "46", "40", "49", "24", "37", "45", ...
## $ MeanDew.PointF <chr> "40", "27", "42", "21", "25", "40", ...
## $ Min.DewpointF <chr> "26", "17", "24", "13", "12", "36", ...
## $ Max.Humidity <chr> "74", "92", "100", "69", "85", "100"...
## $ Mean.Humidity <chr> "63", "72", "79", "54", "66", "93", ...
## $ Min.Humidity <chr> "52", "51", "57", "39", "47", "85", ...
## $ Max.Sea.Level.PressureIn <chr> "30.45", "30.71", "30.4", "30.56", "...
## $ Mean.Sea.Level.PressureIn <chr> "30.13", "30.59", "30.07", "30.33", ...
## $ Min.Sea.Level.PressureIn <chr> "30.01", "30.4", "29.87", "30.09", "...
## $ Max.VisibilityMiles <chr> "10", "10", "10", "10", "10", "10", ...
## $ Mean.VisibilityMiles <chr> "10", "8", "5", "10", "10", "4", "10...
## $ Min.VisibilityMiles <chr> "10", "2", "1", "10", "5", "0", "5",...
## $ Max.Wind.SpeedMPH <chr> "22", "24", "29", "25", "22", "22", ...
## $ Mean.Wind.SpeedMPH <chr> "13", "15", "12", "12", "10", "8", "...
## $ Max.Gust.SpeedMPH <chr> "29", "29", "38", "33", "26", "25", ...
## $ PrecipitationIn <chr> "0.01", "0.10", "0.44", "0.00", "0.1...
## $ CloudCover <chr> "6", "7", "8", "3", "5", "8", "6", "...
## $ Events <chr> "Rain", "Rain-Snow", "Rain", "", "Ra...
## $ WindDirDegrees <chr> "268", "62", "254", "292", "61", "31...
# Reshape, build the `date` column, then convert the measurement columns from
# text to numeric.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement",
    # Written out because `T` is a reassignable variable, not a keyword.
    values_drop_na = TRUE,
    names_prefix = "x",
    names_ptypes = list(day = integer())
  ) %>%
  pivot_wider(names_from = measure, values_from = measurement) %>%
  unite(date, year, month, day, sep = "/") %>%
  # Move the two text columns to the front so the columns to convert form one
  # contiguous range, Max.TemperatureF through WindDirDegrees.
  select(date, Events, everything()) %>%
  # Pass the function itself; the funs() wrapper is deprecated in dplyr.
  mutate_at(vars(Max.TemperatureF:WindDirDegrees), as.numeric)
## # A tibble: 366 x 23
## date Events Max.TemperatureF Mean.Temperatur~ Min.TemperatureF
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 2014~ Rain 64 52 39
## 2 2014~ Rain-~ 42 38 33
## 3 2014~ Rain 51 44 37
## 4 2014~ "" 43 37 30
## 5 2014~ Rain 42 34 26
## 6 2014~ Rain 45 42 38
## 7 2014~ Rain 38 30 21
## 8 2014~ Snow 29 24 18
## 9 2014~ Rain 49 39 29
## 10 2014~ Rain 48 43 38
## # ... with 356 more rows, and 18 more variables: Max.Dew.PointF <dbl>,
## # MeanDew.PointF <dbl>, Min.DewpointF <dbl>, Max.Humidity <dbl>,
## # Mean.Humidity <dbl>, Min.Humidity <dbl>,
## # Max.Sea.Level.PressureIn <dbl>, Mean.Sea.Level.PressureIn <dbl>,
## # Min.Sea.Level.PressureIn <dbl>, Max.VisibilityMiles <dbl>,
## # Mean.VisibilityMiles <dbl>, Min.VisibilityMiles <dbl>,
## # Max.Wind.SpeedMPH <dbl>, Mean.Wind.SpeedMPH <dbl>,
## # Max.Gust.SpeedMPH <dbl>, PrecipitationIn <dbl>, CloudCover <dbl>,
## # WindDirDegrees <dbl>
Convert the date column from text to date
Magrittr (the pipe ‘%>%’) makes it super easy to chain many different functions into one
another. It also makes it very easy to follow someone’s train of thought. It improves the
readability of code and, hence, makes it much easier to debug incorrect code!
# Full cleaning pipeline: reshape, build `date`, convert the measurement
# columns to numeric, and finally parse `date` from text into a Date.
weather_tbl %>%
  pivot_longer(
    cols = starts_with("x"),
    names_to = "day",
    values_to = "measurement",
    # Written out because `T` is a reassignable variable, not a keyword.
    values_drop_na = TRUE,
    names_prefix = "x",
    names_ptypes = list(day = integer())
  ) %>%
  pivot_wider(names_from = measure, values_from = measurement) %>%
  unite(date, year, month, day, sep = "/") %>%
  # Move the two text columns to the front so the columns to convert form one
  # contiguous range, Max.TemperatureF through WindDirDegrees.
  select(date, Events, everything()) %>%
  # Pass the function itself; the funs() wrapper is deprecated in dplyr.
  mutate_at(vars(Max.TemperatureF:WindDirDegrees), as.numeric) %>%
  # A single column needs no scoped verb; as.Date() is applied to the
  # "year/month/day" strings produced by unite() above.
  mutate(date = as.Date(date))
## # A tibble: 366 x 23
## date Events Max.TemperatureF Mean.Temperatur~ Min.TemperatureF
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2014-12-01 Rain 64 52 39
## 2 2014-12-02 Rain-~ 42 38 33
## 3 2014-12-03 Rain 51 44 37
## 4 2014-12-04 "" 43 37 30
## 5 2014-12-05 Rain 42 34 26
## 6 2014-12-06 Rain 45 42 38
## 7 2014-12-07 Rain 38 30 21
## 8 2014-12-08 Snow 29 24 18
## 9 2014-12-09 Rain 49 39 29
## 10 2014-12-10 Rain 48 43 38
## # ... with 356 more rows, and 18 more variables: Max.Dew.PointF <dbl>,
## # MeanDew.PointF <dbl>, Min.DewpointF <dbl>, Max.Humidity <dbl>,
## # Mean.Humidity <dbl>, Min.Humidity <dbl>,
## # Max.Sea.Level.PressureIn <dbl>, Mean.Sea.Level.PressureIn <dbl>,
## # Min.Sea.Level.PressureIn <dbl>, Max.VisibilityMiles <dbl>,
## # Mean.VisibilityMiles <dbl>, Min.VisibilityMiles <dbl>,
## # Max.Wind.SpeedMPH <dbl>, Mean.Wind.SpeedMPH <dbl>,
## # Max.Gust.SpeedMPH <dbl>, PrecipitationIn <dbl>, CloudCover <dbl>,
## # WindDirDegrees <dbl>

More Related Content

What's hot

第3回 データフレームの基本操作 その1(解答付き)
第3回 データフレームの基本操作 その1(解答付き)第3回 データフレームの基本操作 その1(解答付き)
第3回 データフレームの基本操作 その1(解答付き)Wataru Shito
 
Clustering and Visualisation using R programming
Clustering and Visualisation using R programmingClustering and Visualisation using R programming
Clustering and Visualisation using R programmingNixon Mendez
 
第6回 関数とフロー制御
第6回 関数とフロー制御第6回 関数とフロー制御
第6回 関数とフロー制御Wataru Shito
 
Raspberry Pi à la GroovyFX
Raspberry Pi à la GroovyFXRaspberry Pi à la GroovyFX
Raspberry Pi à la GroovyFXStephen Chin
 
第3回 データフレームの基本操作 その1
第3回 データフレームの基本操作 その1第3回 データフレームの基本操作 その1
第3回 データフレームの基本操作 その1Wataru Shito
 

What's hot (6)

Python 1
Python 1Python 1
Python 1
 
第3回 データフレームの基本操作 その1(解答付き)
第3回 データフレームの基本操作 その1(解答付き)第3回 データフレームの基本操作 その1(解答付き)
第3回 データフレームの基本操作 その1(解答付き)
 
Clustering and Visualisation using R programming
Clustering and Visualisation using R programmingClustering and Visualisation using R programming
Clustering and Visualisation using R programming
 
第6回 関数とフロー制御
第6回 関数とフロー制御第6回 関数とフロー制御
第6回 関数とフロー制御
 
Raspberry Pi à la GroovyFX
Raspberry Pi à la GroovyFXRaspberry Pi à la GroovyFX
Raspberry Pi à la GroovyFX
 
第3回 データフレームの基本操作 その1
第3回 データフレームの基本操作 その1第3回 データフレームの基本操作 その1
第3回 データフレームの基本操作 その1
 

Similar to Wrangling data the tidy way with the tidyverse

Time Series Analysis and Mining with R
Time Series Analysis and Mining with RTime Series Analysis and Mining with R
Time Series Analysis and Mining with RYanchang Zhao
 
visualisasi data praktik pakai excel, py
visualisasi data praktik pakai excel, pyvisualisasi data praktik pakai excel, py
visualisasi data praktik pakai excel, pyElmaLyrics
 
Data manipulation and visualization in r 20190711 myanmarucsy
Data manipulation and visualization in r 20190711 myanmarucsyData manipulation and visualization in r 20190711 myanmarucsy
Data manipulation and visualization in r 20190711 myanmarucsySmartHinJ
 
Handling missing data and outliers
Handling missing data and outliersHandling missing data and outliers
Handling missing data and outliersCasper Crause
 
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPlotly
 
Table of Useful R commands.
Table of Useful R commands.Table of Useful R commands.
Table of Useful R commands.Dr. Volkan OBAN
 
Introduction to python programming 1
Introduction to python programming   1Introduction to python programming   1
Introduction to python programming 1Giovanni Della Lunga
 
The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210Mahmoud Samir Fayed
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonMoses Boudourides
 
Text Analysis with Machine Learning
Text Analysis with Machine LearningText Analysis with Machine Learning
Text Analysis with Machine LearningTuri, Inc.
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadiesAlicia Pérez
 
A quick introduction to R
A quick introduction to RA quick introduction to R
A quick introduction to RAngshuman Saha
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出すTakashi Kitano
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Pythonpugpe
 
r studio presentation.pptx
r studio presentation.pptxr studio presentation.pptx
r studio presentation.pptxDevikaRaj14
 
r studio presentation.pptx
r studio presentation.pptxr studio presentation.pptx
r studio presentation.pptxDevikaRaj14
 
Bigger Data v Better Math
Bigger Data v Better MathBigger Data v Better Math
Bigger Data v Better MathBrent Schneeman
 

Similar to Wrangling data the tidy way with the tidyverse (20)

dplyr
dplyrdplyr
dplyr
 
Time Series Analysis and Mining with R
Time Series Analysis and Mining with RTime Series Analysis and Mining with R
Time Series Analysis and Mining with R
 
visualisasi data praktik pakai excel, py
visualisasi data praktik pakai excel, pyvisualisasi data praktik pakai excel, py
visualisasi data praktik pakai excel, py
 
Data manipulation and visualization in r 20190711 myanmarucsy
Data manipulation and visualization in r 20190711 myanmarucsyData manipulation and visualization in r 20190711 myanmarucsy
Data manipulation and visualization in r 20190711 myanmarucsy
 
Handling missing data and outliers
Handling missing data and outliersHandling missing data and outliers
Handling missing data and outliers
 
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
 
Table of Useful R commands.
Table of Useful R commands.Table of Useful R commands.
Table of Useful R commands.
 
Introduction to python programming 1
Introduction to python programming   1Introduction to python programming   1
Introduction to python programming 1
 
The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την Python
 
Text Analysis with Machine Learning
Text Analysis with Machine LearningText Analysis with Machine Learning
Text Analysis with Machine Learning
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadies
 
A quick introduction to R
A quick introduction to RA quick introduction to R
A quick introduction to R
 
Time Series.pptx
Time Series.pptxTime Series.pptx
Time Series.pptx
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出す
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
 
r studio presentation.pptx
r studio presentation.pptxr studio presentation.pptx
r studio presentation.pptx
 
r studio presentation.pptx
r studio presentation.pptxr studio presentation.pptx
r studio presentation.pptx
 
Introduction to tibbles
Introduction to tibblesIntroduction to tibbles
Introduction to tibbles
 
Bigger Data v Better Math
Bigger Data v Better MathBigger Data v Better Math
Bigger Data v Better Math
 

More from Casper Crause

Integrating R and Power BI
Integrating R and Power BIIntegrating R and Power BI
Integrating R and Power BICasper Crause
 
Company segmentation - an approach with R
Company segmentation - an approach with RCompany segmentation - an approach with R
Company segmentation - an approach with RCasper Crause
 
How to read multiple excel files - With R
How to read  multiple excel files  - With RHow to read  multiple excel files  - With R
How to read multiple excel files - With RCasper Crause
 
Storytelling By Visualization
Storytelling By Visualization Storytelling By Visualization
Storytelling By Visualization Casper Crause
 
Comparing Co2 Emissions Around The Globe
Comparing Co2 Emissions Around The GlobeComparing Co2 Emissions Around The Globe
Comparing Co2 Emissions Around The GlobeCasper Crause
 
Understanding control-flow
Understanding control-flowUnderstanding control-flow
Understanding control-flowCasper Crause
 
Levelling up your chart skills
Levelling up your chart skillsLevelling up your chart skills
Levelling up your chart skillsCasper Crause
 
Project portfolio for Casper Crause
Project portfolio for Casper CrauseProject portfolio for Casper Crause
Project portfolio for Casper CrauseCasper Crause
 

More from Casper Crause (8)

Integrating R and Power BI
Integrating R and Power BIIntegrating R and Power BI
Integrating R and Power BI
 
Company segmentation - an approach with R
Company segmentation - an approach with RCompany segmentation - an approach with R
Company segmentation - an approach with R
 
How to read multiple excel files - With R
How to read  multiple excel files  - With RHow to read  multiple excel files  - With R
How to read multiple excel files - With R
 
Storytelling By Visualization
Storytelling By Visualization Storytelling By Visualization
Storytelling By Visualization
 
Comparing Co2 Emissions Around The Globe
Comparing Co2 Emissions Around The GlobeComparing Co2 Emissions Around The Globe
Comparing Co2 Emissions Around The Globe
 
Understanding control-flow
Understanding control-flowUnderstanding control-flow
Understanding control-flow
 
Levelling up your chart skills
Levelling up your chart skillsLevelling up your chart skills
Levelling up your chart skills
 
Project portfolio for Casper Crause
Project portfolio for Casper CrauseProject portfolio for Casper Crause
Project portfolio for Casper Crause
 

Recently uploaded

April 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's AnalysisApril 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's Analysismanisha194592
 
Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...
Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...
Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...amitlee9823
 
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% SecureCall me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% SecurePooja Nehwal
 
Edukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFxEdukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFxolyaivanovalion
 
Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...
Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...
Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...amitlee9823
 
BDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort Service
BDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort ServiceBDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort Service
BDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort ServiceDelhi Call girls
 
Midocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxMidocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxolyaivanovalion
 
FESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdfFESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdfMarinCaroMartnezBerg
 
Introduction-to-Machine-Learning (1).pptx
Introduction-to-Machine-Learning (1).pptxIntroduction-to-Machine-Learning (1).pptx
Introduction-to-Machine-Learning (1).pptxfirstjob4
 
Generative AI on Enterprise Cloud with NiFi and Milvus
Generative AI on Enterprise Cloud with NiFi and MilvusGenerative AI on Enterprise Cloud with NiFi and Milvus
Generative AI on Enterprise Cloud with NiFi and MilvusTimothy Spann
 
Data-Analysis for Chicago Crime Data 2023
Data-Analysis for Chicago Crime Data  2023Data-Analysis for Chicago Crime Data  2023
Data-Analysis for Chicago Crime Data 2023ymrp368
 
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...Valters Lauzums
 
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Delhi Call girls
 
Ravak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptxRavak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptxolyaivanovalion
 
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 nightCheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 nightDelhi Call girls
 
Halmar dropshipping via API with DroFx
Halmar  dropshipping  via API with DroFxHalmar  dropshipping  via API with DroFx
Halmar dropshipping via API with DroFxolyaivanovalion
 
CebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxCebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxolyaivanovalion
 
BabyOno dropshipping via API with DroFx.pptx
BabyOno dropshipping via API with DroFx.pptxBabyOno dropshipping via API with DroFx.pptx
BabyOno dropshipping via API with DroFx.pptxolyaivanovalion
 
Zuja dropshipping via API with DroFx.pptx
Zuja dropshipping via API with DroFx.pptxZuja dropshipping via API with DroFx.pptx
Zuja dropshipping via API with DroFx.pptxolyaivanovalion
 

Recently uploaded (20)

April 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's AnalysisApril 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's Analysis
 
Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...
Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...
Call Girls Hsr Layout Just Call 👗 7737669865 👗 Top Class Call Girl Service Ba...
 
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% SecureCall me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
 
Edukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFxEdukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFx
 
Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...
Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...
Chintamani Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore ...
 
BDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort Service
BDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort ServiceBDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort Service
BDSM⚡Call Girls in Mandawali Delhi >༒8448380779 Escort Service
 
Midocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxMidocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFx
 
FESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdfFESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdf
 
Sampling (random) method and Non random.ppt
Sampling (random) method and Non random.pptSampling (random) method and Non random.ppt
Sampling (random) method and Non random.ppt
 
Introduction-to-Machine-Learning (1).pptx
Introduction-to-Machine-Learning (1).pptxIntroduction-to-Machine-Learning (1).pptx
Introduction-to-Machine-Learning (1).pptx
 
Generative AI on Enterprise Cloud with NiFi and Milvus
Generative AI on Enterprise Cloud with NiFi and MilvusGenerative AI on Enterprise Cloud with NiFi and Milvus
Generative AI on Enterprise Cloud with NiFi and Milvus
 
Data-Analysis for Chicago Crime Data 2023
Data-Analysis for Chicago Crime Data  2023Data-Analysis for Chicago Crime Data  2023
Data-Analysis for Chicago Crime Data 2023
 
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
 
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
 
Ravak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptxRavak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptx
 
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 nightCheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
 
Halmar dropshipping via API with DroFx
Halmar  dropshipping  via API with DroFxHalmar  dropshipping  via API with DroFx
Halmar dropshipping via API with DroFx
 
CebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxCebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptx
 
BabyOno dropshipping via API with DroFx.pptx
BabyOno dropshipping via API with DroFx.pptxBabyOno dropshipping via API with DroFx.pptx
BabyOno dropshipping via API with DroFx.pptx
 
Zuja dropshipping via API with DroFx.pptx
Zuja dropshipping via API with DroFx.pptxZuja dropshipping via API with DroFx.pptx
Zuja dropshipping via API with DroFx.pptx
 

Wrangling data the tidy way with the tidyverse

  • 1. Cleaning data CCrause Exploring the enhanced tidyr version 1.0.0 First, import the raw data. I used weather data. Upon first inspection I wanted to change a couple of things: • Rows are stored as variables: X1 - X31 represent days of the month • Variable names are stored as rows. Max and mean temperature are variables and should ideally be represented in their own column • The first column, called X, is redundant. X1 - X31 already capture the data for every day of the month, so I’ll remove it and change all column names to lower case
  • 2. library(tidyverse) weather_raw =readr::read_rds('../weather.rds') weather_raw %>% glimpse() ## Observations: 286 ## Variables: 35 ## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,... ## $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ... ## $ month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12... ## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat... ## $ X1 <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ... ## $ X2 <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ... ## $ X3 <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",... ## $ X4 <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ... ## $ X5 <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ... ## $ X6 <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",... ## $ X7 <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ... ## $ X8 <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "... ## $ X9 <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",... ## $ X10 <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",... ## $ X11 <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ... ## $ X12 <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ... ## $ X13 <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ... ## $ X14 <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ... ## $ X15 <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ... ## $ X16 <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ... ## $ X17 <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",... ## $ X18 <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ... ## $ X19 <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ... ## $ X20 <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ... ## $ X21 <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ... 
## $ X22 <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ... ## $ X23 <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",... ## $ X24 <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",... ## $ X25 <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",... ## $ X26 <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ... ## $ X27 <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ... ## $ X28 <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ... ## $ X29 <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ... ## $ X30 <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "... ## $ X31 <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30...
  • 3. weather_tbl =weather_raw %>% as_tibble() %>% select(-X) %>% set_names(names(.) %>% tolower) weather_tbl %>% glimpse() ## Observations: 286 ## Variables: 34 ## $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, ... ## $ month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12... ## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperat... ## $ x1 <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", ... ## $ x2 <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", ... ## $ x3 <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57",... ## $ x4 <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", ... ## $ x5 <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", ... ## $ x6 <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85",... ## $ x7 <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", ... ## $ x8 <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "... ## $ x9 <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86",... ## $ x10 <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89",... ## $ x11 <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", ... ## $ x12 <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", ... ## $ x13 <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", ... ## $ x14 <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", ... ## $ x15 <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", ... ## $ x16 <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", ... ## $ x17 <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70",... ## $ x18 <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", ... ## $ x19 <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", ... ## $ x20 <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", ... ## $ x21 <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", ... ## $ x22 <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", ... 
## $ x23 <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82",... ## $ x24 <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96",... ## $ x25 <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49",... ## $ x26 <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", ... ## $ x27 <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", ... ## $ x28 <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", ... ## $ x29 <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", ... ## $ x30 <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "... ## $ x31 <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30... Making wide datasets long All the columns that start with the letter ‘X’ represent days of the month so there are really just 2 variables: the day and then the measurement weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement')
  • 4. ## # A tibble: 8,866 x 5 ## year month measure day measurement ## <int> <int> <chr> <chr> <chr> ## 1 2014 12 Max.TemperatureF x1 64 ## 2 2014 12 Max.TemperatureF x2 42 ## 3 2014 12 Max.TemperatureF x3 51 ## 4 2014 12 Max.TemperatureF x4 43 ## 5 2014 12 Max.TemperatureF x5 42 ## 6 2014 12 Max.TemperatureF x6 45 ## 7 2014 12 Max.TemperatureF x7 38 ## 8 2014 12 Max.TemperatureF x8 29 ## 9 2014 12 Max.TemperatureF x9 49 ## 10 2014 12 Max.TemperatureF x10 48 ## # ... with 8,856 more rows Additional tweaks This worked very well, but I don't like the x in front of the day, because it forces the column type to take on a character value. You can remove the ‘prefix’ and then convert the type very easily by adding two additional arguments, namely names_prefix and names_ptypes. I also dropped all the NA values. weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement', values_drop_na = T, names_prefix = 'x', names_ptypes = list(day = integer())) ## # A tibble: 8,046 x 5 ## year month measure day measurement ## <int> <int> <chr> <int> <chr> ## 1 2014 12 Max.TemperatureF 1 64 ## 2 2014 12 Max.TemperatureF 2 42 ## 3 2014 12 Max.TemperatureF 3 51 ## 4 2014 12 Max.TemperatureF 4 43 ## 5 2014 12 Max.TemperatureF 5 42 ## 6 2014 12 Max.TemperatureF 6 45 ## 7 2014 12 Max.TemperatureF 7 38 ## 8 2014 12 Max.TemperatureF 8 29 ## 9 2014 12 Max.TemperatureF 9 49 ## 10 2014 12 Max.TemperatureF 10 48 ## # ... with 8,036 more rows Making the dataset wider The column “measure” contains different variables that would be better displayed in their own column! Enter pivot_wider So grab the new column names from the “measure” column and grab the values from the “measurement” column.
  • 5. weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement', values_drop_na = T, names_prefix = 'x', names_ptypes = list(day = integer())) %>% pivot_wider(names_from = measure,values_from = measurement) ## # A tibble: 366 x 25 ## year month day Max.TemperatureF Mean.Temperatur~ Min.TemperatureF ## <int> <int> <int> <chr> <chr> <chr> ## 1 2014 12 1 64 52 39 ## 2 2014 12 2 42 38 33 ## 3 2014 12 3 51 44 37 ## 4 2014 12 4 43 37 30 ## 5 2014 12 5 42 34 26 ## 6 2014 12 6 45 42 38 ## 7 2014 12 7 38 30 21 ## 8 2014 12 8 29 24 18 ## 9 2014 12 9 49 39 29 ## 10 2014 12 10 48 43 38 ## # ... with 356 more rows, and 19 more variables: Max.Dew.PointF <chr>, ## # MeanDew.PointF <chr>, Min.DewpointF <chr>, Max.Humidity <chr>, ## # Mean.Humidity <chr>, Min.Humidity <chr>, ## # Max.Sea.Level.PressureIn <chr>, Mean.Sea.Level.PressureIn <chr>, ## # Min.Sea.Level.PressureIn <chr>, Max.VisibilityMiles <chr>, ## # Mean.VisibilityMiles <chr>, Min.VisibilityMiles <chr>, ## # Max.Wind.SpeedMPH <chr>, Mean.Wind.SpeedMPH <chr>, ## # Max.Gust.SpeedMPH <chr>, PrecipitationIn <chr>, CloudCover <chr>, ## # Events <chr>, WindDirDegrees <chr> Finishing touches This is almost done! I would like to combine three columns namely year, month and day into a column called date. Let’s use the unite function weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement', values_drop_na = T, names_prefix = 'x', names_ptypes = list(day = integer())) %>% pivot_wider(names_from = measure,values_from = measurement) %>% unite(date, year, month,day, sep = '/')
  • 6. ## # A tibble: 366 x 23 ## date Max.TemperatureF Mean.Temperatur~ Min.TemperatureF Max.Dew.PointF ## <chr> <chr> <chr> <chr> <chr> ## 1 2014~ 64 52 39 46 ## 2 2014~ 42 38 33 40 ## 3 2014~ 51 44 37 49 ## 4 2014~ 43 37 30 24 ## 5 2014~ 42 34 26 37 ## 6 2014~ 45 42 38 45 ## 7 2014~ 38 30 21 36 ## 8 2014~ 29 24 18 28 ## 9 2014~ 49 39 29 49 ## 10 2014~ 48 43 38 45 ## # ... with 356 more rows, and 18 more variables: MeanDew.PointF <chr>, ## # Min.DewpointF <chr>, Max.Humidity <chr>, Mean.Humidity <chr>, ## # Min.Humidity <chr>, Max.Sea.Level.PressureIn <chr>, ## # Mean.Sea.Level.PressureIn <chr>, Min.Sea.Level.PressureIn <chr>, ## # Max.VisibilityMiles <chr>, Mean.VisibilityMiles <chr>, ## # Min.VisibilityMiles <chr>, Max.Wind.SpeedMPH <chr>, ## # Mean.Wind.SpeedMPH <chr>, Max.Gust.SpeedMPH <chr>, ## # PrecipitationIn <chr>, CloudCover <chr>, Events <chr>, ## # WindDirDegrees <chr> Type conversions Lastly I can see that a lot of columns contain numeric data, but are stored as text. Every column except date and events should be converted to numeric. weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement', values_drop_na = T, names_prefix = 'x', names_ptypes = list(day = integer())) %>% pivot_wider(names_from = measure,values_from = measurement) %>% unite(date, year, month,day, sep = '/') %>% glimpse() ## Observations: 366 ## Variables: 23 ## $ date <chr> "2014/12/1", "2014/12/2", "2014/12/3... ## $ Max.TemperatureF <chr> "64", "42", "51", "43", "42", "45", ... ## $ Mean.TemperatureF <chr> "52", "38", "44", "37", "34", "42", ... ## $ Min.TemperatureF <chr> "39", "33", "37", "30", "26", "38", ... ## $ Max.Dew.PointF <chr> "46", "40", "49", "24", "37", "45", ... ## $ MeanDew.PointF <chr> "40", "27", "42", "21", "25", "40", ... ## $ Min.DewpointF <chr> "26", "17", "24", "13", "12", "36", ... ## $ Max.Humidity <chr> "74", "92", "100", "69", "85", "100"... ## $ Mean.Humidity <chr> "63", "72", "79", "54", "66", "93", ... 
## $ Min.Humidity <chr> "52", "51", "57", "39", "47", "85", ... ## $ Max.Sea.Level.PressureIn <chr> "30.45", "30.71", "30.4", "30.56", "...
  • 7. ## $ Mean.Sea.Level.PressureIn <chr> "30.13", "30.59", "30.07", "30.33", ... ## $ Min.Sea.Level.PressureIn <chr> "30.01", "30.4", "29.87", "30.09", "... ## $ Max.VisibilityMiles <chr> "10", "10", "10", "10", "10", "10", ... ## $ Mean.VisibilityMiles <chr> "10", "8", "5", "10", "10", "4", "10... ## $ Min.VisibilityMiles <chr> "10", "2", "1", "10", "5", "0", "5",... ## $ Max.Wind.SpeedMPH <chr> "22", "24", "29", "25", "22", "22", ... ## $ Mean.Wind.SpeedMPH <chr> "13", "15", "12", "12", "10", "8", "... ## $ Max.Gust.SpeedMPH <chr> "29", "29", "38", "33", "26", "25", ... ## $ PrecipitationIn <chr> "0.01", "0.10", "0.44", "0.00", "0.1... ## $ CloudCover <chr> "6", "7", "8", "3", "5", "8", "6", "... ## $ Events <chr> "Rain", "Rain-Snow", "Rain", "", "Ra... ## $ WindDirDegrees <chr> "268", "62", "254", "292", "61", "31... weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement', values_drop_na = T, names_prefix = 'x', names_ptypes = list(day = integer())) %>% pivot_wider(names_from = measure,values_from = measurement) %>% unite(date, year, month,day, sep = '/') %>% select(date,Events, everything()) %>% mutate_at(vars(Max.TemperatureF:WindDirDegrees), funs(as.numeric)) ## # A tibble: 366 x 23 ## date Events Max.TemperatureF Mean.Temperatur~ Min.TemperatureF ## <chr> <chr> <dbl> <dbl> <dbl> ## 1 2014~ Rain 64 52 39 ## 2 2014~ Rain-~ 42 38 33 ## 3 2014~ Rain 51 44 37 ## 4 2014~ "" 43 37 30 ## 5 2014~ Rain 42 34 26 ## 6 2014~ Rain 45 42 38 ## 7 2014~ Rain 38 30 21 ## 8 2014~ Snow 29 24 18 ## 9 2014~ Rain 49 39 29 ## 10 2014~ Rain 48 43 38 ## # ... 
with 356 more rows, and 18 more variables: Max.Dew.PointF <dbl>, ## # MeanDew.PointF <dbl>, Min.DewpointF <dbl>, Max.Humidity <dbl>, ## # Mean.Humidity <dbl>, Min.Humidity <dbl>, ## # Max.Sea.Level.PressureIn <dbl>, Mean.Sea.Level.PressureIn <dbl>, ## # Min.Sea.Level.PressureIn <dbl>, Max.VisibilityMiles <dbl>, ## # Mean.VisibilityMiles <dbl>, Min.VisibilityMiles <dbl>, ## # Max.Wind.SpeedMPH <dbl>, Mean.Wind.SpeedMPH <dbl>, ## # Max.Gust.SpeedMPH <dbl>, PrecipitationIn <dbl>, CloudCover <dbl>, ## # WindDirDegrees <dbl>
  • 8. Convert the date column from text to date Magrittr (the pipe ‘%>%’) makes it super easy to chain many different functions into one another. It also makes it very easy to follow someone’s train of thought. It improves the readability of code and, hence, makes it much easier to debug incorrect code! weather_tbl %>% pivot_longer(cols = starts_with('x'), names_to = 'day', values_to = 'measurement', values_drop_na = T, names_prefix = 'x', names_ptypes = list(day = integer())) %>% pivot_wider(names_from = measure,values_from = measurement) %>% unite(date, year, month,day, sep = '/') %>% select(date,Events, everything()) %>% mutate_at(vars(Max.TemperatureF:WindDirDegrees), funs(as.numeric)) %>% mutate_at(vars(date),funs(as.Date)) ## # A tibble: 366 x 23 ## date Events Max.TemperatureF Mean.Temperatur~ Min.TemperatureF ## <date> <chr> <dbl> <dbl> <dbl> ## 1 2014-12-01 Rain 64 52 39 ## 2 2014-12-02 Rain-~ 42 38 33 ## 3 2014-12-03 Rain 51 44 37 ## 4 2014-12-04 "" 43 37 30 ## 5 2014-12-05 Rain 42 34 26 ## 6 2014-12-06 Rain 45 42 38 ## 7 2014-12-07 Rain 38 30 21 ## 8 2014-12-08 Snow 29 24 18 ## 9 2014-12-09 Rain 49 39 29 ## 10 2014-12-10 Rain 48 43 38 ## # ... with 356 more rows, and 18 more variables: Max.Dew.PointF <dbl>, ## # MeanDew.PointF <dbl>, Min.DewpointF <dbl>, Max.Humidity <dbl>, ## # Mean.Humidity <dbl>, Min.Humidity <dbl>, ## # Max.Sea.Level.PressureIn <dbl>, Mean.Sea.Level.PressureIn <dbl>, ## # Min.Sea.Level.PressureIn <dbl>, Max.VisibilityMiles <dbl>, ## # Mean.VisibilityMiles <dbl>, Min.VisibilityMiles <dbl>, ## # Max.Wind.SpeedMPH <dbl>, Mean.Wind.SpeedMPH <dbl>, ## # Max.Gust.SpeedMPH <dbl>, PrecipitationIn <dbl>, CloudCover <dbl>, ## # WindDirDegrees <dbl>