Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

Data Visualization in R

849 views

Published on

Data Visualization in R
With code and examples
From Greater Toronto Area (GTA) R User Group
2016-03-09

Published in: Data & Analytics
  • Be the first to comment

Data Visualization in R

  1. 1. Data Visualization in R Greater Toronto Area R User Group (RGTA) Wednesday, March 9, 2016 @everydayanalyst Myles Harrison www.everydayanalytics.ca myles@mylesharrison.com
  2. 2. ### BASIC PLOTTING ### # basic scatterplot x <- rnorm(100) y <- rnorm(100) plot(x,y)
  3. 3. ### BASIC PLOTTING ### # basic scatterplot x <- rnorm(100) y <- rnorm(100) plot(x,y)
  4. 4. # scatterplot matrix by default z <- rnorm(100) d <- data.frame(x,y,z) plot(d)
  5. 5. # scatterplot matrix by default z <- rnorm(100) d <- data.frame(x,y,z) plot(d)
  6. 6. # More scatterplots plot(mtcars$disp, mtcars$mpg)
  7. 7. # More scatterplots plot(mtcars$disp, mtcars$mpg)
  8. 8. # More scatterplots plot(mtcars$disp, mtcars$mpg) plot(mtcars$disp, mtcars$mpg, pch=16, col='red', xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set')
  9. 9. # More scatterplots plot(mtcars$disp, mtcars$mpg) plot(mtcars$disp, mtcars$mpg, pch=16, col='red', xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set')
  10. 10. plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set')
  11. 11. plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set')
  12. 12. plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') col=mtcars$gear
  13. 13. plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, cex=mtcars$cyl, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') col=mtcars$gear
  14. 14. plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, cex=mtcars$cyl, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') col=mtcars$gear
  15. 15. plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') plot(mtcars$disp, mtcars$mpg, pch=16, col=mtcars$gear, cex=mtcars$cyl, xlab='Displacement (cu. in.)', ylab='Miles per Gallon (mpg)', main='MPG vs. Displacement for mtcars data set') col=mtcars$gear cex=mtcars$cyl
  16. 16. gears <- unique(sort(mtcars$gear)) legend("topright",legend=gears,fill=gears)
  17. 17. gears <- unique(sort(mtcars$gear)) legend("topright",legend=gears,fill=gears)
  18. 18. gears <- unique(sort(mtcars$gear)) legend("topright",legend=gears,fill=gears) l <- lm(mpg~disp, data=mtcars) abline(l)
  19. 19. gears <- unique(sort(mtcars$gear)) legend("topright",legend=gears,fill=gears) l <- lm(mpg~disp, data=mtcars) abline(l)
  20. 20. # line plots y <- cumsum(rnorm(1000)) plot(y,type='l') type='l'
  21. 21. # line plots y <- cumsum(rnorm(1000)) plot(y,type='l') type='l'
  22. 22. y2 <- cumsum(rnorm(1000)) plot(y,type='l',lwd=2, col='red', xlab='x variable', ylab='y variable', main='y vs. x')
  23. 23. y2 <- cumsum(rnorm(1000)) plot(y,type='l',lwd=2, col='red', xlab='x variable', ylab='y variable', main='y vs. x')
  24. 24. y2 <- cumsum(rnorm(1000)) plot(y,type='l',lwd=2, col='red', xlab='x variable', ylab='y variable', main='y vs. x') lines(y2,lwd=1,lty=2,col='black')
  25. 25. y2 <- cumsum(rnorm(1000)) plot(y,type='l',lwd=2, col='red', xlab='x variable', ylab='y variable', main='y vs. x') lines(y2,lwd=1,lty=2,col='black')
  26. 26. 1 2 3 4 5 6
  27. 27. # Histograms & normal curves # x <- rnorm(10000) hist(x)
  28. 28. # Histograms & normal curves # x <- rnorm(10000) hist(x)
  29. 29. # Histograms & normal curves # x <- rnorm(10000) hist(x) hist(x, breaks=100, col='red')
  30. 30. # Histograms & normal curves # x <- rnorm(10000) hist(x) hist(x, breaks=100, col='red')
  31. 31. h <- hist(x,breaks=100, col='red') plot(h$mids, h$counts, type='b', col='red', pch=16, xlab='bin', ylab='count')
  32. 32. h <- hist(x,breaks=100, col='red') plot(h$mids, h$counts, type='b', col='red', pch=16, xlab='bin', ylab='count')
  33. 33. # Overlay / interpolate density plot(h, col='red')
  34. 34. # Overlay / interpolate density plot(h, col='red')
  35. 35. # Overlay / interpolate density plot(h, col='red') multiplier <- h$counts / h$density curve(dnorm(x)*multiplier[1], add=T, lwd=2)
  36. 36. # Overlay / interpolate density plot(h, col='red') multiplier <- h$counts / h$density curve(dnorm(x)*multiplier[1], add=T, lwd=2)
  37. 37. # Boxplots # boxplot(mpg~cyl,data=mtcars, main="Car Milage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon")
  38. 38. # Boxplots # boxplot(mpg~cyl,data=mtcars, main="Car Milage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon")
  39. 39. # Boxplots # boxplot(mpg~cyl,data=mtcars, main="Car Milage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon") boxplot(mpg~cyl,data=mtcars, main="Car Milage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon", col=c("red","green","blue"))
  40. 40. # Boxplots # boxplot(mpg~cyl,data=mtcars, main="Car Milage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon") boxplot(mpg~cyl,data=mtcars, main="Car Milage Data", xlab="Number of Cylinders", ylab="Miles Per Gallon", col=c("red","green","blue"))
  41. 41. # with jitter # x <- rnorm(500, mean=1, sd=5) y <- rnorm(500, mean=5, sd=6.6) z <- rnorm(500, mean=10, sd=3.3) d <- data.frame(x,y,z) boxplot(d)
  42. 42. # with jitter # x <- rnorm(500, mean=1, sd=5) y <- rnorm(500, mean=5, sd=6.6) z <- rnorm(500, mean=10, sd=3.3) d <- data.frame(x,y,z) boxplot(d)
  43. 43. # with jitter # x <- rnorm(500, mean=1, sd=5) y <- rnorm(500, mean=5, sd=6.6) z <- rnorm(500, mean=10, sd=3.3) d <- data.frame(x,y,z) boxplot(d) stripchart(d, vertical=TRUE, method="jitter", add=TRUE, pch=16, col = rgb(0,0,0,0.5), cex=0.1) col = rgb(0,0,0,0.5)
  44. 44. # with jitter # x <- rnorm(500, mean=1, sd=5) y <- rnorm(500, mean=5, sd=6.6) z <- rnorm(500, mean=10, sd=3.3) d <- data.frame(x,y,z) boxplot(d) stripchart(d, vertical=TRUE, method="jitter", add=TRUE, pch=16, col = rgb(0,0,0,0.5), cex=0.1) col = rgb(0,0,0,0.5)
  45. 45. # violin plot # library(vioplot) vioplot(x,y,z, col='white', names=c('x','y','z')) title('Violin Plot')
  46. 46. # violin plot # library(vioplot) vioplot(x,y,z, col='white', names=c('x','y','z')) title('Violin Plot')
  47. 47. Credit: Earl F. Glynn http://research.stowers-institute.org/efg/R/Color/Chart/index.htm
  48. 48. ## PALETTES ##
  49. 49. # COLOR PALETTES IN PRACTICE y <- cumsum(rnorm(1000)) plot(y, type='o')
  50. 50. n = 256 palette(heat.colors(n)) # Create scaled colour vector cols <- (y - min(y))/(max(y) - min(y))*n # plot plot(y,col=cols, pch=16)
  51. 51. n = 256 palette(heat.colors(n)) # Create scaled colour vector cols <- (y - min(y))/(max(y) - min(y))*n # plot plot(y,col=cols, pch=16)
  52. 52. # ColorRampPalette myPalette <- colorRampPalette(c("red","blue"))(256) palette(myPalette) plot(y,col=cols,pch=16)
  53. 53. # ColorRampPalette myPalette <- colorRampPalette(c("red","blue"))(256) palette(myPalette) plot(y,col=cols,pch=16)
  54. 54. ## RColorBrewer ## library(RColorBrewer)
  55. 55. ## RColorBrewer ## library(RColorBrewer) # Sequential - for continuous data from low to high display.brewer.all(type='seq')
  56. 56. ## RColorBrewer ## library(RColorBrewer) # Sequential - for continuous data from low to high display.brewer.all(type='seq')
  57. 57. ## RColorBrewer ## library(RColorBrewer) # Sequential - for continuous data from low to high display.brewer.all(type='seq') # Qualitative - for categorical data display.brewer.all(type='qual')
  58. 58. ## RColorBrewer ## library(RColorBrewer) # Sequential - for continuous data from low to high display.brewer.all(type='seq') # Qualitative - for categorical data display.brewer.all(type='qual')
  59. 59. ## RColorBrewer ## library(RColorBrewer) # Sequential - for continuous data from low to high display.brewer.all(type='seq') # Qualitative - for categorical data display.brewer.all(type='qual') # Diverging emphasizes mid and extremes (e.g. -ve to +ve) display.brewer.all(type='div')
  60. 60. ## RColorBrewer ## library(RColorBrewer) # Sequential - for continuous data from low to high display.brewer.all(type='seq') # Qualitative - for categorical data display.brewer.all(type='qual') # Diverging emphasizes mid and extremes (e.g. -ve to +ve) display.brewer.all(type='div')
  61. 61. ## RColorBrewer in practice ## # dense scatterplot x <- c(rnorm(5000, sd=3.4, mean=5), rnorm(5000,sd=1.5,mean=-1), rnorm(5000,sd=2.2,mean=11)) y <- c(rnorm(5000, sd=1.6, mean=-2), rnorm(5000,sd=2.5,mean=10), rnorm(5000,sd=1,mean=5)) plot(x,y)
  62. 62. ## RColorBrewer in practice ## # dense scatterplot x <- c(rnorm(5000, sd=3.4, mean=5), rnorm(5000,sd=1.5,mean=-1), rnorm(5000,sd=2.2,mean=11)) y <- c(rnorm(5000, sd=1.6, mean=-2), rnorm(5000,sd=2.5,mean=10), rnorm(5000,sd=1,mean=5)) plot(x,y)
  63. 63. ## RColorBrewer in practice ## # dense scatterplot x <- c(rnorm(5000, sd=3.4, mean=5), rnorm(5000,sd=1.5,mean=-1), rnorm(5000,sd=2.2,mean=11)) y <- c(rnorm(5000, sd=1.6, mean=-2), rnorm(5000,sd=2.5,mean=10), rnorm(5000,sd=1,mean=5)) plot(x,y) # interpolate using KDE2D library(MASS) k <- kde2d(x,y,n=150) # ugly (using heat.colours) image(k)
  64. 64. ## RColorBrewer in practice ## # dense scatterplot x <- c(rnorm(5000, sd=3.4, mean=5), rnorm(5000,sd=1.5,mean=-1), rnorm(5000,sd=2.2,mean=11)) y <- c(rnorm(5000, sd=1.6, mean=-2), rnorm(5000,sd=2.5,mean=10), rnorm(5000,sd=1,mean=5)) plot(x,y) # interpolate using KDE2D library(MASS) k <- kde2d(x,y,n=150) # ugly (using heat.colours) image(k)
  65. 65. # nice (using RColorBrewer) cols <- colorRampPalette(brewer.pal(11,'Spectral'))(256) cols2 <- colorRampPalette(brewer.pal(11,'RdYlGn'))(256) image(k, col=rev(cols)) image(k, col=rev(cols2))
  66. 66. # nice (using RColorBrewer) cols <- colorRampPalette(brewer.pal(11,'Spectral'))(256) cols2 <- colorRampPalette(brewer.pal(11,'RdYlGn'))(256) image(k, col=rev(cols)) image(k, col=rev(cols2))
  67. 67. Exploratory Data Analysis
  68. 68. Exploratory Data Analysis Explanatory Data Analysis
  69. 69. Exploratory Data Analysis Explanatory Data Analysis
  70. 70. Exploratory Data Analysis Explanatory Data Analysis
  71. 71. Exploratory Data Analysis Explanatory Data Analysis
  72. 72. Exploratory Data Analysis Explanatory Data Analysis
  73. 73. ## GGPLOT ## # scatterplots library(ggplot2) x <- rnorm(100) y <- rnorm(100) qplot(x)
  74. 74. ## GGPLOT ## # scatterplots library(ggplot2) x <- rnorm(100) y <- rnorm(100) qplot(x)
  75. 75. ## GGPLOT ## # scatterplots library(ggplot2) x <- rnorm(100) y <- rnorm(100) qplot(x) qplot(x,y)
  76. 76. ## GGPLOT ## # scatterplots library(ggplot2) x <- rnorm(100) y <- rnorm(100) qplot(x) qplot(x,y)
  77. 77. # ggplot objects z <- rnorm(100) d <- data.frame(x,y,z) ggplot(d, aes(x,y))
  78. 78. # ggplot objects z <- rnorm(100) d <- data.frame(x,y,z) ggplot(d, aes(x,y))
  79. 79. # ggplot objects z <- rnorm(100) d <- data.frame(x,y,z) ggplot(d, aes(x,y)) ggplot(d, aes(x,y)) + geom_point()
  80. 80. # ggplot objects z <- rnorm(100) d <- data.frame(x,y,z) ggplot(d, aes(x,y)) ggplot(d, aes(x,y)) + geom_point()
  81. 81. # Scatterplots in ggplot g <- ggplot(mtcars, aes(disp, mpg)) g + geom_point(colour='red', size=5)
  82. 82. # Scatterplots in ggplot g <- ggplot(mtcars, aes(disp, mpg)) g + geom_point(colour='red', size=5) g + geom_point(aes(colour=gear), size=5)
  83. 83. # Scatterplots in ggplot g <- ggplot(mtcars, aes(disp, mpg)) g + geom_point(colour='red', size=5) g + geom_point(aes(colour=gear), size=5)
  84. 84. # Scatterplots in ggplot g <- ggplot(mtcars, aes(disp, mpg)) g + geom_point(colour='red', size=5) g + geom_point(aes(colour=gear), size=5) g + geom_point(aes(colour=factor(gear)), size=5)
  85. 85. # Scatterplots in ggplot g <- ggplot(mtcars, aes(disp, mpg)) g + geom_point(colour='red', size=5) g + geom_point(aes(colour=gear), size=5) g + geom_point(aes(colour=factor(gear)), size=5)
  86. 86. g2 <- g + geom_point(aes(colour=factor(gear), size=cyl)) g2
  87. 87. g2 <- g + geom_point(aes(colour=factor(gear), size=cyl)) g2
  88. 88. g2 <- g + geom_point(aes(colour=factor(gear), size=cyl)) g2 g2 + scale_size(range=c(6,12))
  89. 89. g2 <- g + geom_point(aes(colour=factor(gear), size=cyl)) g2 g2 + scale_size(range=c(6,12))
  90. 90. g2 <- g + geom_point(aes(colour=factor(gear), size=cyl)) g2 g2 + scale_size(range=c(6,12)) g2 + scale_size(range=c(6,12)) + stat_smooth(method="loess", colour='black')
  91. 91. g2 <- g + geom_point(aes(colour=factor(gear), size=cyl)) g2 g2 + scale_size(range=c(6,12)) g2 + scale_size(range=c(6,12)) + stat_smooth(method="loess", colour='black')
  92. 92. # line graph x <- seq(1, 1000) y <- cumsum(rnorm(1000)) d <- data.frame(x,y) ggplot(d, aes(x,y)) + geom_line()
  93. 93. # line graph x <- seq(1, 1000) y <- cumsum(rnorm(1000)) d <- data.frame(x,y) ggplot(d, aes(x,y)) + geom_line()
  94. 94. # add another line y2 <- cumsum(rnorm(1000)) d <- data.frame(x,y,y2) ggplot(d, aes(x=x)) + geom_line(aes(y=y)) + geom_line(aes(y=y2), colour='red', linetype='dashed')
  95. 95. # add another line y2 <- cumsum(rnorm(1000)) d <- data.frame(x,y,y2) ggplot(d, aes(x=x)) + geom_line(aes(y=y)) + geom_line(aes(y=y2), colour='red', linetype='dashed')
  96. 96. # heat mapping x <- rnorm(5000) y <- rnorm(5000) d <- data.frame(x,y) ggplot(d, aes(x,y)) + stat_bin_2d()
  97. 97. # heat mapping x <- rnorm(5000) y <- rnorm(5000) d <- data.frame(x,y) ggplot(d, aes(x,y)) + stat_bin_2d()
  98. 98. # heat mapping x <- rnorm(5000) y <- rnorm(5000) d <- data.frame(x,y) ggplot(d, aes(x,y)) + stat_bin_2d() ggplot(d, aes(x,y)) + stat_binhex()
  99. 99. # heat mapping x <- rnorm(5000) y <- rnorm(5000) d <- data.frame(x,y) ggplot(d, aes(x,y)) + stat_bin_2d() ggplot(d, aes(x,y)) + stat_binhex()
  100. 100. # density plots calling kde2d ggplot(d, aes(x,y)) + stat_density_2d(aes(fill=..level..),geom='polygon')
  101. 101. # density plots calling kde2d ggplot(d, aes(x,y)) + stat_density_2d(aes(fill=..level..),geom='polygon')
  102. 102. # density plots calling kde2d ggplot(d, aes(x,y)) + stat_density_2d(aes(fill=..level..),geom='polygon') ggplot(d, aes(x,y)) + stat_density_2d(aes(fill=..level..),geom='polygon') + scale_fill_distiller(palette='Spectral')
  103. 103. # density plots calling kde2d ggplot(d, aes(x,y)) + stat_density_2d(aes(fill=..level..),geom='polygon') ggplot(d, aes(x,y)) + stat_density_2d(aes(fill=..level..),geom='polygon') + scale_fill_distiller(palette='Spectral')
  104. 104. ggmaps
  105. 105. ggmaps
  106. 106. ggmaps
  107. 107. # Interactive visualization
  108. 108. # Interactive visualization
  109. 109. # Interactive visualization
  110. 110. # Interactive visualization
  111. 111. Other possibilities...?
  112. 112. Other possibilities...?
  113. 113. Other possibilities...?
  114. 114. Other possibilities...?
  115. 115. Other possibilities...?
  116. 116. Other possibilities...?
  117. 117. Other possibilities...?
  118. 118. http://www.everydayanalytics.ca

×