// 117. Read a Parquet file into a DataFrame
// Load the Parquet file as a DataFrame
val df = sqlContext.read.parquet("people.parquet")
// Expose the DataFrame to SQL under the name "people"
df.registerTempTable("people")
// Full contents and inferred schema of the table (queries are lazy,
// so reusing one DataFrame is equivalent to issuing the query twice)
val everyone = sqlContext.sql("SELECT * FROM people")
everyone.show()
everyone.printSchema()
// Project just the "name" column
sqlContext.sql("SELECT name FROM people").show()
// Increment "age" by 1, aliased back to age
sqlContext.sql("SELECT name, (age + 1) as age FROM people").show()
// People strictly older than 21
sqlContext.sql("SELECT * FROM people WHERE age > 21").show()
// Number of people at each age
sqlContext.sql("SELECT age, count(age) as count FROM people Group By age").show()
// 7.7
// 122. Infer a DataFrame schema from a case class
case class Person(name: String, age: Int)
// Required for the implicit RDD -> DataFrame conversion that provides toDF()
import sqlContext.implicits._
// Build the DataFrame: split each comma-separated line into a Person
// (line was truncated in the original source; restored the standard .toDF() call)
val df = sc.textFile("people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
df.registerTempTable("people")
val teenagers = sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
// A SQL query returns a DataFrame, so normal RDD operations apply
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
// Access a column by name rather than by position
teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
// row.getValuesMap[T] returns a Map[String, T] of the requested columns
teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)
// 7.8
// 156. Load a CSV file into a DataFrame via the spark-csv third-party package
import com.databricks.spark.csv
val df = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true") // Use first line of all files as header
  .option("inferSchema", "true") // Automatically infer data types
  .load("sfpd.csv")
// Inspect the inferred schema
df.printSchema
// Distinct values of the Category column, via the DataFrame API
df.select("Category").distinct().collect().foreach(println)
// Register as a temp table so the same data can be queried with SQL
df.registerTempTable("sfpd")
// Same distinct-category query, via SQL
sqlContext.sql("SELECT distinct Category FROM sfpd").collect().foreach(println)
// Top 10 resolutions by frequency
// (line was truncated in the original source; restored the obvious ORDER BY/LIMIT)
sqlContext.sql("SELECT Resolution , count(Resolution) as rescount FROM sfpd group by Resolution order by rescount desc limit 10").show()
// Top 10 categories by frequency (restored the same way)
sqlContext.sql("SELECT Category , count(Category) as catcount FROM sfpd group by Category order by catcount desc limit 10").show()
// 8.6