SparkSQL Syntax Parsing
dive into SparkSQL
Yuri Inoue, CyberAgent, Inc.
@iyunoriue
http://x1.inkenkun.com/
sqlContext.sql("SELECT * FROM records")
def sql(sqlText: String): DataFrame = {
  DataFrame(this, parseSql(sqlText))
}
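A quick way to see this from user code (a minimal sketch against the Spark 1.x API; `records` is assumed to be a registered temp table): sql() only parses the text into a LogicalPlan wrapped in a DataFrame, and nothing runs until an action is called.

// Minimal sketch (Spark 1.x): sql() parses but does not execute.
val df = sqlContext.sql("SELECT * FROM records")
println(df.queryExecution.logical) // the LogicalPlan built by parseSql
df.show()                          // execution only happens here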
Shown as a diagram, it looks like this:
[Diagram: sqlContext.parseSql takes sqlText: String and returns a LogicalPlan, which is wrapped in a DataFrame]
protected[sql] def parseSql(sql: String): LogicalPlan =
  ddlParser.parse(sql, false)

@transient
protected[sql] val ddlParser =
  new DDLParser(sqlParser.parse(_))

@transient
protected[sql] val sqlParser =
  new SparkSQLParser(getSQLDialect().parse(_))

protected[sql] def getSQLDialect(): ParserDialect = {
  try {
    val clazz = Utils.classForName(dialectClassName)
    clazz.newInstance().asInstanceOf[ParserDialect]
  } catch {
    case NonFatal(e) =>
      :
  }
}
Three players are involved: DDLParser, SparkSQLParser, and ParserDialect ("dialect" as in a regional variant of a language). The dialect is resolved by name:

sql ⇒ org.apache.spark.sql.catalyst.DefaultParserDialect
hiveql ⇒ org.apache.spark.sql.hive.HiveQLDialect
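For reference, a hedged sketch of switching dialects from user code (Spark 1.x exposes this through the spark.sql.dialect configuration key):

// Spark 1.x: the dialect class is selected via "spark.sql.dialect".
sqlContext.setConf("spark.sql.dialect", "sql") // DefaultParserDialect
// On a HiveContext the default is "hiveql"    // HiveQLDialect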
Shown as a diagram, it looks like this:
[Diagram: DDLParser.parse runs first; on success it returns a LogicalPlan, on failure it falls back to SparkSQLParser.parse, which likewise returns a LogicalPlan on success and falls back to ParserDialect.parse on failure]
DDLParser and SparkSQLParser are both subclasses of AbstractSparkSQLParser.

Note, however, that DDLParser overrides AbstractSparkSQLParser's parse, while SparkSQLParser uses AbstractSparkSQLParser's parse as-is, so the two parse implementations differ.
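A simplified sketch of how the fallback is wired (condensed from the Spark 1.x sources, error handling trimmed): the function passed to the constructor, e.g. sqlParser.parse(_), is invoked when DDL parsing fails.

// Condensed sketch of DDLParser; `parseQuery` is the fallback function
// given to the constructor, e.g. sqlParser.parse(_).
private[sql] class DDLParser(parseQuery: String => LogicalPlan)
  extends AbstractSparkSQLParser {

  def parse(input: String, exceptionOnError: Boolean): LogicalPlan =
    try {
      parse(input)              // try the DDL grammar first
    } catch {
      case _: Throwable if !exceptionOnError =>
        parseQuery(input)       // fall back to SparkSQLParser
    }
  :
}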
private[sql] abstract class AbstractSparkSQLParser
  extends StandardTokenParsers with PackratParsers {

  def parse(input: String): LogicalPlan = {
    // Initialize the Keywords.
    lexical.initialize(reservedWords)
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(plan, _) => plan
      case failureOrError =>
        sys.error(failureOrError.toString)
    }
  }
  :
}
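As a self-contained illustration of this StandardTokenParsers + PackratParsers pattern (a toy grammar for demonstration, not Spark code):

import scala.util.parsing.combinator.PackratParsers
import scala.util.parsing.combinator.syntactical.StandardTokenParsers

// Toy grammar in the same style: SELECT <cols> FROM <table>
object ToyParser extends StandardTokenParsers with PackratParsers {
  lexical.reserved ++= Seq("SELECT", "FROM")
  lexical.delimiters ++= Seq(",", "*")

  lazy val columns: PackratParser[List[String]] =
    "*" ^^^ List("*") | rep1sep(ident, ",")

  lazy val start: PackratParser[(List[String], String)] =
    "SELECT" ~> columns ~ ("FROM" ~> ident) ^^ { case cs ~ t => (cs, t) }

  def parse(input: String): (List[String], String) =
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(result, _) => result
      case failureOrError     => sys.error(failureOrError.toString)
    }
}

// ToyParser.parse("SELECT a, b FROM records")  // => (List(a, b), records)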
Next, let's look at these two.
class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
  protected val ABS = Keyword("ABS")
  protected val ALL = Keyword("ALL")
  :
  protected val WHEN = Keyword("WHEN")
  protected val WHERE = Keyword("WHERE")
  protected val WITH = Keyword("WITH")

  protected lazy val start: Parser[LogicalPlan] =
    start1 | insert | cte

  protected lazy val start1: Parser[LogicalPlan] =
    (select | ("(" ~> select <~ ")")) *
    ( UNION ~ ALL ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Union(q1, q2) }
    | INTERSECT ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Intersect(q1, q2) }
    | EXCEPT ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Except(q1, q2) }
    | UNION ~ DISTINCT.? ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Distinct(Union(q1, q2)) }
    )

  protected lazy val select: Parser[LogicalPlan] =
    SELECT ~> DISTINCT.? ~
    repsep(projection, ",") ~
    (FROM ~> relations).? ~
    (WHERE ~> expression).? ~
    (GROUP ~ BY ~> rep1sep(expression, ",")).? ~
    (HAVING ~> expression).? ~
    sortType.? ~
    (LIMIT ~> expression).? ^^ {
      case d ~ p ~ r ~ f ~ g ~ h ~ o ~ l =>
        val base = r.getOrElse(OneRowRelation)
        val withFilter = f.map(Filter(_, base)).getOrElse(base)
        val withProjection = g
          .map(Aggregate(_, assignAliases(p), withFilter))
          .getOrElse(Project(assignAliases(p), withFilter))
        val withDistinct = d.map(_ => Distinct(withProjection)).getOrElse(withProjection)
        val withHaving = h.map(Filter(_, withDistinct)).getOrElse(withDistinct)
        val withOrder = o.map(_(withHaving)).getOrElse(withHaving)
        val withLimit = l.map(Limit(_, withOrder)).getOrElse(withOrder)
        withLimit
    }

  protected lazy val insert: Parser[LogicalPlan] =
    INSERT ~> (OVERWRITE ^^^ true | INTO ^^^ false) ~ (TABLE ~> relation) ~ select ^^ {
      case o ~ r ~ s => InsertIntoTable(r, Map.empty[String, Option[String]], s, o, false)
    }
  :
}
That's right: this is the familiar parser-combinator library.
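Reading the select rule bottom-up also shows what tree a concrete query builds. A hedged sketch (the nesting follows the rule above; the exact printed form varies by Spark version):

// SELECT name FROM records WHERE age > 10 LIMIT 5 is folded roughly into:
//
//   Limit(5,                                // l: LIMIT applied last
//     Project(Seq('name),                   // p: projection (no GROUP BY)
//       Filter('age > 10,                   // f: WHERE
//         UnresolvedRelation("records"))))  // r: FROM
val plan = sqlContext
  .sql("SELECT name FROM records WHERE age > 10 LIMIT 5")
  .queryExecution.logical
println(plan)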
Project Tungsten

Part of query parsing is being moved from parser combinators to runtime code generation.

Why? Because parser combinators are by no means fast (time complexity: O(n^k)). Instead, quasiquotes are used to generate code at runtime.

Deep Dive into Spark SQL's Catalyst Optimizer:
https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html
def compile(node: Node): AST = node match {
  case Literal(value) => q"$value"
  case Attribute(name) => q"row.get($name)"
  case Add(left, right) => q"${compile(left)} + ${compile(right)}"
}
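To see quasiquotes at work outside Spark, a minimal runnable sketch (plain Scala 2.11 runtime reflection; needs scala-compiler on the classpath; QuasiquoteDemo is just an illustrative name):

import scala.reflect.runtime.currentMirror
import scala.tools.reflect.ToolBox

object QuasiquoteDemo extends App {
  val toolbox = currentMirror.mkToolBox()
  import toolbox.u._

  // q"..." builds a syntax tree at runtime; ${...} splices sub-trees,
  // just like compile() above.
  val left = q"1 + 2"
  val tree = q"$left * 10"

  println(showCode(tree))     // the generated source, e.g. 1.+(2).*(10)
  println(toolbox.eval(tree)) // compiles and runs the tree => 30
}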
