Skip to content
Snippets Groups Projects
Commit 45d8f4de authored by Yin Huai's avatar Yin Huai Committed by Michael Armbrust
Browse files

[SPARK-2919] [SQL] Basic support for analyze command in HiveQl

The command we will support is
```
ANALYZE TABLE tablename COMPUTE STATISTICS noscan
```
Other cases shown in https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables will still be treated as Hive native commands.

JIRA: https://issues.apache.org/jira/browse/SPARK-2919

Author: Yin Huai <huai@cse.ohio-state.edu>

Closes #1848 from yhuai/sqlAnalyze and squashes the following commits:

0b79d36 [Yin Huai] Typo and format.
c59d94b [Yin Huai] Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan".
parent c874723f
No related branches found
No related tags found
No related merge requests found
...@@ -46,6 +46,8 @@ private[hive] case class AddFile(filePath: String) extends Command ...@@ -46,6 +46,8 @@ private[hive] case class AddFile(filePath: String) extends Command
private[hive] case class DropTable(tableName: String, ifExists: Boolean) extends Command private[hive] case class DropTable(tableName: String, ifExists: Boolean) extends Command
// Logical command node produced for "ANALYZE TABLE tableName COMPUTE STATISTICS noscan";
// all other ANALYZE variants are passed through to Hive as native commands.
private[hive] case class AnalyzeTable(tableName: String) extends Command
/** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */ /** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */
private[hive] object HiveQl { private[hive] object HiveQl {
protected val nativeCommands = Seq( protected val nativeCommands = Seq(
...@@ -74,7 +76,6 @@ private[hive] object HiveQl { ...@@ -74,7 +76,6 @@ private[hive] object HiveQl {
"TOK_CREATEFUNCTION", "TOK_CREATEFUNCTION",
"TOK_DROPFUNCTION", "TOK_DROPFUNCTION",
"TOK_ANALYZE",
"TOK_ALTERDATABASE_PROPERTIES", "TOK_ALTERDATABASE_PROPERTIES",
"TOK_ALTERINDEX_PROPERTIES", "TOK_ALTERINDEX_PROPERTIES",
"TOK_ALTERINDEX_REBUILD", "TOK_ALTERINDEX_REBUILD",
...@@ -92,7 +93,6 @@ private[hive] object HiveQl { ...@@ -92,7 +93,6 @@ private[hive] object HiveQl {
"TOK_ALTERTABLE_SKEWED", "TOK_ALTERTABLE_SKEWED",
"TOK_ALTERTABLE_TOUCH", "TOK_ALTERTABLE_TOUCH",
"TOK_ALTERTABLE_UNARCHIVE", "TOK_ALTERTABLE_UNARCHIVE",
"TOK_ANALYZE",
"TOK_CREATEDATABASE", "TOK_CREATEDATABASE",
"TOK_CREATEFUNCTION", "TOK_CREATEFUNCTION",
"TOK_CREATEINDEX", "TOK_CREATEINDEX",
...@@ -239,7 +239,6 @@ private[hive] object HiveQl { ...@@ -239,7 +239,6 @@ private[hive] object HiveQl {
ShellCommand(sql.drop(1)) ShellCommand(sql.drop(1))
} else { } else {
val tree = getAst(sql) val tree = getAst(sql)
if (nativeCommands contains tree.getText) { if (nativeCommands contains tree.getText) {
NativeCommand(sql) NativeCommand(sql)
} else { } else {
...@@ -387,6 +386,22 @@ private[hive] object HiveQl { ...@@ -387,6 +386,22 @@ private[hive] object HiveQl {
ifExists) => ifExists) =>
val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".") val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".")
DropTable(tableName, ifExists.nonEmpty) DropTable(tableName, ifExists.nonEmpty)
// Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan"
case Token("TOK_ANALYZE",
Token("TOK_TAB", Token("TOK_TABNAME", tableNameParts) :: partitionSpec) ::
isNoscan) =>
// Reference:
// https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables
if (partitionSpec.nonEmpty) {
// Analyze partitions will be treated as a Hive native command.
NativePlaceholder
} else if (isNoscan.isEmpty) {
// If users do not specify "noscan", it will be treated as a Hive native command.
NativePlaceholder
} else {
val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".")
AnalyzeTable(tableName)
}
// Just fake explain for any of the native commands. // Just fake explain for any of the native commands.
case Token("TOK_EXPLAIN", explainArgs) case Token("TOK_EXPLAIN", explainArgs)
if noExplainCommands.contains(explainArgs.head.getText) => if noExplainCommands.contains(explainArgs.head.getText) =>
......
...@@ -83,6 +83,8 @@ private[hive] trait HiveStrategies { ...@@ -83,6 +83,8 @@ private[hive] trait HiveStrategies {
case DropTable(tableName, ifExists) => execution.DropTable(tableName, ifExists) :: Nil case DropTable(tableName, ifExists) => execution.DropTable(tableName, ifExists) :: Nil
case AnalyzeTable(tableName) => execution.AnalyzeTable(tableName) :: Nil
case describe: logical.DescribeCommand => case describe: logical.DescribeCommand =>
val resolvedTable = context.executePlan(describe.table).analyzed val resolvedTable = context.executePlan(describe.table).analyzed
resolvedTable match { resolvedTable match {
......
...@@ -23,6 +23,32 @@ import org.apache.spark.sql.catalyst.expressions.Row ...@@ -23,6 +23,32 @@ import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.sql.execution.{Command, LeafNode} import org.apache.spark.sql.execution.{Command, LeafNode}
import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.HiveContext
/**
 * :: DeveloperApi ::
 * Analyzes the given table in the current database to generate statistics, which will be
 * used in query optimizations.
 *
 * Right now, it only supports Hive tables and it only updates the size of a Hive table
 * in the Hive metastore.
 */
@DeveloperApi
case class AnalyzeTable(tableName: String) extends LeafNode with Command {
// This command only ever runs inside a HiveContext; the cast makes that assumption
// explicit and fails fast for any other SQLContext.
def hiveContext = sqlContext.asInstanceOf[HiveContext]
// ANALYZE TABLE produces no result rows, so there are no output attributes.
def output = Seq.empty
// Lazily triggers the metastore update exactly once; the result set is always empty.
override protected[sql] lazy val sideEffectResult = {
hiveContext.analyze(tableName)
Seq.empty[Any]
}
// Forces the side effect above, then returns an empty RDD since there is no output.
override def execute(): RDD[Row] = {
sideEffectResult
sparkContext.emptyRDD[Row]
}
}
/** /**
* :: DeveloperApi :: * :: DeveloperApi ::
* Drops a table from the metastore and removes it if it is cached. * Drops a table from the metastore and removes it if it is cached.
......
...@@ -19,13 +19,54 @@ package org.apache.spark.sql.hive ...@@ -19,13 +19,54 @@ package org.apache.spark.sql.hive
import scala.reflect.ClassTag import scala.reflect.ClassTag
import org.apache.spark.sql.{SQLConf, QueryTest} import org.apache.spark.sql.{SQLConf, QueryTest}
import org.apache.spark.sql.catalyst.plans.logical.NativeCommand
import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin} import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin}
import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive._
class StatisticsSuite extends QueryTest { class StatisticsSuite extends QueryTest {
test("parse analyze commands") {
  // Asserts that `analyzeCommand` parses into a plan whose single operator has class `c`.
  def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) {
    val parsed = HiveQl.parseSql(analyzeCommand)
    // Collect every operator in the parsed plan. (A preceding `case a: AnalyzeTable => a`
    // pattern would be dead code: this catch-all already matches every node.)
    val operators = parsed.collect {
      case o => o
    }
    // The parsed plan for an ANALYZE command is always a single (leaf) operator.
    assert(operators.size === 1)
    if (operators(0).getClass != c) {
      fail(
        s"""$analyzeCommand expected command: $c, but got ${operators(0)}
           |parsed command:
           |$parsed
         """.stripMargin)
    }
  }

  // Without "noscan" the statement is delegated to Hive as a native command.
  assertAnalyzeCommand(
    "ANALYZE TABLE Table1 COMPUTE STATISTICS",
    classOf[NativeCommand])
  // Any partition spec (static or dynamic) is also treated as a native command.
  assertAnalyzeCommand(
    "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS",
    classOf[NativeCommand])
  assertAnalyzeCommand(
    "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan",
    classOf[NativeCommand])
  assertAnalyzeCommand(
    "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS",
    classOf[NativeCommand])
  assertAnalyzeCommand(
    "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS noscan",
    classOf[NativeCommand])
  // "noscan" is matched case-insensitively and yields our AnalyzeTable command.
  assertAnalyzeCommand(
    "ANALYZE TABLE Table1 COMPUTE STATISTICS nOscAn",
    classOf[AnalyzeTable])
}
test("analyze MetastoreRelations") { test("analyze MetastoreRelations") {
def queryTotalSize(tableName: String): BigInt = def queryTotalSize(tableName: String): BigInt =
catalog.lookupRelation(None, tableName).statistics.sizeInBytes catalog.lookupRelation(None, tableName).statistics.sizeInBytes
...@@ -37,7 +78,7 @@ class StatisticsSuite extends QueryTest { ...@@ -37,7 +78,7 @@ class StatisticsSuite extends QueryTest {
assert(queryTotalSize("analyzeTable") === defaultSizeInBytes) assert(queryTotalSize("analyzeTable") === defaultSizeInBytes)
analyze("analyzeTable") sql("ANALYZE TABLE analyzeTable COMPUTE STATISTICS noscan")
assert(queryTotalSize("analyzeTable") === BigInt(11624)) assert(queryTotalSize("analyzeTable") === BigInt(11624))
...@@ -66,7 +107,7 @@ class StatisticsSuite extends QueryTest { ...@@ -66,7 +107,7 @@ class StatisticsSuite extends QueryTest {
assert(queryTotalSize("analyzeTable_part") === defaultSizeInBytes) assert(queryTotalSize("analyzeTable_part") === defaultSizeInBytes)
analyze("analyzeTable_part") sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan")
assert(queryTotalSize("analyzeTable_part") === BigInt(17436)) assert(queryTotalSize("analyzeTable_part") === BigInt(17436))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment