Commit afd757a2 authored by Michael Armbrust

Revert "[SPARK-2410][SQL] Merging Hive Thrift/JDBC server"

This reverts commit 06dc0d2c.

#1399 is making Jenkins fail.  We should investigate and put this back after it's passing tests.

Author: Michael Armbrust <michael@databricks.com>

Closes #1594 from marmbrus/revertJDBC and squashes the following commits:

59748da [Michael Armbrust] Revert "[SPARK-2410][SQL] Merging Hive Thrift/JDBC server"
parent 37ad3b72
Showing 24 additions and 322 deletions
@@ -57,4 +57,3 @@ metastore_db/
metastore/
warehouse/
TempStatsStore/
sql/hive-thriftserver/test_warehouses
@@ -165,16 +165,6 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>hive-thriftserver</id>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive-thriftserver_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>spark-ganglia-lgpl</id>
<dependencies>
...
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-bagel_2.10</artifactId>
<properties>
<sbt.project.name>bagel</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project Bagel</name>
...
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
# Find the java binary
if [ -n "${JAVA_HOME}" ]; then
  RUNNER="${JAVA_HOME}/bin/java"
else
  if [ `command -v java` ]; then
    RUNNER="java"
  else
    echo "JAVA_HOME is not set" >&2
    exit 1
  fi
fi

# Compute classpath using external script
classpath_output=$($FWDIR/bin/compute-classpath.sh)
if [[ "$?" != "0" ]]; then
  echo "$classpath_output"
  exit 1
else
  CLASSPATH=$classpath_output
fi
CLASS="org.apache.hive.beeline.BeeLine"
exec "$RUNNER" -cp "$CLASSPATH" $CLASS "$@"
@@ -52,7 +52,6 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive-thriftserver/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
fi fi
......
@@ -46,11 +46,11 @@ function main(){
# (see https://github.com/sbt/sbt/issues/562).
stty -icanon min 1 -echo > /dev/null 2>&1
export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main spark-shell "$@"
$FWDIR/bin/spark-submit spark-shell "$@" --class org.apache.spark.repl.Main
stty icanon echo > /dev/null 2>&1
else
export SPARK_SUBMIT_OPTS
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main spark-shell "$@"
$FWDIR/bin/spark-submit spark-shell "$@" --class org.apache.spark.repl.Main
fi
}
...
@@ -19,4 +19,4 @@ rem
set SPARK_HOME=%~dp0..
cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd spark-shell --class org.apache.spark.repl.Main %*
cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd spark-shell %* --class org.apache.spark.repl.Main
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Shell script for starting the Spark SQL CLI
# Enter posix mode for bash
set -o posix
# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
echo "Usage: ./sbin/spark-sql [options]"
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
fi
CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
exec "$FWDIR"/bin/spark-submit --class $CLASS spark-internal $@
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<properties>
<sbt.project.name>core</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project Core</name>
...
@@ -46,10 +46,6 @@ object SparkSubmit {
private val CLUSTER = 2
private val ALL_DEPLOY_MODES = CLIENT | CLUSTER
// A special jar name that indicates the class being run is inside of Spark itself, and therefore
// no user jar is needed.
private val SPARK_INTERNAL = "spark-internal"
// Special primary resource names that represent shells rather than application jars.
private val SPARK_SHELL = "spark-shell"
private val PYSPARK_SHELL = "pyspark-shell"
@@ -261,9 +257,7 @@ object SparkSubmit {
// In yarn-cluster mode, use yarn.Client as a wrapper around the user class
if (clusterManager == YARN && deployMode == CLUSTER) {
childMainClass = "org.apache.spark.deploy.yarn.Client"
if (args.primaryResource != SPARK_INTERNAL) {
  childArgs += ("--jar", args.primaryResource)
}
childArgs += ("--jar", args.primaryResource)
childArgs += ("--class", args.mainClass) childArgs += ("--class", args.mainClass)
if (args.childArgs != null) { if (args.childArgs != null) {
args.childArgs.foreach { arg => childArgs += ("--arg", arg) } args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
...@@ -338,7 +332,7 @@ object SparkSubmit { ...@@ -338,7 +332,7 @@ object SparkSubmit {
* Return whether the given primary resource represents a user jar. * Return whether the given primary resource represents a user jar.
*/ */
private def isUserJar(primaryResource: String): Boolean = { private def isUserJar(primaryResource: String): Boolean = {
!isShell(primaryResource) && !isPython(primaryResource) && !isInternal(primaryResource) !isShell(primaryResource) && !isPython(primaryResource)
} }
/** /**
...@@ -355,10 +349,6 @@ object SparkSubmit { ...@@ -355,10 +349,6 @@ object SparkSubmit {
primaryResource.endsWith(".py") || primaryResource == PYSPARK_SHELL primaryResource.endsWith(".py") || primaryResource == PYSPARK_SHELL
} }
private[spark] def isInternal(primaryResource: String): Boolean = {
primaryResource == SPARK_INTERNAL
}
/**
* Merge a sequence of comma-separated file lists, some of which may be null to indicate
* no files, into a single comma-separated string.
...
@@ -204,9 +204,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
/** Fill in values by parsing user options. */
private def parseOpts(opts: Seq[String]): Unit = {
var inSparkOpts = true
// Delineates parsing of Spark options from parsing of user options.
var inSparkOpts = true
parse(opts)
def parse(opts: Seq[String]): Unit = opts match {
@@ -319,7 +318,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
SparkSubmit.printErrorAndExit(errMessage)
case v =>
primaryResource =
if (!SparkSubmit.isShell(v) && !SparkSubmit.isInternal(v)) {
if (!SparkSubmit.isShell(v)) {
Utils.resolveURI(v).toString
} else {
v
...
@@ -53,7 +53,7 @@ if [[ ! "$@" =~ --package-only ]]; then
-Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \
-Dmaven.javadoc.skip=true \
-Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl\
-Pyarn -Phive -Phadoop-2.2 -Pspark-ganglia-lgpl\
-Dtag=$GIT_TAG -DautoVersionSubmodules=true \
--batch-mode release:prepare
@@ -61,7 +61,7 @@ if [[ ! "$@" =~ --package-only ]]; then
-Darguments="-DskipTests=true -Dmaven.javadoc.skip=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
-Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-Dmaven.javadoc.skip=true \
-Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl\
-Pyarn -Phive -Phadoop-2.2 -Pspark-ganglia-lgpl\
release:perform
cd ..
@@ -111,10 +111,10 @@ make_binary_release() {
spark-$RELEASE_VERSION-bin-$NAME.tgz.sha
}
make_binary_release "hadoop1" "-Phive -Phive-thriftserver -Dhadoop.version=1.0.4"
make_binary_release "hadoop1" "-Phive -Dhadoop.version=1.0.4"
make_binary_release "cdh4" "-Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0"
make_binary_release "cdh4" "-Phive -Dhadoop.version=2.0.0-mr1-cdh4.2.0"
make_binary_release "hadoop2" \
"-Phive -Phive-thriftserver -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 -Pyarn.version=2.2.0"
"-Phive -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 -Pyarn.version=2.2.0"
# Copy data
echo "Copying release tarballs"
...
@@ -65,7 +65,7 @@ echo "========================================================================="
# (either resolution or compilation) prompts the user for input either q, r,
# etc to quit or retry. This echo is there to make it not block.
if [ -n "$_RUN_SQL_TESTS" ]; then
echo -e "q\n" | SBT_MAVEN_PROFILES="$SBT_MAVEN_PROFILES -Phive -Phive-thriftserver" sbt/sbt clean package \
echo -e "q\n" | SBT_MAVEN_PROFILES="$SBT_MAVEN_PROFILES -Phive" sbt/sbt clean package \
assembly/assembly test | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
else
echo -e "q\n" | sbt/sbt clean package assembly/assembly test | \
...
@@ -17,7 +17,7 @@
# limitations under the License.
#
echo -e "q\n" | sbt/sbt -Phive -Phive-thriftserver scalastyle > scalastyle.txt
echo -e "q\n" | sbt/sbt -Phive scalastyle > scalastyle.txt
# Check style with YARN alpha built too
echo -e "q\n" | sbt/sbt -Pyarn -Phadoop-0.23 -Dhadoop.version=0.23.9 yarn-alpha/scalastyle \
>> scalastyle.txt
...
@@ -136,7 +136,7 @@ val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.createSchemaRDD
// Define the schema using a case class.
// Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit, // Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit,
// you can use custom classes that implement the Product interface.
case class Person(name: String, age: Int)
@@ -548,6 +548,7 @@ results = hiveContext.hql("FROM src SELECT key, value").collect()
</div>
</div>
# Writing Language-Integrated Relational Queries # Writing Language-Integrated Relational Queries
**Language-Integrated queries are currently only supported in Scala.**
@@ -572,199 +573,4 @@ prefixed with a tick (`'`). Implicit conversions turn these symbols into expres
evaluated by the SQL execution engine. A full list of the functions supported can be found in the
[ScalaDoc](api/scala/index.html#org.apache.spark.sql.SchemaRDD).
<!-- TODO: Include the table of operations here. -->
\ No newline at end of file
## Running the Thrift JDBC server
The Thrift JDBC server implemented here corresponds to the [`HiveServer2`]
(https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2) in Hive 0.12. You can test
the JDBC server with the beeline script that comes with either Spark or Hive 0.12. In order to use Hive
you must first run '`sbt/sbt -Phive-thriftserver assembly/assembly`' (or use `-Phive-thriftserver`
for Maven).
To start the JDBC server, run the following in the Spark directory:
./sbin/start-thriftserver.sh
The default port the server listens on is 10000. You may run
`./sbin/start-thriftserver.sh --help` for a complete list of all available
options. Now you can use beeline to test the Thrift JDBC server:
./bin/beeline
Connect to the JDBC server in beeline with:
beeline> !connect jdbc:hive2://localhost:10000
Beeline will ask you for a username and password. In non-secure mode, simply enter the username on
your machine and a blank password. For secure mode, please follow the instructions given in the
[beeline documentation](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients).
Configuration of Hive is done by placing your `hive-site.xml` file in `conf/`.
You may also use the beeline script that comes with Hive.
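Beyond beeline, any JDBC client can talk to the server. The snippet below is only a minimal, illustrative sketch: it assumes the Hive 0.12 JDBC driver (`org.apache.hive.jdbc.HiveDriver`) and its dependencies are on the classpath, that the server is listening on the default port, and the object name is hypothetical.
```
import java.sql.{Connection, DriverManager}

object ThriftServerJdbcExample {
  def main(args: Array[String]): Unit = {
    // Register the HiveServer2-compatible JDBC driver shipped with Hive 0.12.
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    // In non-secure mode any username and a blank password will do (see the beeline notes above).
    val conn: Connection = DriverManager.getConnection("jdbc:hive2://localhost:10000", "user", "")
    try {
      val stmt = conn.createStatement()
      val rs = stmt.executeQuery("SHOW TABLES")
      while (rs.next()) {
        println(rs.getString(1))
      }
    } finally {
      conn.close()
    }
  }
}
```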
### Migration Guide for Shark Users
#### Reducer number
In Shark, the default reducer number is 1 and is controlled by the property `mapred.reduce.tasks`. Spark
SQL deprecates this property in favor of a new property, `spark.sql.shuffle.partitions`, whose default value
is 200. Users may customize this property via `SET`:
```
SET spark.sql.shuffle.partitions=10;
SELECT page, count(*) c FROM logs_last_month_cached
GROUP BY page ORDER BY c DESC LIMIT 10;
```
You may also put this property in `hive-site.xml` to override the default value.
For now, the `mapred.reduce.tasks` property is still recognized, and is converted to
`spark.sql.shuffle.partitions` automatically.
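The same property can also be set from application code through a `HiveContext`, using the `hql` method shown earlier in this guide. A minimal sketch, assuming an existing `SparkContext` named `sc` and the `logs_last_month_cached` table from the example above:
```
import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc)

// Lower the number of post-shuffle partitions before running the aggregation.
hiveContext.hql("SET spark.sql.shuffle.partitions=10")
val topPages = hiveContext.hql(
  "SELECT page, count(*) c FROM logs_last_month_cached GROUP BY page ORDER BY c DESC LIMIT 10")
topPages.collect().foreach(println)
```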
#### Caching
The `shark.cache` table property no longer exists, and tables whose names end with `_cached` are no
longer automatically cached. Instead, we provide `CACHE TABLE` and `UNCACHE TABLE` statements to
let users control table caching explicitly:
```
CACHE TABLE logs_last_month;
UNCACHE TABLE logs_last_month;
```
**NOTE:** `CACHE TABLE tbl` is lazy: it only marks table `tbl` as "needs to be cached if necessary",
but doesn't actually cache it until a query that touches `tbl` is executed. To force the table to be
cached, you may simply count the table immediately after executing `CACHE TABLE`:
```
CACHE TABLE logs_last_month;
SELECT COUNT(1) FROM logs_last_month;
```
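For applications that build their own `HiveContext`, the `SQLContext` API offers programmatic counterparts to these statements. A small sketch, assuming the `hiveContext` and the `logs_last_month` table from the examples above:
```
// Programmatic equivalents of CACHE TABLE / UNCACHE TABLE.
hiveContext.cacheTable("logs_last_month")
// Caching is lazy here as well; run a query to materialize the in-memory columns.
hiveContext.hql("SELECT COUNT(1) FROM logs_last_month").collect()
hiveContext.uncacheTable("logs_last_month")
```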
Several caching related features are not supported yet:
* User defined partition level cache eviction policy
* RDD reloading
* In-memory cache write through policy
### Compatibility with Apache Hive
#### Deploying in Existing Hive Warehouses
The Spark SQL Thrift JDBC server is designed to be "out of the box" compatible with existing Hive
installations. You do not need to modify your existing Hive Metastore or change the data placement
or partitioning of your tables.
#### Supported Hive Features
Spark SQL supports the vast majority of Hive features, such as:
* Hive query statements, including:
* `SELECT`
* `GROUP BY`
* `ORDER BY`
* `CLUSTER BY`
* `SORT BY`
* All Hive operators, including:
* Relational operators (`=`, `<=>`, `==`, `<>`, `<`, `>`, `>=`, `<=`, etc)
* Arithmetic operators (`+`, `-`, `*`, `/`, `%`, etc)
* Logical operators (`AND`, `&&`, `OR`, `||`, etc)
* Complex type constructors
* Mathematical functions (`sign`, `ln`, `cos`, etc)
* String functions (`instr`, `length`, `printf`, etc)
* User defined functions (UDF)
* User defined aggregation functions (UDAF)
* User defined serialization formats (SerDes)
* Joins
* `JOIN`
* `{LEFT|RIGHT|FULL} OUTER JOIN`
* `LEFT SEMI JOIN`
* `CROSS JOIN`
* Unions
* Sub queries
* `SELECT col FROM ( SELECT a + b AS col from t1) t2`
* Sampling
* Explain
* Partitioned tables
* All Hive DDL Functions, including:
* `CREATE TABLE`
* `CREATE TABLE AS SELECT`
* `ALTER TABLE`
* Most Hive Data types, including:
* `TINYINT`
* `SMALLINT`
* `INT`
* `BIGINT`
* `BOOLEAN`
* `FLOAT`
* `DOUBLE`
* `STRING`
* `BINARY`
* `TIMESTAMP`
* `ARRAY<>`
* `MAP<>`
* `STRUCT<>`
#### Unsupported Hive Functionality
Below is a list of Hive features that we don't support yet. Most of these features are rarely used
in Hive deployments.
**Major Hive Features**
* Tables with buckets: a bucket is the hash partitioning within a Hive table partition. Spark SQL
doesn't support buckets yet.
**Esoteric Hive Features**
* Tables with partitions using different input formats: In Spark SQL, all table partitions need to
have the same input format.
* Non-equi outer join: For the uncommon use case of using outer joins with non-equi join conditions
(e.g. condition "`key < 10`"), Spark SQL will output the wrong result for the `NULL` tuple.
* `UNIONTYPE`
* Unique join
* Single query multi insert
* Column statistics collecting: Spark SQL does not piggyback scans to collect column statistics at
the moment.
**Hive Input/Output Formats**
* File format for CLI: For results shown back to the CLI, Spark SQL only supports TextOutputFormat.
* Hadoop archive
**Hive Optimizations**
A handful of Hive optimizations are not yet included in Spark. Some of these (such as indexes) are
not necessary due to Spark SQL's in-memory computational model. Others are slotted for future
releases of Spark SQL.
* Block level bitmap indexes and virtual columns (used to build indexes)
* Automatically convert a join to map join: For joining a large table with multiple small tables,
Hive automatically converts the join into a map join. We are adding this auto conversion in the
next release.
* Automatically determine the number of reducers for joins and groupbys: Currently in Spark SQL, you
need to control the degree of parallelism post-shuffle using "SET
spark.sql.shuffle.partitions=[num_tasks];". We are going to add auto-setting of parallelism in the
next release.
* Meta-data only query: For queries that can be answered by using only meta data, Spark SQL still
launches tasks to compute the result.
* Skew data flag: Spark SQL does not follow the skew data flags in Hive.
* `STREAMTABLE` hint in join: Spark SQL does not follow the `STREAMTABLE` hint.
* Merge multiple small files for query results: if the result output contains multiple small files,
Hive can optionally merge the small files into fewer large files to avoid overflowing the HDFS
metadata. Spark SQL does not support that.
## Running the Spark SQL CLI
The Spark SQL CLI is a convenient tool to run the Hive metastore service in local mode and execute
queries input from the command line. Note that the Spark SQL CLI cannot talk to the Thrift JDBC server.
To start the Spark SQL CLI, run the following in the Spark directory:
./bin/spark-sql
Configuration of Hive is done by placing your `hive-site.xml` file in `conf/`.
You may run `./bin/spark-sql --help` for a complete list of all available
options.
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-examples_2.10</artifactId>
<properties>
<sbt.project.name>examples</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project Examples</name>
...
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-flume_2.10</artifactId>
<properties>
<sbt.project.name>streaming-flume</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project External Flume</name>
...
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.10</artifactId>
<properties>
<sbt.project.name>streaming-kafka</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project External Kafka</name>
...
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-mqtt_2.10</artifactId>
<properties>
<sbt.project.name>streaming-mqtt</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project External MQTT</name>
...
@@ -28,7 +28,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-twitter_2.10</artifactId>
<properties>
<sbt.project.name>streaming-twitter</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project External Twitter</name>
...