diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index c187869fdf121c550aef97e6487560d489d11b3c..04a8b1e1f395216ef2b817b5f4a5e57775b2e80e 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -103,8 +103,7 @@ sparkR.stop <- function() {
 #'                  list(spark.executor.memory="4g"),
 #'                  list(LD_LIBRARY_PATH="/directory of JVM libraries (libjvm.so) on workers/"),
 #'                  c("one.jar", "two.jar", "three.jar"),
-#'                  c("com.databricks:spark-avro_2.10:2.0.1",
-#'                    "com.databricks:spark-csv_2.10:1.3.0"))
+#'                  c("com.databricks:spark-avro_2.10:2.0.1"))
 #'}
 
 sparkR.init <- function(
diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/testthat/test_client.R
index 28276a020df916627114bbbd9a1900b6c0fef2bf..0cf25fe1dbf3955387a87e30e1095f360825bcbc 100644
--- a/R/pkg/inst/tests/testthat/test_client.R
+++ b/R/pkg/inst/tests/testthat/test_client.R
@@ -37,9 +37,7 @@ test_that("multiple packages don't produce a warning", {
 
 test_that("sparkJars sparkPackages as character vectors", {
   args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
-                                  c("com.databricks:spark-avro_2.10:2.0.1",
-                                    "com.databricks:spark-csv_2.10:1.3.0"))
+                                  c("com.databricks:spark-avro_2.10:2.0.1"))
   expect_match(args, "--jars one.jar,two.jar,three.jar")
-  expect_match(args,
-    "--packages com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.3.0")
+  expect_match(args, "--packages com.databricks:spark-avro_2.10:2.0.1")
 })
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 760534ae145fa4989b6709996a05cfc264a9adc6..9b5eaa1ec723241346ee81dc8a58874a891a5a82 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -115,13 +115,13 @@ head(df)
 SparkR supports operating on a variety of data sources through the `DataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources.
 
-The general method for creating DataFrames from data sources is `read.df`. This method takes in the `SQLContext`, the path for the file to load and the type of data source. SparkR supports reading JSON and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [CSV](http://spark-packages.org/package/databricks/spark-csv) and [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by
+The general method for creating DataFrames from data sources is `read.df`. This method takes in the `SQLContext`, the path for the file to load and the type of data source. SparkR supports reading JSON, CSV and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by
 specifying `--packages` with `spark-submit` or `sparkR` commands, or if creating context through `init` you can specify the packages with the `packages` argument.
 
 <div data-lang="r" markdown="1">
 {% highlight r %}
-sc <- sparkR.init(sparkPackages="com.databricks:spark-csv_2.11:1.0.3")
+sc <- sparkR.init(sparkPackages="com.databricks:spark-avro_2.11:2.0.1")
 sqlContext <- sparkRSQL.init(sc)
 {% endhighlight %}
 </div>
diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index 594bf49d601589623e6c32b805a854b54781398b..58a30135aa923ad6b75ba766d2ef1445acefc0e5 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -20,8 +20,7 @@
 # The data set is made up of 227,496 rows x 14 columns.
 # To run this example use
-# ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3
-#   examples/src/main/r/data-manipulation.R <path_to_csv>
+# ./bin/spark-submit examples/src/main/r/data-manipulation.R <path_to_csv>
 
 # Load SparkR library into your R session
 library(SparkR)
@@ -29,7 +28,7 @@ library(SparkR)
 args <- commandArgs(trailing = TRUE)
 
 if (length(args) != 1) {
-  print("Usage: data-manipulation.R <path-to-flights.csv")
+  print("Usage: data-manipulation.R <path-to-flights.csv>")
   print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv")
   q("no")
 }
@@ -53,7 +52,7 @@ SFO_df <- flights_df[flights_df$dest == "SFO", ]
 SFO_DF <- createDataFrame(sqlContext, SFO_df)
 
 # Directly create a SparkDataFrame from the source data
-flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
+flightsDF <- read.df(sqlContext, flightsCsvPath, source = "csv", header = "true")
 
 # Print the schema of this SparkDataFrame
 printSchema(flightsDF)
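
For context, here is a minimal end-to-end sketch of the native CSV path this change relies on. It is not part of the patch; it assumes the SparkR 1.x `SQLContext` API shown in the diff above and a local `flights.csv` file.

```r
# Sketch only: after this change the CSV reader ships with Spark, so no
# --packages flag is needed when launching via ./bin/spark-submit.
library(SparkR)

sc <- sparkR.init(appName = "csv-sketch")
sqlContext <- sparkRSQL.init(sc)

# "csv" now resolves to the built-in data source instead of com.databricks.spark.csv.
flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")
printSchema(flightsDF)
head(flightsDF)

sparkR.stop()
```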