From 5c9b3017279e4f20c364ae92a1fd059d4cfe9f4f Mon Sep 17 00:00:00 2001 From: Andrew Ray <ray.andrew@gmail.com> Date: Mon, 21 Aug 2017 23:08:27 -0700 Subject: [PATCH] [SPARK-21584][SQL][SPARKR] Update R method for summary to call new implementation ## What changes were proposed in this pull request? SPARK-21100 introduced a new `summary` method to the Scala/Java Dataset API that included expanded statistics (vs `describe`) and control over which statistics to compute. Currently in the R API `summary` acts as an alias for `describe`. This patch updates the R API to call the new `summary` method in the JVM that includes additional statistics and ability to select which to compute. This does not break the current interface as the present `summary` method does not take additional arguments like `describe` and the output was never meant to be used programmatically. ## How was this patch tested? Modified and additional unit tests. Author: Andrew Ray <ray.andrew@gmail.com> Closes #18786 from aray/summary-r. --- R/pkg/R/DataFrame.R | 44 ++++++++++++++++++++++++--- R/pkg/R/generics.R | 2 +- R/pkg/tests/fulltests/test_sparkSQL.R | 19 ++++++++---- 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5d6f9c0422..80526cdd4f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2930,7 +2930,7 @@ setMethod("saveAsTable", invisible(callJMethod(write, "saveAsTable", tableName)) }) -#' summary +#' describe #' #' Computes statistics for numeric and string columns. #' If no columns are given, this function computes statistics for all numerical or string columns. @@ -2941,7 +2941,7 @@ setMethod("saveAsTable", #' @return A SparkDataFrame. 
#' @family SparkDataFrame functions #' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method -#' @rdname summary +#' @rdname describe #' @name describe #' @export #' @examples @@ -2953,6 +2953,7 @@ setMethod("saveAsTable", #' describe(df, "col1") #' describe(df, "col1", "col2") #' } +#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute. #' @note describe(SparkDataFrame, character) since 1.4.0 setMethod("describe", signature(x = "SparkDataFrame", col = "character"), @@ -2962,7 +2963,7 @@ setMethod("describe", dataFrame(sdf) }) -#' @rdname summary +#' @rdname describe #' @name describe #' @aliases describe,SparkDataFrame-method #' @note describe(SparkDataFrame) since 1.4.0 @@ -2973,15 +2974,50 @@ setMethod("describe", dataFrame(sdf) }) +#' summary +#' +#' Computes specified statistics for numeric and string columns. Available statistics are: +#' \itemize{ +#' \item count +#' \item mean +#' \item stddev +#' \item min +#' \item max +#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75%") +#' } +#' If no statistics are given, this function computes count, mean, stddev, min, +#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max. +#' This function is meant for exploratory data analysis, as we make no guarantee about the +#' backward compatibility of the schema of the resulting Dataset. If you want to +#' programmatically compute summary statistics, use the \code{agg} function instead. +#' +#' @param object a SparkDataFrame to be summarized. +#' @param ... (optional) statistics to be computed for all columns. +#' @return A SparkDataFrame. 
#' @family SparkDataFrame functions #' @rdname summary #' @name summary #' @aliases summary,SparkDataFrame-method +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' path <- "path/to/file.json" +#' df <- read.json(path) +#' summary(df) +#' summary(df, "min", "25%", "75%", "max") +#' summary(select(df, "age", "height")) +#' } +#' @note summary(SparkDataFrame) since 1.5.0 +#' @note The statistics provided by \code{summary} were changed in 2.3.0. Use \link{describe} for previous defaults. +#' @seealso \link{describe} setMethod("summary", signature(object = "SparkDataFrame"), function(object, ...) { - describe(object) + statisticsList <- list(...) + sdf <- callJMethod(object@sdf, "summary", statisticsList) + dataFrame(sdf) }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index df91c35f7d..f0cc2dc3f1 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect") # @export setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") }) -#' @rdname summary +#' @rdname describe #' @export setGeneric("describe", function(x, col, ...) 
{ standardGeneric("describe") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index deb0e163a8..d477fc6a42 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2497,7 +2497,7 @@ test_that("read/write text files - compression option", { unlink(textPath) }) -test_that("describe() and summarize() on a DataFrame", { +test_that("describe() and summary() on a DataFrame", { df <- read.json(jsonPath) stats <- describe(df, "age") expect_equal(collect(stats)[1, "summary"], "count") @@ -2508,8 +2508,15 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats)[5, "age"], "30") stats2 <- summary(df) - expect_equal(collect(stats2)[4, "summary"], "min") - expect_equal(collect(stats2)[5, "age"], "30") + expect_equal(collect(stats2)[5, "summary"], "25%") + expect_equal(collect(stats2)[5, "age"], "30.0") + + stats3 <- summary(df, "min", "max", "55.1%") + + expect_equal(collect(stats3)[1, "summary"], "min") + expect_equal(collect(stats3)[2, "summary"], "max") + expect_equal(collect(stats3)[3, "summary"], "55.1%") + expect_equal(collect(stats3)[3, "age"], "30.0") # SPARK-16425: SparkR summary() fails on column of type logical df <- withColumn(df, "boolean", df$age == 30) @@ -2742,15 +2749,15 @@ test_that("attach() on a DataFrame", { expected_age <- data.frame(age = c(NA, 30, 19)) expect_equal(head(age), expected_age) stat <- summary(age) - expect_equal(collect(stat)[5, "age"], "30") + expect_equal(collect(stat)[8, "age"], "30") age <- age$age + 1 expect_is(age, "Column") rm(age) stat2 <- summary(age) - expect_equal(collect(stat2)[5, "age"], "30") + expect_equal(collect(stat2)[8, "age"], "30") detach("df") stat3 <- summary(df[, "age", drop = F]) - expect_equal(collect(stat3)[5, "age"], "30") + expect_equal(collect(stat3)[8, "age"], "30") expect_error(age) }) -- GitLab