Skip to content
Snippets Groups Projects
Commit a4669443 authored by qhuang's avatar qhuang Committed by Reynold Xin
Browse files

[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.

Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241
sum() has been implemented. (https://github.com/amplab-extras/SparkR-pkg/pull/242)

Phase 1 is now complete: mean, sd, and var have been implemented, but some parts still need improvement following the suggestions in https://issues.apache.org/jira/browse/SPARK-6841

Author: qhuang <qian.huang@intel.com>

Closes #5446 from hqzizania/R and squashes the following commits:

f283572 [qhuang] add test unit for describe()
2e74d5a [qhuang] add describe() DataFrame API
parent 51b3d41e
No related branches found
No related tags found
No related merge requests found
...@@ -13,6 +13,7 @@ exportMethods("cache", ...@@ -13,6 +13,7 @@ exportMethods("cache",
"collect", "collect",
"columns", "columns",
"count", "count",
"describe",
"distinct", "distinct",
"dtypes", "dtypes",
"except", "except",
......
...@@ -1276,3 +1276,40 @@ setMethod("saveAsTable", ...@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options) callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
}) })
#' describe
#'
#' Computes summary statistics for columns of a DataFrame.
#' If no columns are given, the names returned by \code{columns(x)} (that is,
#' all columns of the DataFrame) are used.
#'
#' @param x A DataFrame whose columns are summarized.
#' @param col The name of a column, given as a character string.
#' @param ... Names of further columns, collected together with \code{col}.
#' @return A DataFrame wrapping the result of the underlying \code{describe} call.
#' @rdname describe
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' path <- "path/to/file.json"
#' df <- jsonFile(sqlCtx, path)
#' describe(df)
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
setMethod("describe",
          signature(x = "DataFrame", col = "character"),
          function(x, col, ...) {
            # Gather the first column name and any extra ones into one list,
            # then convert that list for the call on the backing object.
            requested <- list(col, ...)
            jcols <- listToSeq(requested)
            dataFrame(callJMethod(x@sdf, "describe", jcols))
          })
#' @rdname describe
setMethod("describe",
          signature(x = "DataFrame"),
          function(x) {
            # No column names were supplied: use every name reported by
            # columns(x) and hand them to the same underlying "describe" call.
            everyCol <- c(columns(x))
            jcols <- listToSeq(as.list(everyCol))
            dataFrame(callJMethod(x@sdf, "describe", jcols))
          })
...@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) ...@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
#' @export #' @export
setGeneric("columns", function(x) {standardGeneric("columns") }) setGeneric("columns", function(x) {standardGeneric("columns") })
#' @rdname describe
#' @export
setGeneric(
  "describe",
  function(x, col, ...) {
    # Dispatch to the method registered for the classes of x and col.
    standardGeneric("describe")
  }
)
#' @rdname schema #' @rdname schema
#' @export #' @export
setGeneric("dtypes", function(x) { standardGeneric("dtypes") }) setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
......
...@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", { ...@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
expect_true(count(parquetDF) == count(df)*2) expect_true(count(parquetDF) == count(df)*2)
}) })
test_that("describe() on a DataFrame", {
  df <- jsonFile(sqlCtx, jsonPath)

  # Collect each describe() result once and reuse the local copy, instead of
  # calling collect() again for every single assertion.
  # NOTE(review): comparisons stay as `==` inside expect_true() because the
  # column types of the collected result are not visible here; confirm them
  # before switching to expect_equal().
  ageStats <- collect(describe(df, "age"))
  expect_true(ageStats[1, "summary"] == "count")
  expect_true(ageStats[2, "age"] == 24.5)
  expect_true(ageStats[3, "age"] == 5.5)

  # Without explicit column names, describe() is given every column of df.
  allStats <- collect(describe(df))
  expect_true(allStats[4, "name"] == "Andy")
  expect_true(allStats[5, "age"] == 30.0)
})
unlink(parquetPath) unlink(parquetPath)
unlink(jsonPath) unlink(jsonPath)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment