From 710c2b5dd2dc6b8d947303ad8dfae4539b63fe11 Mon Sep 17 00:00:00 2001 From: Hossein <hossein@databricks.com> Date: Fri, 31 Jul 2015 14:07:41 -0700 Subject: [PATCH] [SPARK-9324] [SPARK-9322] [SPARK-9321] [SPARKR] Some aliases for R-like functions in DataFrames Adds following aliases: * unique (distinct) * rbind (unionAll): accepts many DataFrames * nrow (count) * ncol * dim * names (columns): along with the replacement function to change names Author: Hossein <hossein@databricks.com> Closes #7764 from falaki/sparkR-alias and squashes the following commits: 56016f5 [Hossein] Updated R documentation 5e4a4d0 [Hossein] Removed extra code f51cbef [Hossein] Merge branch 'master' into sparkR-alias c1b88bd [Hossein] Moved setGeneric and other comments applied d9307f8 [Hossein] Added tests b5aa988 [Hossein] Added dim, ncol, nrow, names, rbind, and unique functions to DataFrames --- R/pkg/NAMESPACE | 6 +++ R/pkg/R/DataFrame.R | 90 ++++++++++++++++++++++++++++++++ R/pkg/R/generics.R | 4 ++ R/pkg/inst/tests/test_sparkSQL.R | 22 ++++++-- 4 files changed, 119 insertions(+), 3 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index a329e14f25..ff116cb1fb 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -29,6 +29,7 @@ exportMethods("arrange", "count", "crosstab", "describe", + "dim", "distinct", "dropna", "dtypes", @@ -45,11 +46,15 @@ exportMethods("arrange", "isLocal", "join", "limit", + "names", + "ncol", + "nrow", "orderBy", "mutate", "names", "persist", "printSchema", + "rbind", "registerTempTable", "rename", "repartition", @@ -66,6 +71,7 @@ exportMethods("arrange", "summarize", "take", "unionAll", + "unique", "unpersist", "where", "withColumn", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b31ad3729e..b4065d2944 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -255,6 +255,16 @@ setMethod("names", columns(x) }) +#' @rdname columns +setMethod("names<-", + signature(x = "DataFrame"), + function(x, value) { + if (!is.null(value)) { + sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value))) + dataFrame(sdf) + } + }) + #' Register Temporary Table #' #' Registers a DataFrame as a Temporary Table in the SQLContext @@ -473,6 +483,18 @@ setMethod("distinct", dataFrame(sdf) }) +#' @title Distinct rows in a DataFrame +# +#' @description Returns a new DataFrame containing distinct rows in this DataFrame +#' +#' @rdname unique +#' @aliases unique +setMethod("unique", + signature(x = "DataFrame"), + function(x) { + distinct(x) + }) + #' Sample #' #' Return a sampled subset of this DataFrame using a random seed. @@ -534,6 +556,58 @@ setMethod("count", callJMethod(x@sdf, "count") }) +#' @title Number of rows for a DataFrame +#' @description Returns number of rows in a DataFrames +#' +#' @name nrow +#' +#' @rdname nrow +#' @aliases count +setMethod("nrow", + signature(x = "DataFrame"), + function(x) { + count(x) + }) + +#' Returns the number of columns in a DataFrame +#' +#' @param x a SparkSQL DataFrame +#' +#' @rdname ncol +#' @export +#' @examples +#'\dontrun{ +#' sc <- sparkR.init() +#' sqlContext <- sparkRSQL.init(sc) +#' path <- "path/to/file.json" +#' df <- jsonFile(sqlContext, path) +#' ncol(df) +#' } +setMethod("ncol", + signature(x = "DataFrame"), + function(x) { + length(columns(x)) + }) + +#' Returns the dimentions (number of rows and columns) of a DataFrame +#' @param x a SparkSQL DataFrame +#' +#' @rdname dim +#' @export +#' @examples +#'\dontrun{ +#' sc <- sparkR.init() +#' sqlContext <- sparkRSQL.init(sc) +#' path <- "path/to/file.json" +#' df <- jsonFile(sqlContext, path) +#' dim(df) +#' } +setMethod("dim", + signature(x = "DataFrame"), + function(x) { + c(count(x), ncol(x)) + }) + #' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame. #' #' @param x A SparkSQL DataFrame @@ -1231,6 +1305,22 @@ setMethod("unionAll", dataFrame(unioned) }) +#' @title Union two or more DataFrames +# +#' @description Returns a new DataFrame containing rows of all parameters. +# +#' @rdname rbind +#' @aliases unionAll +setMethod("rbind", + signature(... = "DataFrame"), + function(x, ..., deparse.level = 1) { + if (nargs() == 3) { + unionAll(x, ...) + } else { + unionAll(x, Recall(..., deparse.level = 1)) + } + }) + #' Intersect #' #' Return a new DataFrame containing rows only in both this DataFrame diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index a3a121058e..71d1e348c4 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -669,3 +669,7 @@ setGeneric("upper", function(x) { standardGeneric("upper") }) #' @rdname glm #' @export setGeneric("glm") + +#' @rdname rbind +#' @export +setGeneric("rbind", signature = "...") diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 25f697314f..9faee8d59c 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -88,6 +88,9 @@ test_that("create DataFrame from RDD", { df <- createDataFrame(sqlContext, rdd, list("a", "b")) expect_is(df, "DataFrame") expect_equal(count(df), 10) + expect_equal(nrow(df), 10) + expect_equal(ncol(df), 2) + expect_equal(dim(df), c(10, 2)) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) @@ -491,7 +494,7 @@ test_that("head() and first() return the correct data", { expect_equal(nrow(testFirst), 1) }) -test_that("distinct() on DataFrames", { +test_that("distinct() and unique on DataFrames", { lines <- c("{\"name\":\"Michael\"}", "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Justin\", \"age\":19}", @@ -503,6 +506,10 @@ test_that("distinct() on DataFrames", { uniques <- distinct(df) expect_is(uniques, "DataFrame") expect_equal(count(uniques), 3) + + uniques2 <- unique(df) + expect_is(uniques2, "DataFrame") + expect_equal(count(uniques2), 3) }) test_that("sample on a DataFrame", { @@ -815,7 +822,7 @@ test_that("isLocal()", { expect_false(isLocal(df)) }) -test_that("unionAll(), except(), and intersect() on a DataFrame", { +test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", { df <- jsonFile(sqlContext, jsonPath) lines <- c("{\"name\":\"Bob\", \"age\":24}", @@ -830,6 +837,11 @@ test_that("unionAll(), except(), and intersect() on a DataFrame", { expect_equal(count(unioned), 6) expect_equal(first(unioned)$name, "Michael") + unioned2 <- arrange(rbind(unioned, df, df2), df$age) + expect_is(unioned2, "DataFrame") + expect_equal(count(unioned2), 12) + expect_equal(first(unioned2)$name, "Michael") + excepted <- arrange(except(df, df2), desc(df$age)) expect_is(unioned, "DataFrame") expect_equal(count(excepted), 2) @@ -853,7 +865,7 @@ test_that("withColumn() and withColumnRenamed()", { expect_equal(columns(newDF2)[1], "newerAge") }) -test_that("mutate() and rename()", { +test_that("mutate(), rename() and names()", { df <- jsonFile(sqlContext, jsonPath) newDF <- mutate(df, newAge = df$age + 2) expect_equal(length(columns(newDF)), 3) @@ -863,6 +875,10 @@ test_that("mutate() and rename()", { newDF2 <- rename(df, newerAge = df$age) expect_equal(length(columns(newDF2)), 2) expect_equal(columns(newDF2)[1], "newerAge") + + names(newDF2) <- c("newerName", "evenNewerAge") + expect_equal(length(names(newDF2)), 2) + expect_equal(names(newDF2)[1], "newerName") }) test_that("write.df() on DataFrame and works with parquetFile", { -- GitLab