diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 3b2fd733753eec52201145d9fabe25647369e710..890d15dfee49cb76b07ac57e62ce5bceec92e91e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -845,7 +845,7 @@ setMethod("ncol",
             length(columns(x))
           })
 
-#' Returns the dimentions (number of rows and columns) of a SparkDataFrame
+#' Returns the dimensions (number of rows and columns) of a SparkDataFrame
 #' @param x a SparkDataFrame
 #'
 #' @family SparkDataFrame functions
diff --git a/docs/sparkr.md b/docs/sparkr.md
index a0b4f9377642dfaf0168be7ee8e677a5e4047495..760534ae145fa4989b6709996a05cfc264a9adc6 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -141,7 +141,7 @@ head(people)
 # SparkR automatically infers the schema from the JSON file
 printSchema(people)
 # root
-#  |-- age: integer (nullable = true)
+#  |-- age: long (nullable = true)
 #  |-- name: string (nullable = true)
 
 {% endhighlight %}
@@ -195,7 +195,7 @@ df <- createDataFrame(sqlContext, faithful)
 
 # Get basic information about the DataFrame
 df
-## DataFrame[eruptions:double, waiting:double]
+## SparkDataFrame[eruptions:double, waiting:double]
 
 # Select only the "eruptions" column
 head(select(df, df$eruptions))
@@ -228,14 +228,13 @@ SparkR data frames support a number of commonly used functions to aggregate data
 # We use the `n` operator to count the number of times each waiting time appears
 head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
 ##  waiting count
-##1      81    13
-##2      60     6
-##3      68     1
+##1      70     4
+##2      67     1
+##3      69     2
 
 # We can also sort the output from the aggregation to get the most common waiting times
 waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
 head(arrange(waiting_counts, desc(waiting_counts$count)))
-
 ##  waiting count
 ##1      78    15
 ##2      83    14
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 77887f4ca36be0d53833d036fdd41b4ece3be65c..9a3db9c3f9b73c92488be6a8f2ae2e418f30e4cf 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -173,7 +173,7 @@ df.show()
 {% highlight r %}
 sqlContext <- SQLContext(sc)
 
-df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+df <- read.json(sqlContext, "examples/src/main/resources/people.json")
 
 # Displays the content of the DataFrame to stdout
 showDF(df)
@@ -366,7 +366,7 @@ In addition to simple column references and expressions, DataFrames also have a
 sqlContext <- sparkRSQL.init(sc)
 
 # Create the DataFrame
-df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+df <- read.json(sqlContext, "examples/src/main/resources/people.json")
 
 # Show the content of the DataFrame
 showDF(df)
@@ -889,8 +889,8 @@ df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
 <div data-lang="r" markdown="1">
 
 {% highlight r %}
-df <- loadDF(sqlContext, "people.parquet")
-saveDF(select(df, "name", "age"), "namesAndAges.parquet")
+df <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
+write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
 {% endhighlight %}
 
 </div>
@@ -939,8 +939,8 @@ df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
 
 {% highlight r %}
 
-df <- loadDF(sqlContext, "people.json", "json")
-saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
+df <- read.df(sqlContext, "examples/src/main/resources/people.json", "json")
+write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
 
 {% endhighlight %}
 
@@ -1138,19 +1138,15 @@ for teenName in teenNames.collect():
 schemaPeople # The DataFrame from the previous example.
 
 # DataFrames can be saved as Parquet files, maintaining the schema information.
-saveAsParquetFile(schemaPeople, "people.parquet")
+write.parquet(schemaPeople, "people.parquet")
 
 # Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
 # The result of loading a parquet file is also a DataFrame.
-parquetFile <- parquetFile(sqlContext, "people.parquet")
+parquetFile <- read.parquet(sqlContext, "people.parquet")
 
 # Parquet files can also be registered as tables and then used in SQL statements.
-registerTempTable(parquetFile, "parquetFile");
+registerTempTable(parquetFile, "parquetFile")
 teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
-teenNames <- map(teenagers, function(p) { paste("Name:", p$name)})
-for (teenName in collect(teenNames)) {
-  cat(teenName, "\n")
-}
 {% endhighlight %}
 
 </div>
@@ -1318,14 +1314,14 @@ df3.printSchema()
 # sqlContext from the previous example is used in this example.
 
 # Create a simple DataFrame, stored into a partition directory
-saveDF(df1, "data/test_table/key=1", "parquet", "overwrite")
+write.df(df1, "data/test_table/key=1", "parquet", "overwrite")
 
 # Create another DataFrame in a new partition directory,
 # adding a new column and dropping an existing column
-saveDF(df2, "data/test_table/key=2", "parquet", "overwrite")
+write.df(df2, "data/test_table/key=2", "parquet", "overwrite")
 
 # Read the partitioned table
-df3 <- loadDF(sqlContext, "data/test_table", "parquet", mergeSchema="true")
+df3 <- read.df(sqlContext, "data/test_table", "parquet", mergeSchema="true")
 printSchema(df3)
 
 # The final schema consists of all 3 columns in the Parquet files together
@@ -1612,7 +1608,7 @@ sqlContext <- sparkRSQL.init(sc)
 # The path can be either a single text file or a directory storing text files.
 path <- "examples/src/main/resources/people.json"
 # Create a DataFrame from the file(s) pointed to by path
-people <- jsonFile(sqlContext, path)
+people <- read.json(sqlContext, path)
 
 # The inferred schema can be visualized using the printSchema() method.
 printSchema(people)
diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index aa2336e300a9171e7f36f99dd9d8efe62a0b13c9..594bf49d601589623e6c32b805a854b54781398b 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -30,7 +30,7 @@ args <- commandArgs(trailing = TRUE)
 
 if (length(args) != 1) {
   print("Usage: data-manipulation.R <path-to-flights.csv")
-  print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
+  print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv")
   q("no")
 }
 
@@ -49,33 +49,33 @@ flights_df$date <- as.Date(flights_df$date)
 ## Filter flights whose destination is San Francisco and write to a local data frame
 SFO_df <- flights_df[flights_df$dest == "SFO", ]
 
-# Convert the local data frame into a SparkR DataFrame
+# Convert the local data frame into a SparkDataFrame
 SFO_DF <- createDataFrame(sqlContext, SFO_df)
 
-# Directly create a SparkR DataFrame from the source data
+# Directly create a SparkDataFrame from the source data
 flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
 
-# Print the schema of this Spark DataFrame
+# Print the schema of this SparkDataFrame
 printSchema(flightsDF)
 
-# Cache the DataFrame
+# Cache the SparkDataFrame
 cache(flightsDF)
 
-# Print the first 6 rows of the DataFrame
+# Print the first 6 rows of the SparkDataFrame
 showDF(flightsDF, numRows = 6) ## Or
 head(flightsDF)
 
-# Show the column names in the DataFrame
+# Show the column names in the SparkDataFrame
 columns(flightsDF)
 
-# Show the number of rows in the DataFrame
+# Show the number of rows in the SparkDataFrame
 count(flightsDF)
 
 # Select specific columns
 destDF <- select(flightsDF, "dest", "cancelled")
 
 # Using SQL to select columns of data
-# First, register the flights DataFrame as a table
+# First, register the flights SparkDataFrame as a table
 registerTempTable(flightsDF, "flightsTable")
 destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
 
@@ -95,11 +95,11 @@ if("magrittr" %in% rownames(installed.packages())) {
   library(magrittr)
 
   # Group the flights by date and then find the average daily delay
-  # Write the result into a DataFrame
+  # Write the result into a SparkDataFrame
   groupBy(flightsDF, flightsDF$date) %>%
     summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF
 
-  # Print the computed data frame
+  # Print the computed SparkDataFrame
   head(dailyDelayDF)
 }
 
diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R
index 62f60e57eebe6a41cfa47191ca65fce6c2669af9..436bac6aaf455e89a0b40b81858bd625e34cb929 100644
--- a/examples/src/main/r/dataframe.R
+++ b/examples/src/main/r/dataframe.R
@@ -24,7 +24,7 @@ sqlContext <- sparkRSQL.init(sc)
 # Create a simple local data.frame
 localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
 
-# Convert local data frame to a SparkR DataFrame
+# Convert local data frame to a SparkDataFrame
 df <- createDataFrame(sqlContext, localDF)
 
 # Print its schema
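
For anyone who wants to exercise the renamed SparkR entry points from this patch end to end, here is a small standalone sketch that strings together the calls used in the updated docs. It is illustrative only and not part of the patch: it assumes the 1.6/2.0-era sqlContext-style API shown above and that the script is run from the root of a Spark checkout so the bundled example resources resolve; the output paths ("people.parquet", "namesAndFavColors.parquet") and the app name are arbitrary.

library(SparkR)

# Initialize SparkR the same way the updated examples do.
sc <- sparkR.init(appName = "SparkR-reader-writer-sketch")
sqlContext <- sparkRSQL.init(sc)

# read.json replaces the deprecated jsonFile.
people <- read.json(sqlContext, "examples/src/main/resources/people.json")
printSchema(people)  # age is inferred as long, matching the corrected sparkr.md output

# Generic load/save: read.df / write.df replace loadDF / saveDF.
users <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
write.df(select(users, "name", "favorite_color"), "namesAndFavColors.parquet")

# Parquet-specific helpers: write.parquet / read.parquet replace
# saveAsParquetFile / parquetFile.
write.parquet(people, "people.parquet")
parquetDF <- read.parquet(sqlContext, "people.parquet")

# Registered temporary tables still work with sql() as before.
registerTempTable(parquetDF, "parquetFile")
teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
head(teenagers)

sparkR.stop()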