Skip to content
Snippets Groups Projects
Commit 6d0411b4 authored by Cheng Hao's avatar Cheng Hao Committed by Reynold Xin
Browse files

[SQL][Minor] Update the DataFrame API for encode/decode

This is a follow-up to #6843.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7230 from chenghao-intel/str_funcs2_followup and squashes the following commits:

52cc553 [Cheng Hao] update the code as comment
parent a0cb111b
No related branches found
No related tags found
No related merge requests found
...@@ -392,12 +392,13 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput ...@@ -392,12 +392,13 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
/** /**
* Decodes the first argument into a String using the provided character set * Decodes the first argument into a String using the provided character set
* (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
* If either argument is null, the result will also be null. (As of Hive 0.12.0.). * If either argument is null, the result will also be null.
*/ */
case class Decode(bin: Expression, charset: Expression) extends Expression with ExpectsInputTypes { case class Decode(bin: Expression, charset: Expression)
override def children: Seq[Expression] = bin :: charset :: Nil extends BinaryExpression with ExpectsInputTypes {
override def foldable: Boolean = bin.foldable && charset.foldable
override def nullable: Boolean = bin.nullable || charset.nullable override def left: Expression = bin
override def right: Expression = charset
override def dataType: DataType = StringType override def dataType: DataType = StringType
override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType) override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType)
...@@ -420,13 +421,13 @@ case class Decode(bin: Expression, charset: Expression) extends Expression with ...@@ -420,13 +421,13 @@ case class Decode(bin: Expression, charset: Expression) extends Expression with
/** /**
* Encodes the first argument into a BINARY using the provided character set * Encodes the first argument into a BINARY using the provided character set
* (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
* If either argument is null, the result will also be null. (As of Hive 0.12.0.) * If either argument is null, the result will also be null.
*/ */
case class Encode(value: Expression, charset: Expression) case class Encode(value: Expression, charset: Expression)
extends Expression with ExpectsInputTypes { extends BinaryExpression with ExpectsInputTypes {
override def children: Seq[Expression] = value :: charset :: Nil
override def foldable: Boolean = value.foldable && charset.foldable override def left: Expression = value
override def nullable: Boolean = value.nullable || charset.nullable override def right: Expression = charset
override def dataType: DataType = BinaryType override def dataType: DataType = BinaryType
override def inputTypes: Seq[DataType] = Seq(StringType, StringType) override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
......
...@@ -1666,18 +1666,19 @@ object functions { ...@@ -1666,18 +1666,19 @@ object functions {
* @group string_funcs * @group string_funcs
* @since 1.5.0 * @since 1.5.0
*/ */
def encode(value: Column, charset: Column): Column = Encode(value.expr, charset.expr) def encode(value: Column, charset: String): Column = Encode(value.expr, lit(charset).expr)
/** /**
* Computes the first argument into a binary from a string using the provided character set * Computes the first argument into a binary from a string using the provided character set
* (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
* If either argument is null, the result will also be null. * If either argument is null, the result will also be null.
* NOTE: charset represents the string value of the character set, not the column name.
* *
* @group string_funcs * @group string_funcs
* @since 1.5.0 * @since 1.5.0
*/ */
def encode(columnName: String, charsetColumnName: String): Column = def encode(columnName: String, charset: String): Column =
encode(Column(columnName), Column(charsetColumnName)) encode(Column(columnName), charset)
/** /**
* Computes the first argument into a string from a binary using the provided character set * Computes the first argument into a string from a binary using the provided character set
...@@ -1687,18 +1688,19 @@ object functions { ...@@ -1687,18 +1688,19 @@ object functions {
* @group string_funcs * @group string_funcs
* @since 1.5.0 * @since 1.5.0
*/ */
def decode(value: Column, charset: Column): Column = Decode(value.expr, charset.expr) def decode(value: Column, charset: String): Column = Decode(value.expr, lit(charset).expr)
/** /**
* Computes the first argument into a string from a binary using the provided character set * Computes the first argument into a string from a binary using the provided character set
* (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
* If either argument is null, the result will also be null. * If either argument is null, the result will also be null.
* NOTE: charset represents the string value of the character set, not the column name.
* *
* @group string_funcs * @group string_funcs
* @since 1.5.0 * @since 1.5.0
*/ */
def decode(columnName: String, charsetColumnName: String): Column = def decode(columnName: String, charset: String): Column =
decode(Column(columnName), Column(charsetColumnName)) decode(Column(columnName), charset)
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
......
...@@ -261,11 +261,15 @@ class DataFrameFunctionsSuite extends QueryTest { ...@@ -261,11 +261,15 @@ class DataFrameFunctionsSuite extends QueryTest {
// non ascii characters are not allowed in the code, so we disable the scalastyle here. // non ascii characters are not allowed in the code, so we disable the scalastyle here.
val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c") val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
checkAnswer( checkAnswer(
df.select(encode($"a", $"b"), encode("a", "b"), decode($"c", $"b"), decode("c", "b")), df.select(
encode($"a", "utf-8"),
encode("a", "utf-8"),
decode($"c", "utf-8"),
decode("c", "utf-8")),
Row(bytes, bytes, "大千世界", "大千世界")) Row(bytes, bytes, "大千世界", "大千世界"))
checkAnswer( checkAnswer(
df.selectExpr("encode(a, b)", "decode(c, b)"), df.selectExpr("encode(a, 'utf-8')", "decode(c, 'utf-8')"),
Row(bytes, "大千世界")) Row(bytes, "大千世界"))
// scalastyle:on // scalastyle:on
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment