diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index dac842c0ce8c026ae7b35d15d1ea29fdeb94e0f2..716b16fdc9530637222e091d74af953eb49bc22a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1756,6 +1756,9 @@ class UserDefinedFunction(object): @since(1.3) def udf(f, returnType=StringType()): """Creates a :class:`Column` expression representing a user defined function (UDF). + Note that the user-defined functions must be deterministic. Due to optimization, + duplicate invocations may be eliminated or the function may even be invoked more times than + it is present in the query. >>> from pyspark.sql.types import IntegerType >>> slen = udf(lambda s: len(s), IntegerType()) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 0038cf65e2993ea772e5529045fd1ac40289ac0f..21390644bc0b69cc7cba4815ef0f754d5c0627a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.types.DataType /** * User-defined function. + * Note that the user-defined functions must be deterministic. * @param function The user defined scala function to run. * Note that if you use primitive parameters, you are not able to check if it is * null or not, and the UDF will return null for you if the primitive input is diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 14d12d30bc0b377dcd70f361aa5fa47013008f36..7013e316ead8380e79b6c30e0a9230c063f0d2ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -199,6 +199,9 @@ class SQLContext private[sql]( /** * A collection of methods for registering user-defined functions (UDF). + * Note that the user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. * * The following example registers a Scala closure as UDF: * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index f697769bdcdb5f35bca4499e588e213a1e26b9ba..5c87c844185c5359ce4d874be6f30827a88df9d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -145,6 +145,9 @@ class SparkSession private( /** * A collection of methods for registering user-defined functions (UDF). + * Note that the user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. * * The following example registers a Scala closure as UDF: * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 3a043dcc6af227ce7825a138754dfe8da1d47a6d..b006236481a29712bb62a4cae11aa5c2d91c74f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.types.DataType /** * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. + * Note that the user-defined functions must be deterministic. * * @since 1.3.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index bd35d19aa20bb8e13f71d7d6e8ac3f2969c8bd14..49fdec57558e8a78b6779a3b21896b53254c7a12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -25,6 +25,9 @@ import org.apache.spark.sql.types.DataType /** * A user-defined function. To create one, use the `udf` functions in [[functions]]. + * Note that the user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. * As an example: * {{{ * // Defined a UDF that returns true or false based on some numeric score. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index 939b9195cae998993f5df40fe2fc2bb51c32bf6a..c9cc2ba04a4131878e25b98bef8f95081c8c55f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -100,6 +100,7 @@ private[sql] class SessionState(sparkSession: SparkSession) { /** * Interface exposed to the user for registering user-defined functions. + * Note that the user-defined functions must be deterministic. */ lazy val udf: UDFRegistration = new UDFRegistration(functionRegistry)