From f0169a1c6a1ac06045d57f8aaa2c841bb39e23ac Mon Sep 17 00:00:00 2001
From: zero323 <zero323@users.noreply.github.com>
Date: Mon, 1 May 2017 09:43:32 -0700
Subject: [PATCH] [SPARK-20290][MINOR][PYTHON][SQL] Add PySpark wrapper for
 eqNullSafe

## What changes were proposed in this pull request?

Adds Python bindings for `Column.eqNullSafe`

## How was this patch tested?

Manual tests, existing unit tests, doc build.

Author: zero323 <zero323@users.noreply.github.com>

Closes #17605 from zero323/SPARK-20290.
---
 python/pyspark/sql/column.py | 55 ++++++++++++++++++++++++++++++++++++
 python/pyspark/sql/tests.py  |  2 +-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index b8df37f251..e753ed402c 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -171,6 +171,61 @@ class Column(object):
     __ge__ = _bin_op("geq")
     __gt__ = _bin_op("gt")
 
+    _eqNullSafe_doc = """
+    Equality test that is safe for null values.
+
+    :param other: a value or :class:`Column`
+
+    >>> from pyspark.sql import Row
+    >>> df1 = spark.createDataFrame([
+    ...     Row(id=1, value='foo'),
+    ...     Row(id=2, value=None)
+    ... ])
+    >>> df1.select(
+    ...     df1['value'] == 'foo',
+    ...     df1['value'].eqNullSafe('foo'),
+    ...     df1['value'].eqNullSafe(None)
+    ... ).show()
+    +-------------+---------------+----------------+
+    |(value = foo)|(value <=> foo)|(value <=> NULL)|
+    +-------------+---------------+----------------+
+    |         true|           true|           false|
+    |         null|          false|            true|
+    +-------------+---------------+----------------+
+    >>> df2 = spark.createDataFrame([
+    ...     Row(value = 'bar'),
+    ...     Row(value = None)
+    ... ])
+    >>> df1.join(df2, df1["value"] == df2["value"]).count()
+    0
+    >>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()
+    1
+    >>> df2 = spark.createDataFrame([
+    ...     Row(id=1, value=float('NaN')),
+    ...     Row(id=2, value=42.0),
+    ...     Row(id=3, value=None)
+    ... ])
+    >>> df2.select(
+    ...     df2['value'].eqNullSafe(None),
+    ...     df2['value'].eqNullSafe(float('NaN')),
+    ...     df2['value'].eqNullSafe(42.0)
+    ... ).show()
+    +----------------+---------------+----------------+
+    |(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+    +----------------+---------------+----------------+
+    |           false|           true|           false|
+    |           false|          false|            true|
+    |            true|          false|           false|
+    +----------------+---------------+----------------+
+
+    .. note:: Unlike Pandas, PySpark doesn't consider NaN values to be NULL.
+       See the `NaN Semantics`_ for details.
+    .. _NaN Semantics:
+       https://spark.apache.org/docs/latest/sql-programming-guide.html#nan-semantics
+    .. versionadded:: 2.3.0
+    """
+    eqNullSafe = _bin_op("eqNullSafe", _eqNullSafe_doc)
+
     # `and`, `or`, `not` cannot be overloaded in Python,
     # so use bitwise operators as boolean operators
     __and__ = _bin_op('and')
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index cd92148dfa..ce4abf8fb7 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -982,7 +982,7 @@ class SQLTests(ReusedPySparkTestCase):
         cbool = (ci & ci), (ci | ci), (~ci)
         self.assertTrue(all(isinstance(c, Column) for c in cbool))
         css = cs.contains('a'), cs.like('a'), cs.rlike('a'), cs.asc(), cs.desc(),\
-            cs.startswith('a'), cs.endswith('a')
+            cs.startswith('a'), cs.endswith('a'), ci.eqNullSafe(cs)
         self.assertTrue(all(isinstance(c, Column) for c in css))
         self.assertTrue(isinstance(ci.cast(LongType()), Column))
         self.assertRaisesRegexp(ValueError,
--
GitLab
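
For reference, the new binding can also be exercised outside the doctest environment with a short standalone script such as the sketch below. This is only an illustrative usage example, not part of the patch: it assumes a local PySpark 2.3+ installation (the release that introduces `Column.eqNullSafe`) and creates its own `SparkSession`, whereas the doctests above rely on a pre-created `spark` fixture.

```python
# Illustrative sketch only: reproduces the join behaviour shown in the new doctests.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("eqNullSafe-demo").getOrCreate()

df1 = spark.createDataFrame([Row(id=1, value='foo'), Row(id=2, value=None)])
df2 = spark.createDataFrame([Row(value='bar'), Row(value=None)])

# Plain `==` evaluates to NULL when either side is NULL, so the NULL rows never match.
print(df1.join(df2, df1['value'] == df2['value']).count())           # 0

# eqNullSafe (Spark SQL's `<=>`) treats two NULLs as equal, so the NULL rows do match.
print(df1.join(df2, df1['value'].eqNullSafe(df2['value'])).count())  # 1

spark.stop()
```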