Skip to content
Snippets Groups Projects
Commit ccdb0522 authored by Tarek Auel's avatar Tarek Auel Committed by Davies Liu
Browse files

[SPARK-8727] [SQL] Missing python api; md5, log2

Jira: https://issues.apache.org/jira/browse/SPARK-8727

Author: Tarek Auel <tarek.auel@gmail.com>
Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7114 from tarekauel/missing-python and squashes the following commits:

ef4c61b [Tarek Auel] [SPARK-8727] revert dataframe change
4029d4d [Tarek Auel] removed dataframe pi and e unit test
66f0d2b [Tarek Auel] removed pi and e from python api and dataframe api; added _to_java_column(col) for strlen
4d07318 [Tarek Auel] fixed python unit test
45f2bee [Tarek Auel] fixed result of pi and e
c39f47b [Tarek Auel] add python api
bd50a3a [Tarek Auel] add missing python functions
parent 8133125c
No related branches found
No related tags found
No related merge requests found
......@@ -39,12 +39,15 @@ __all__ = [
'coalesce',
'countDistinct',
'explode',
'log2',
'md5',
'monotonicallyIncreasingId',
'rand',
'randn',
'sha1',
'sha2',
'sparkPartitionId',
'strlen',
'struct',
'udf',
'when']
......@@ -320,6 +323,19 @@ def explode(col):
return Column(jc)
@ignore_unicode_prefix
@since(1.5)
def md5(col):
    """Computes the MD5 digest of the given column and returns it as a 32 character hex string.

    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
    [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]
    """
    sc = SparkContext._active_spark_context
    # Delegate to the JVM-side org.apache.spark.sql.functions.md5 and wrap the result.
    return Column(sc._jvm.functions.md5(_to_java_column(col)))
@since(1.4)
def monotonicallyIncreasingId():
"""A column that generates monotonically increasing 64-bit integers.
......@@ -365,6 +381,19 @@ def randn(seed=None):
return Column(jc)
@ignore_unicode_prefix
@since(1.5)
def sha1(col):
    """Computes the SHA-1 digest of the given column and returns it as a hex string.

    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
    """
    sc = SparkContext._active_spark_context
    # Delegate to the JVM-side org.apache.spark.sql.functions.sha1 and wrap the result.
    return Column(sc._jvm.functions.sha1(_to_java_column(col)))
@ignore_unicode_prefix
@since(1.5)
def sha2(col, numBits):
......@@ -383,19 +412,6 @@ def sha2(col, numBits):
return Column(jc)
@ignore_unicode_prefix
@since(1.5)
def sha1(col):
    """Returns the hex string result of SHA-1 applied to the given column.

    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
    """
    ctx = SparkContext._active_spark_context
    # Build the Java column expression via the JVM gateway, then wrap it.
    java_col = ctx._jvm.functions.sha1(_to_java_column(col))
    return Column(java_col)
@since(1.4)
def sparkPartitionId():
"""A column for partition ID of the Spark task.
......@@ -409,6 +425,18 @@ def sparkPartitionId():
return Column(sc._jvm.functions.sparkPartitionId())
@ignore_unicode_prefix
@since(1.5)
def strlen(col):
    """Returns the length of a string expression as an integer column.

    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(strlen('a').alias('length')).collect()
    [Row(length=3)]
    """
    sc = SparkContext._active_spark_context
    # Build the Java column expression first, then wrap it in a Python Column.
    jc = sc._jvm.functions.strlen(_to_java_column(col))
    return Column(jc)
@ignore_unicode_prefix
@since(1.4)
def struct(*cols):
......@@ -471,6 +499,17 @@ def log(arg1, arg2=None):
return Column(jc)
@since(1.5)
def log2(col):
    """Computes the base-2 logarithm of the given column.

    >>> sqlContext.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect()
    [Row(log2=2.0)]
    """
    sc = SparkContext._active_spark_context
    # Build the Java column expression first, then wrap it in a Python Column.
    jc = sc._jvm.functions.log2(_to_java_column(col))
    return Column(jc)
@since(1.4)
def lag(col, count=1, default=None):
"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment