From f18fd05b513b136363c94adb3e5b841f8bf48134 Mon Sep 17 00:00:00 2001
From: Kan Zhang <kzhang@apache.org>
Date: Wed, 21 May 2014 13:26:53 -0700
Subject: [PATCH] [SPARK-1519] Support minPartitions param of wholeTextFiles()
 in PySpark

Author: Kan Zhang <kzhang@apache.org>

Closes #697 from kanzhang/SPARK-1519 and squashes the following commits:

4f8d1ed [Kan Zhang] [SPARK-1519] Support minPartitions param of wholeTextFiles() in PySpark
---
 python/pyspark/context.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index cac133d0fc..c9ff82d23b 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -211,6 +211,13 @@ class SparkContext(object):
         """
         return self._jsc.sc().defaultParallelism()
 
+    @property
+    def defaultMinPartitions(self):
+        """
+        Default min number of partitions for Hadoop RDDs when not given by user
+        """
+        return self._jsc.sc().defaultMinPartitions()
+
     def __del__(self):
         self.stop()
 
@@ -264,7 +271,7 @@ class SparkContext(object):
         return RDD(self._jsc.textFile(name, minPartitions), self,
                    UTF8Deserializer())
 
-    def wholeTextFiles(self, path):
+    def wholeTextFiles(self, path, minPartitions=None):
         """
         Read a directory of text files from HDFS, a local file system
         (available on all nodes), or any  Hadoop-supported file system
@@ -300,7 +307,8 @@ class SparkContext(object):
         >>> sorted(textFiles.collect())
         [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')]
         """
-        return RDD(self._jsc.wholeTextFiles(path), self,
+        minPartitions = minPartitions or self.defaultMinPartitions
+        return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,
                    PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
 
     def _checkpointFile(self, name, input_deserializer):
-- 
GitLab