From f18fd05b513b136363c94adb3e5b841f8bf48134 Mon Sep 17 00:00:00 2001 From: Kan Zhang <kzhang@apache.org> Date: Wed, 21 May 2014 13:26:53 -0700 Subject: [PATCH] [SPARK-1519] Support minPartitions param of wholeTextFiles() in PySpark Author: Kan Zhang <kzhang@apache.org> Closes #697 from kanzhang/SPARK-1519 and squashes the following commits: 4f8d1ed [Kan Zhang] [SPARK-1519] Support minPartitions param of wholeTextFiles() in PySpark --- python/pyspark/context.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index cac133d0fc..c9ff82d23b 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -211,6 +211,13 @@ class SparkContext(object): """ return self._jsc.sc().defaultParallelism() + @property + def defaultMinPartitions(self): + """ + Default min number of partitions for Hadoop RDDs when not given by user + """ + return self._jsc.sc().defaultMinPartitions() + def __del__(self): self.stop() @@ -264,7 +271,7 @@ class SparkContext(object): return RDD(self._jsc.textFile(name, minPartitions), self, UTF8Deserializer()) - def wholeTextFiles(self, path): + def wholeTextFiles(self, path, minPartitions=None): """ Read a directory of text files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system @@ -300,7 +307,8 @@ class SparkContext(object): >>> sorted(textFiles.collect()) [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] """ - return RDD(self._jsc.wholeTextFiles(path), self, + minPartitions = minPartitions or self.defaultMinPartitions + return RDD(self._jsc.wholeTextFiles(path, minPartitions), self, PairDeserializer(UTF8Deserializer(), UTF8Deserializer())) def _checkpointFile(self, name, input_deserializer): -- GitLab