From 75663b57f90bb173f0c6c288944ec568c4719b2a Mon Sep 17 00:00:00 2001
From: Davies Liu <davies.liu@gmail.com>
Date: Sat, 26 Jul 2014 01:07:08 -0700
Subject: [PATCH] [SPARK-2652] [PySpark] Turning some default configs for PySpark

Add several default configs for PySpark, related to serialization in JVM.

spark.serializer = org.apache.spark.serializer.KryoSerializer
spark.serializer.objectStreamReset = 100
spark.rdd.compress = True

This will help to reduce the memory usage during RDD.partitionBy()

Author: Davies Liu <davies.liu@gmail.com>

Closes #1568 from davies/conf and squashes the following commits:

cd316f1 [Davies Liu] remove duplicated line
f71a355 [Davies Liu] rebase to master, add spark.rdd.compress = True
8f63f45 [Davies Liu] Merge branch 'master' into conf
8bc9f08 [Davies Liu] fix unittest
c04a83d [Davies Liu] some default configs for PySpark
---
 python/pyspark/context.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index bdf14ea0ee..e8ac9895cf 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -37,6 +37,15 @@ from pyspark.rdd import RDD
 
 from py4j.java_collections import ListConverter
 
+# These are special default configs for PySpark, they will overwrite
+# the default ones for Spark if they are not configured by user.
+DEFAULT_CONFIGS = {
+    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
+    "spark.serializer.objectStreamReset": 100,
+    "spark.rdd.compress": True,
+}
+
+
 class SparkContext(object):
     """
     Main entry point for Spark functionality. A SparkContext represents the
@@ -101,7 +110,7 @@ class SparkContext(object):
         else:
             self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                 batchSize)
-        self._conf.setIfMissing("spark.rdd.compress", "true")
+
         # Set any parameters passed directly to us on the conf
         if master:
             self._conf.setMaster(master)
@@ -112,6 +121,8 @@ class SparkContext(object):
         if environment:
             for key, value in environment.iteritems():
                 self._conf.setExecutorEnv(key, value)
+        for key, value in DEFAULT_CONFIGS.items():
+            self._conf.setIfMissing(key, value)
 
         # Check that we have at least the required parameters
         if not self._conf.contains("spark.master"):
--
GitLab
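
A minimal usage sketch (not part of the patch): because DEFAULT_CONFIGS is applied with setIfMissing(), any value the user sets explicitly on a SparkConf still wins, and only unset keys fall back to these PySpark defaults. The JavaSerializer override and the app name below are illustrative; sc._conf is the internal SparkConf attribute that the patch itself manipulates.

    from pyspark import SparkConf, SparkContext

    # Explicitly configure one of the keys; the remaining DEFAULT_CONFIGS
    # entries (spark.serializer.objectStreamReset, spark.rdd.compress)
    # are filled in by setIfMissing() because they were left unset.
    conf = SparkConf().set("spark.serializer",
                           "org.apache.spark.serializer.JavaSerializer")
    sc = SparkContext(master="local", appName="conf-demo", conf=conf)

    print(sc._conf.get("spark.serializer"))    # user's value is kept
    print(sc._conf.get("spark.rdd.compress"))  # PySpark default applied
    sc.stop()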