diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala
index c7ecd6125d8606230784c0516779e624800b6522..3e0d484b74cfedb17482ff60839d3a7c2359b0a4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala
@@ -57,15 +57,21 @@ private[sql] abstract class BucketedOutputWriterFactory extends OutputWriterFact
 
 private[sql] object BucketingUtils {
   // The file name of bucketed data should have 3 parts:
-  //   1. some other information in the head of file name, ends with `-`
-  //   2. bucket id part, some numbers
+  //   1. some other information in the head of file name
+  //   2. bucket id part, some numbers, starts with `_`
+  //      * The other-information part may use `-` as a separator and may end with numbers,
+  //        e.g. a normal parquet file without bucketing may have the name
+  //        part-r-00000-2dd664f9-d2c4-4ffe-878f-431234567891.gz.parquet, and we would mistakenly
+  //        treat `431234567891` as the bucket id. So here we pick `_` as the separator.
   //   3. optional file extension part, in the tail of file name, starts with `.`
   // An example of a bucketed parquet file name with bucket id 3:
-  //   part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb-00003.gz.parquet
-  private val bucketedFileName = """.*-(\d+)(?:\..*)?$""".r
+  //   part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet
+  private val bucketedFileName = """.*_(\d+)(?:\..*)?$""".r
 
   def getBucketId(fileName: String): Option[Int] = fileName match {
     case bucketedFileName(bucketId) => Some(bucketId.toInt)
     case other => None
   }
+
+  def bucketIdToString(id: Int): String = f"_$id%05d"
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala
index 20c60b9c43e10420f87b884fa9fa47af64b09a35..31c5620c9a80ec052e257ce557e2e6fdef6f1200 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala
@@ -193,7 +193,7 @@ private[json] class JsonOutputWriter(
         val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID")
         val taskAttemptId = context.getTaskAttemptID
         val split = taskAttemptId.getTaskID.getId
-        val bucketString = bucketId.map(id => f"-$id%05d").getOrElse("")
+        val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("")
         new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension")
       }
     }.getRecordWriter(context)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
index 30ddec686c92154d9cda79e1cc1f590b95394c29..b460ec1d2604788cb8a5dc0f52a52bb696de4668 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
@@ -90,7 +90,7 @@ private[sql] class ParquetOutputWriter(
         val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID")
         val taskAttemptId = context.getTaskAttemptID
         val split = taskAttemptId.getTaskID.getId
-        val bucketString = bucketId.map(id => f"-$id%05d").getOrElse("")
+        val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("")
         new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension")
       }
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 40409169b095a58160bc942f42fdfa1a948af1fe..800823febab267ec76f4e1eb5fc1ae648e3f979f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -103,7 +103,7 @@ private[orc] class OrcOutputWriter(
     val uniqueWriteJobId = conf.get("spark.sql.sources.writeJobUUID")
     val taskAttemptId = context.getTaskAttemptID
     val partition = taskAttemptId.getTaskID.getId
-    val bucketString = bucketId.map(id => f"-$id%05d").getOrElse("")
+    val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("")
    val filename = f"part-r-$partition%05d-$uniqueWriteJobId$bucketString.orc"
 
     new OrcOutputFormat().getRecordWriter(
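
For reference, a minimal round-trip check of the two helpers under the new `_` separator. This snippet is illustrative and not part of the patch; since `BucketingUtils` is `private[sql]`, it would have to live under the `org.apache.spark.sql` package (e.g. in a test suite):

    // Illustrative only: exercises the helpers touched by this patch.
    import org.apache.spark.sql.execution.datasources.BucketingUtils

    // bucketIdToString pads the id to five digits behind a `_` separator.
    assert(BucketingUtils.bucketIdToString(3) == "_00003")

    // getBucketId recovers the id from a bucketed file name...
    assert(BucketingUtils.getBucketId(
      "part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet") == Some(3))

    // ...and no longer misreads the trailing digits of a non-bucketed name,
    // because the regex now requires a `_` before the bucket id.
    assert(BucketingUtils.getBucketId(
      "part-r-00000-2dd664f9-d2c4-4ffe-878f-431234567891.gz.parquet").isEmpty)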