MongoSpark Error 28799

Exception in thread "main" com.mongodb.MongoCommandException: Command failed with error 28799: 'Received error in response from 192.168.12.161:27018: { $err: "$sample stage could not find a non-duplicate document after 100 while using a random cursor. This is likely a sporadic failure, please try again.", code: 28799 }' on server 192.168.12.161:27017. The full response is { "ok" : 0.0, "errmsg" : "Received error in response from 192.168.12.161:27018: { $err: \"$sample stage could not find a non-duplicate document after 100 while using a random cursor. This is likely a sporadic failure, please try again.\", code: 28799 }", "code" : 28799, "codeName" : "Location28799" }
    at com.mongodb.connection.ProtocolHelper.getCommandFailureException(ProtocolHelper.java:115)
    at com.mongodb.connection.CommandProtocol.execute(CommandProtocol.java:114)
    at com.mongodb.connection.DefaultServer$DefaultServerProtocolExecutor.execute(DefaultServer.java:168)
    at com.mongodb.connection.DefaultServerConnection.executeProtocol(DefaultServerConnection.java:289)
    at com.mongodb.connection.DefaultServerConnection.command(DefaultServerConnection.java:176)
    at com.mongodb.operation.CommandOperationHelper.executeWrappedCommandProtocol(CommandOperationHelper.java:216)
    at com.mongodb.operation.CommandOperationHelper.executeWrappedCommandProtocol(CommandOperationHelper.java:207)
    at com.mongodb.operation.CommandOperationHelper.executeWrappedCommandProtocol(CommandOperationHelper.java:113)
    at com.mongodb.operation.AggregateOperation$1.call(AggregateOperation.java:257)
    at com.mongodb.operation.AggregateOperation$1.call(AggregateOperation.java:253)
    at com.mongodb.operation.OperationHelper.withConnectionSource(OperationHelper.java:431)
    at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:404)
    at com.mongodb.operation.AggregateOperation.execute(AggregateOperation.java:253)
    at com.mongodb.operation.AggregateOperation.execute(AggregateOperation.java:67)
    at com.mongodb.Mongo.execute(Mongo.java:836)
    at com.mongodb.Mongo$2.execute(Mongo.java:823)
    at com.mongodb.OperationIterable.iterator(OperationIterable.java:47)
    at com.mongodb.OperationIterable.forEach(OperationIterable.java:70)
    at com.mongodb.OperationIterable.into(OperationIterable.java:82)
    at com.mongodb.AggregateIterableImpl.into(AggregateIterableImpl.java:143)
    at com.mongodb.spark.rdd.partitioner.MongoSamplePartitioner$$anonfun$8.apply(MongoSamplePartitioner.scala:103)
    at com.mongodb.spark.rdd.partitioner.MongoSamplePartitioner$$anonfun$8.apply(MongoSamplePartitioner.scala:97)
    at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:186)
    at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:184)
    at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
    at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
    at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
    at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
    at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:184)
    at com.mongodb.spark.rdd.partitioner.MongoSamplePartitioner.partitions(MongoSamplePartitioner.scala:96)
    at com.mongodb.spark.rdd.partitioner.DefaultMongoPartitioner.partitions(DefaultMongoPartitioner.scala:34)
    at com.mongodb.spark.rdd.MongoRDD.getPartitions(MongoRDD.scala:137)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
    at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
    at org.jh.TestSpark$.doTest(DocHandler.scala:17)
    at org.jh.TestSpark$.main(DocHandler.scala:29)
    at org.jh.TestSpark.main(DocHandler.scala)

The error is shown above; the fix follows. Judging from the connector source code (which I have not fully worked through), the problem comes from this branch in MongoSamplePartitioner:

if (numDocumentsPerPartition >= count) {
  // The whole collection fits into a single partition: no sampling needed.
  MongoSinglePartitioner.partitions(connector, readConfig, pipeline)
} else {
  // Sample the collection to pick partition boundary values.
  val samples = connector.withCollectionDo(readConfig, {
    coll: MongoCollection[BsonDocument] =>
      coll.aggregate(List(
        Aggregates.`match`(matchQuery),
        Aggregates.sample(numberOfSamples),
        Aggregates.project(Projections.include(partitionKey)),
        Aggregates.sort(Sorts.ascending(partitionKey))
      ).asJava).allowDiskUse(true).into(new util.ArrayList[BsonDocument]()).asScala
  })
  // Keep every samplesPerPartition-th sample as a right-hand boundary.
  def collectSplit(i: Int): Boolean = (i % samplesPerPartition == 0) || !matchQuery.isEmpty && i == count - 1
  val rightHandBoundaries = samples.zipWithIndex.collect {
    case (field, i) if collectSplit(i) => field.get(partitionKey)
  }
  PartitionerHelper.createPartitions(partitionKey, rightHandBoundaries, PartitionerHelper.locations(connector))
}

When numDocumentsPerPartition < count, the else branch is taken and the $sample aggregation runs. The larger numberOfSamples is relative to the collection, the more likely the server's random cursor keeps returning documents it has already produced, until it gives up after 100 consecutive duplicates with error 28799 (hence the "sporadic failure" wording in the message). The two values are computed as:

val numDocumentsPerPartition: Int = math.floor(partitionSizeInBytes.toFloat / avgObjSizeInBytes).toInt
val numberOfSamples = math.floor(samplesPerPartition * count / numDocumentsPerPartition.toFloat).toInt
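
As a purely illustrative calculation (all numbers invented; 64 MB and 10 are, as far as I can tell, the connector's defaults for partitionSizeMB and samplesPerPartition), suppose the collection holds ten million documents averaging 1 KB each:

// Hypothetical numbers, only to make the two formulas above concrete.
val partitionSizeInBytes = 64 * 1024 * 1024   // partitionSizeMB = 64
val avgObjSizeInBytes = 1024f                 // assume ~1 KB per document
val count = 10000000L                         // assume 10M documents
val samplesPerPartition = 10

val numDocumentsPerPartition = math.floor(partitionSizeInBytes.toFloat / avgObjSizeInBytes).toInt
// 65536, which is < count, so the else branch (and $sample) runs
val numberOfSamples = math.floor(samplesPerPartition * count / numDocumentsPerPartition.toFloat).toInt
// floor(10 * 10000000 / 65536.0) = 1525 documents requested from $sample

With these invented numbers a single $sample stage asks the server for 1525 random documents; dropping samplesPerPartition to 1 cuts that to 152, and doubling partitionSizeMB halves it again.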

To avoid the error, numberOfSamples has to come down, which means lowering samplesPerPartition and/or raising numDocumentsPerPartition. samplesPerPartition is controlled by spark.mongodb.input.partitionerOptions.samplesPerPartition, and numDocumentsPerPartition grows with spark.mongodb.input.partitionerOptions.partitionSizeMB. Raising partitionSizeMB has a second benefit: once numDocumentsPerPartition >= count, the if branch is taken and the $sample stage is skipped entirely.
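Before changing anything you can estimate offline which branch the partitioner will take. Below is a minimal sketch, not the connector's own code: it assumes the mongo-java-driver 3.4 from the jar list in the solution below and a hypothetical database and collection (testdb / testcoll), and simply replays the partitioner's arithmetic on top of collStats (the same statistics the partitioner reads):

import com.mongodb.MongoClient
import org.bson.Document

object EstimatePartitioning {
  def main(args: Array[String]): Unit = {
    // Hypothetical host and names; substitute your own deployment.
    val client = new MongoClient("192.168.12.161", 27017)
    try {
      val stats: Document = client.getDatabase("testdb")
        .runCommand(new Document("collStats", "testcoll"))
      // collStats may return these as Int32, Int64 or Double, so go through Number.
      val count = stats.get("count").asInstanceOf[Number].longValue()
      val avgObjSizeInBytes = stats.get("avgObjSize").asInstanceOf[Number].floatValue()

      val partitionSizeMB = 128     // candidate partitionerOptions.partitionSizeMB
      val samplesPerPartition = 1   // candidate partitionerOptions.samplesPerPartition

      // Same arithmetic as MongoSamplePartitioner above.
      val numDocumentsPerPartition = math.floor(partitionSizeMB * 1024 * 1024 / avgObjSizeInBytes).toInt
      if (numDocumentsPerPartition >= count)
        println("single partition: $sample is never issued, error 28799 cannot occur")
      else {
        val numberOfSamples = math.floor(samplesPerPartition * count / numDocumentsPerPartition.toFloat).toInt
        println(s"$$sample will request $numberOfSamples documents")
      }
    } finally client.close()
  }
}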

So the solution is as follows:

val spark = SparkSession.builder()
  // .master("local")
  .master(sparkURI)
  .config(new SparkConf().setJars(Array(
    s"${hdfsURI}/mongolib/mongo-spark-connector_2.11-2.2.1.jar",
    s"${hdfsURI}/mongolib/bson-3.4.2.jar",
    s"${hdfsURI}/mongolib/mongo-java-driver-3.4.2.jar",
    s"${hdfsURI}/mongolib/mongodb-driver-3.4.2.jar",
    s"${hdfsURI}/mongolib/mongodb-driver-core-3.4.2.jar",
    s"${hdfsURI}/mongolib/commons-io-2.5.jar",
    s"${hdfsURI}/mongolib/config-1.2.1.jar",
    s"${hdfsURI}/${jarName}") ++ extJars))
  .config("spark.cores.max", 80)
  .config("spark.executor.cores", 16)
  .config("spark.executor.memory", "32g")
  .config("spark.mongodb.input.uri", inp)
  .config("spark.mongodb.output.uri", oup)
  // The two options that matter for error 28799:
  .config("spark.mongodb.input.partitionerOptions.samplesPerPartition", 1)
  .config("spark.mongodb.input.partitionerOptions.partitionSizeMB", 128)
  .getOrCreate()
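
Assuming spark is bound to the session built above, a cheap sanity check is to re-trigger the partitioner, since computing the RDD's partitions is exactly the step that failed in the stack trace:

import com.mongodb.spark.MongoSpark

// Re-runs MongoSamplePartitioner.partitions with the new options in effect.
val rdd = MongoSpark.load(spark.sparkContext)
println(s"partitions: ${rdd.getNumPartitions}")
println(s"count: ${rdd.count()}")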

  

posted @ 2018-01-30 10:17 月影舞华