【Flink提交流程源码】五、Dispatcher 在onStart方法去启动JobMaster

一、Dispatcher  的onStart 启动服务

 

// Start the Dispatcher's own services and register it.
startDispatcherServices();
// Start the recovered jobs; this is the important step that leads to the JobMaster being started.
startRecoveredJobs();
// Create the dispatcher bootstrap from this dispatcher's self gateway,
// the RPC service's scheduled executor and the fatal-error callback.
this.dispatcherBootstrap = this.dispatcherBootstrapFactory.create(
getSelfGateway(DispatcherGateway.class),
this.getRpcService().getScheduledExecutor() ,
this::onFatalError);


  

进入

startDispatcherServices(); 注册
registerDispatcherMetrics(jobManagerMetricGroup);

  

进入

startRecoveredJobs();
/**
 * Runs every job graph held in {@code recoveredJobs} through
 * {@code runRecoveredJob} and then empties the collection.
 */
private void startRecoveredJobs() {
	recoveredJobs.forEach(this::runRecoveredJob);
	// All recovered jobs have been handed over; drop the references.
	recoveredJobs.clear();
}

runJob(recoveredJob, ExecutionType.RECOVERY);

  进入

private void runJob(JobGraph jobGraph, ExecutionType executionType) {
		Preconditions.checkState(!runningJobs.containsKey(jobGraph.getJobID()));
		long initializationTimestamp = System.currentTimeMillis();
//1.创建jobManage CompletableFuture<JobManagerRunner> jobManagerRunnerFuture = createJobManagerRunner(jobGraph, initializationTimestamp); DispatcherJob dispatcherJob = DispatcherJob.createFor( jobManagerRunnerFuture, jobGraph.getJobID(), jobGraph.getName(), initializationTimestamp); runningJobs.put(jobGraph.getJobID(), dispatcherJob); final JobID jobId = jobGraph.getJobID(); final CompletableFuture<CleanupJobState> cleanupJobStateFuture = dispatcherJob.getResultFuture().handleAsync( (dispatcherJobResult, throwable) -> { Preconditions.checkState(runningJobs.get(jobId) == dispatcherJob, "The job entry in runningJobs must be bound to the lifetime of the DispatcherJob."); if (dispatcherJobResult != null) { return handleDispatcherJobResult(jobId, dispatcherJobResult, executionType); } else { return dispatcherJobFailed(jobId, throwable); } }, getMainThreadExecutor()); final CompletableFuture<Void> jobTerminationFuture = cleanupJobStateFuture .thenApply(cleanupJobState -> removeJob(jobId, cleanupJobState)) .thenCompose(Function.identity()); FutureUtils.assertNoException(jobTerminationFuture); registerDispatcherJobTerminationFuture(jobId, jobTerminationFuture); }

  

这一步是关键

1.创建 JobManagerRunner(createJobManagerRunner)
CompletableFuture<JobManagerRunner> createJobManagerRunner(JobGraph jobGraph, long initializationTimestamp) {
		final RpcService rpcService = getRpcService();
		return CompletableFuture.supplyAsync(
			() -> {
				try {
					//创建JobManagerRunner
					JobManagerRunner runner = jobManagerRunnerFactory.createJobManagerRunner(
						jobGraph,
						configuration,
						rpcService,
						highAvailabilityServices,
						heartbeatServices,
						jobManagerSharedServices,
						new DefaultJobManagerJobMetricGroupFactory(jobManagerMetricGroup),
						fatalErrorHandler,
						initializationTimestamp);

					//启动JobMaster
					runner.start();

  

找实现类
DefaultJobManagerRunnerFactory

/**
 * Builds a {@link JobManagerRunnerImpl} for the given job graph.
 *
 * <p>The JobMaster configuration, slot pool factory, scheduler factory and
 * shuffle master are all derived from {@code configuration}. They are bundled
 * into a {@link DefaultJobMasterServiceFactory}, which the returned runner
 * receives together with a class loader lease registered for the job's ID.
 *
 * @return the newly constructed runner
 * @throws Exception declared by the signature; propagated from the calls made here
 */
public JobManagerRunner createJobManagerRunner(
		JobGraph jobGraph,
		Configuration configuration,
		RpcService rpcService,
		HighAvailabilityServices highAvailabilityServices,
		HeartbeatServices heartbeatServices,
		JobManagerSharedServices jobManagerServices,
		JobManagerJobMetricGroupFactory jobManagerJobMetricGroupFactory,
		FatalErrorHandler fatalErrorHandler,
		long initializationTimestamp) throws Exception {

	// Everything below is read from the same Configuration instance.
	final JobMasterConfiguration masterConfig =
		JobMasterConfiguration.fromConfiguration(configuration);
	final SlotPoolFactory poolFactory =
		SlotPoolFactory.fromConfiguration(configuration);
	final SchedulerNGFactory schedulerFactory =
		SchedulerNGFactoryFactory.createSchedulerNGFactory(configuration);
	final ShuffleMaster<?> shuffle =
		ShuffleServiceLoader.loadShuffleServiceFactory(configuration).createShuffleMaster(configuration);

	// Factory the runner uses to create the JobMaster service.
	final JobMasterServiceFactory masterServiceFactory = new DefaultJobMasterServiceFactory(
		masterConfig,
		poolFactory,
		rpcService,
		highAvailabilityServices,
		jobManagerServices,
		heartbeatServices,
		jobManagerJobMetricGroupFactory,
		fatalErrorHandler,
		schedulerFactory,
		shuffle);

	return new JobManagerRunnerImpl(
		jobGraph,
		masterServiceFactory,
		highAvailabilityServices,
		jobManagerServices.getLibraryCacheManager().registerClassLoaderLease(jobGraph.getJobID()),
		jobManagerServices.getScheduledExecutorService(),
		fatalErrorHandler,
		initializationTimestamp);
}

  

进入JobManagerRunnerImpl
this.jobMasterService = jobMasterFactory.createJobMasterService(jobGraph, this, userCodeLoader, initializationTimestamp);



/**
 * Starts the leader election service with this runner as the contender.
 *
 * @throws Exception wrapping the original failure if the leader election
 *     service could not be started
 */
public void start() throws Exception {
	try {
		leaderElectionService.start(this);
	} catch (Exception startFailure) {
		// Log first, then rethrow with the original failure kept as the cause.
		log.error("Could not start the JobManager because the leader election service did not start.", startFailure);
		throw new Exception("Could not start the leader election service.", startFailure);
	}
}



 进入

DefaultJobMasterServiceFactory
跟到这里,JobMaster 的创建就完成了,不用再往下看了

// Construct the JobMaster. Besides the collaborators passed in, it receives
// a ResourceID obtained from ResourceID.generate(), a default execution
// deployment tracker and a reconciler constructor reference.
new JobMaster(
			rpcService,
			jobMasterConfiguration,
			ResourceID.generate(),
			jobGraph,
			haServices,
			slotPoolFactory,
			jobManagerSharedServices,
			heartbeatServices,
			jobManagerJobMetricGroupFactory,
			jobCompletionActions,
			fatalErrorHandler,
			userCodeClassloader,
			schedulerNGFactory,
			shuffleMaster,
			// Factory: given a lookup, builds the partition tracker for this job's ID and shuffle master.
			lookup -> new JobMasterPartitionTrackerImpl(
				jobGraph.getJobID(),
				shuffleMaster,
				lookup
			),
			new DefaultExecutionDeploymentTracker(),
			DefaultExecutionDeploymentReconciler::new,
			initializationTimestamp);

  

runner.start();进入实现类
JobManagerRunnerImpl

/**
 * Registers this runner as the contender of the leader election service.
 *
 * @throws Exception carrying the underlying failure as its cause when the
 *     leader election service fails to start
 */
@Override
public void start() throws Exception {
	try {
		leaderElectionService.start(this);
	} catch (Exception cause) {
		// Report the failure in the log before propagating it to the caller.
		log.error("Could not start the JobManager because the leader election service did not start.", cause);
		throw new Exception("Could not start the leader election service.", cause);
	}
}

  

也会进入
StandaloneLeaderElectionService类
/**
 * Registers the given contender and grants it leadership right away, using
 * {@code HighAvailabilityServices.DEFAULT_LEADER_ID} as the leader id.
 *
 * @param newContender the contender to register; must not be null
 * @throws IllegalArgumentException if a contender has already been registered
 */
@Override
public void start(LeaderContender newContender) throws Exception {
	final boolean alreadyStarted = contender != null;
	if (alreadyStarted) {
		// A contender is already set, so start was called before.
		throw new IllegalArgumentException("Leader election service cannot be started multiple times.");
	}

	contender = Preconditions.checkNotNull(newContender);

	// No election is performed here: leadership goes straight to the contender.
	contender.grantLeadership(HighAvailabilityServices.DEFAULT_LEADER_ID);
}

  

进入
JobManagerRunnerImpl
/**
 * Callback invoked when this runner is granted leadership.
 *
 * <p>Unless the runner is already shut down, this appends
 * {@code verifyJobSchedulingStatusAndStartJobManager} to the current
 * {@code leadershipOperation} chain and registers exception handling on the
 * resulting future.
 *
 * @param leaderSessionID the leader session id passed on to
 *     {@code verifyJobSchedulingStatusAndStartJobManager}
 */
public void grantLeadership(final UUID leaderSessionID) {
		synchronized (lock) {
			// Nothing to do once the runner has been shut down.
			if (shutdown) {
				log.debug("JobManagerRunner cannot be granted leadership because it is already shut down.");
				return;
			}

			// Chain onto the previous leadership operation so the new one only
			// runs after it; the callback takes the lock again when it executes.
			leadershipOperation = leadershipOperation.thenCompose(
				(ignored) -> {
					synchronized (lock) {
						return verifyJobSchedulingStatusAndStartJobManager(leaderSessionID);
					}
				});

			// Attach failure handling with the given error message to the chained future.
			handleException(leadershipOperation, "Could not start the job manager.");
		}
	}

  启动JobMaster

/**
 * Looks up the job's scheduling status and, depending on it, either
 * finishes via {@code jobAlreadyDone()} or starts the JobMaster.
 *
 * @param leaderSessionId the leader session id handed to {@code startJobMaster}
 * @return future of the operation chosen from the scheduling status
 */
private CompletableFuture<Void> verifyJobSchedulingStatusAndStartJobManager(UUID leaderSessionId) {
	return getJobSchedulingStatus().thenCompose(
		status -> {
			if (status != JobSchedulingStatus.DONE) {
				// Start the JobMaster.
				return startJobMaster(leaderSessionId);
			}
			return jobAlreadyDone();
		});
}

  

	final CompletableFuture<Acknowledge> startFuture;
		try {
			//启动JobMaster
			startFuture = jobMasterService.start(new JobMasterId(leaderSessionId));
		} catch (Exception e) {
			return FutureUtils.completedExceptionally(new FlinkException("Failed to start the JobMaster.", e));
		}

  

JobMasterService

start()

  

找到子类实现
JobMaster
/**
 * Starts this JobMaster and triggers job execution under the given id.
 *
 * @param newJobMasterId the id passed on to {@code startJobExecution}
 * @return future completed with the result of {@code startJobExecution}
 * @throws Exception declared by the signature; propagated from the calls made here
 */
public CompletableFuture<Acknowledge> start(final JobMasterId newJobMasterId) throws Exception {
	// Call the no-arg start() first to make sure RPC and async calls are received.
	start();

	// Job execution is kicked off here, through callAsyncWithoutFencing
	// with RpcUtils.INF_TIMEOUT as the timeout.
	final CompletableFuture<Acknowledge> executionStarted =
		callAsyncWithoutFencing(() -> startJobExecution(newJobMasterId), RpcUtils.INF_TIMEOUT);
	return executionStarted;
}

  进入

private Acknowledge startJobExecution(JobMasterId newJobMasterId) throws Exception {

		validateRunsInMainThread();

		checkNotNull(newJobMasterId, "The new JobMasterId must not be null.");

		if (Objects.equals(getFencingToken(), newJobMasterId)) {
			log.info("Already started the job execution with JobMasterId {}.", newJobMasterId);

			return Acknowledge.get();
		}

		setNewFencingToken(newJobMasterId);
		//真正开始启动
		startJobMasterServices();
//重置启动调度器
resetAndStartScheduler();

  下一节:启动 ResourceManager

posted @ 2021-06-18 20:32  持枢  阅读(489)  评论(0)    收藏  举报