Flink - Checkpoint - fxjwind

Flink - Checkpoint

Flink在流处理上最大的特点，就是引入了全局snapshot（分布式快照）。

CheckpointCoordinator

做snapshot的核心组件为， CheckpointCoordinator

/**
 * The checkpoint coordinator coordinates the distributed snapshots of operators and state.
 * It triggers the checkpoint by sending the messages to the relevant tasks and collects the
 * checkpoint acknowledgements. It also collects and maintains the overview of the state handles
 * reported by the tasks that acknowledge the checkpoint.
 *
 * <p>Depending on the configured {@link RecoveryMode}, the behaviour of the {@link
 * CompletedCheckpointStore} and {@link CheckpointIDCounter} change. The default standalone
 * implementations don't support any recovery.
 */
public class CheckpointCoordinator {

    /** Tasks who need to be sent a message when a checkpoint is started */
    private final ExecutionVertex[] tasksToTrigger; // the tasks on which a checkpoint has to be triggered

    /** Tasks who need to acknowledge a checkpoint before it succeeds */
    private final ExecutionVertex[] tasksToWaitFor;

    /** Tasks who need to be sent a message when a checkpoint is confirmed */
    private final ExecutionVertex[] tasksToCommitTo;

    /** Map from checkpoint ID to the pending checkpoint */
    private final Map<Long, PendingCheckpoint> pendingCheckpoints;

    /** Completed checkpoints. Implementations can be blocking. Make sure calls to methods
     * accessing this don't block the job manager actor and run asynchronously. */
    private final CompletedCheckpointStore completedCheckpointStore;  // records the checkpoints that have already completed

    /** A list of recent checkpoint IDs, to identify late messages (vs invalid ones) */
    private final ArrayDeque<Long> recentPendingCheckpoints;

    /** Checkpoint ID counter to ensure ascending IDs. In case of job manager failures, these
     * need to be ascending across job managers. */
    protected final CheckpointIDCounter checkpointIdCounter; // hands out ascending checkpoint IDs; they have to stay ascending even across a JobManager crash

    /** The base checkpoint interval. Actual trigger time may be affected by the
     * max concurrent checkpoints and minimum-pause values */
    private final long baseInterval;  // interval between two checkpoint triggers

    /** The max time (in ms) that a checkpoint may take */
    private final long checkpointTimeout; // upper bound for one checkpoint; once exceeded the checkpoint is treated as expired

    /** The min time(in ms) to delay after a checkpoint could be triggered. Allows to
     * enforce minimum processing time between checkpoint attempts */
    private final long minPauseBetweenCheckpoints; // minimum pause between checkpoints

    /** The maximum number of checkpoints that may be in progress at the same time */
    private final int maxConcurrentCheckpointAttempts; // how many checkpoints may exist concurrently at most

    /** Actor that receives status updates from the execution graph this coordinator works for */
    private ActorGateway jobStatusListener;

    /** The number of consecutive failed trigger attempts */
    private int numUnsuccessfulCheckpointsTriggers;

    /** The timer task of the periodic trigger that is currently scheduled, if any */
    private ScheduledTrigger currentPeriodicTrigger;

    /** Flag whether a triggered checkpoint should immediately schedule the next checkpoint.
     * Non-volatile, because only accessed in synchronized scope */
    private boolean periodicScheduling;

    /** Flag whether a trigger request could not be handled immediately. Non-volatile, because only
     * accessed in synchronized scope */
    private boolean triggerRequestQueued;

    /** Flag marking the coordinator as shut down (not accepting any messages any more) */
    private volatile boolean shutdown; // note: volatile, so that writes are visible to other threads

    /** Shutdown hook thread to clean up state handles. */
    private final Thread shutdownHook;

    /** Helper for tracking checkpoint statistics  */
    private final CheckpointStatsTracker statsTracker;


    /**
     * Creates the coordinator.
     *
     * <p>The part of the constructor shown here starts the checkpoint ID counter, creates the
     * daemon timer thread and, only for {@link RecoveryMode#STANDALONE}, registers a JVM
     * shutdown hook that calls {@code shutdown()}. For every other recovery mode no hook is
     * registered and {@code shutdownHook} stays {@code null}.
     *
     * <p>NOTE(review): the assignments of the remaining final fields are not part of this
     * excerpt -- presumably elided; confirm against the full source.
     *
     * @throws Exception if starting the checkpoint ID counter fails
     */
    public CheckpointCoordinator(
            JobID job,
            long baseInterval,
            long checkpointTimeout,
            long minPauseBetweenCheckpoints,
            int maxConcurrentCheckpointAttempts,
            ExecutionVertex[] tasksToTrigger,
            ExecutionVertex[] tasksToWaitFor,
            ExecutionVertex[] tasksToCommitTo,
            ClassLoader userClassLoader,
            CheckpointIDCounter checkpointIDCounter,
            CompletedCheckpointStore completedCheckpointStore,
            RecoveryMode recoveryMode,
            CheckpointStatsTracker statsTracker) throws Exception {

        checkpointIDCounter.start(); // start the CheckpointIDCounter

        // second argument 'true' makes the timer thread a daemon thread
        this.timer = new Timer("Checkpoint Timer", true);

        this.statsTracker = checkNotNull(statsTracker);

        if (recoveryMode == RecoveryMode.STANDALONE) { // standalone mode: a shutdown hook is needed to clean up the state
            // Add shutdown hook to clean up state handles when no checkpoint recovery is
            // possible. In case of another configured recovery mode, the checkpoints need to be
            // available for the standby job managers.
            this.shutdownHook = new Thread(new Runnable() {
                @Override
                public void run() {
                    try {
                        CheckpointCoordinator.this.shutdown(); // explicitly call shutdown
                    }
                    catch (Throwable t) {
                        LOG.error("Error during shutdown of checkpoint coordinator via " +
                                "JVM shutdown hook: " + t.getMessage(), t);
                    }
                }
            });

            try {
                // Add JVM shutdown hook to call shutdown of service
                Runtime.getRuntime().addShutdownHook(shutdownHook);
            }
            catch (IllegalStateException ignored) {
                // JVM is already shutting down. No need to do anything.
            }
            catch (Throwable t) {
                LOG.error("Cannot register checkpoint coordinator shutdown hook.", t);
            }
        }
        else {
            this.shutdownHook = null;
        }
    }

CheckpointIDCounter

有两种，

StandaloneCheckpointIDCounter

这种case下的counter只是用AtomicLong来实现的；如果JobManager挂了，这个值就会丢失，重启后无法保证checkpoint id继续递增

但这里说，在standalone的情况下，不需要做recovery，所以这个是可以接受的

/**
 * In-memory {@link CheckpointIDCounter} for JobManagers that run in
 * {@link RecoveryMode#STANDALONE}.
 *
 * <p>The count is kept in a single {@link AtomicLong} and is never persisted. That is enough
 * for this recovery mode, because a job manager cannot be recovered in it anyway.
 */
public class StandaloneCheckpointIDCounter implements CheckpointIDCounter {

    /** The next checkpoint ID to hand out; counting starts at 1. */
    private final AtomicLong nextCheckpointId = new AtomicLong(1);

    /** No-op. */
    @Override
    public void start() throws Exception {
    }

    /** No-op. */
    @Override
    public void stop() throws Exception {
    }

    /** Atomically returns the current ID and advances the counter by one. */
    @Override
    public long getAndIncrement() throws Exception {
        final long handedOut = nextCheckpointId.getAndIncrement();
        return handedOut;
    }

    /** Overwrites the counter, so the next ID handed out is {@code newCount}. */
    @Override
    public void setCount(long newCount) {
        nextCheckpointId.set(newCount);
    }
}

ZooKeeperCheckpointIDCounter

这种counter用zk的persistent node来保存当前的计数，以保证计数的递增

/**
 * {@link CheckpointIDCounter} instances for JobManagers running in {@link RecoveryMode#ZOOKEEPER}.
 *
 * <p>Each counter creates a ZNode:
 * <pre>
 * +----O /flink/checkpoint-counter/&lt;job-id&gt; 1 [persistent]
 * .
 * .
 * .
 * +----O /flink/checkpoint-counter/&lt;job-id&gt; N [persistent]
 * </pre>
 *
 * <p>The checkpoints IDs are required to be ascending (per job). In order to guarantee this in case
 * of job manager failures we use ZooKeeper to have a shared counter across job manager instances.
 */
public class ZooKeeperCheckpointIDCounter implements CheckpointIDCounter

CompletedCheckpointStore

接口，用于记录有哪些已经完成的checkpoint

/**
 * Bounded store of {@link CompletedCheckpoint} instances, organised as a LIFO queue.
 */
public interface CompletedCheckpointStore {

    // ------------------------------------------------------------------
    //  Modifying operations
    // ------------------------------------------------------------------

    /**
     * Recovers the {@link CompletedCheckpoint} instances that are available.
     *
     * <p>Once this method has returned, {@link #getLatestCheckpoint()} yields the most recent
     * checkpoint that is available.
     */
    void recover() throws Exception;

    /**
     * Appends a {@link CompletedCheckpoint} to the completed checkpoints.
     *
     * <p>The store retains only a bounded number of checkpoints. If adding this one exceeds
     * that maximum, the oldest checkpoint is discarded through
     * {@link CompletedCheckpoint#discard(ClassLoader)}.
     */
    void addCheckpoint(CompletedCheckpoint checkpoint) throws Exception;

    /**
     * Discards every {@link CompletedCheckpoint} that was added, each through
     * {@link CompletedCheckpoint#discard(ClassLoader)}.
     */
    void discardAllCheckpoints() throws Exception;

    // ------------------------------------------------------------------
    //  Queries
    // ------------------------------------------------------------------

    /**
     * Returns the most recent {@link CompletedCheckpoint}, or <code>null</code> when none has
     * been added.
     */
    CompletedCheckpoint getLatestCheckpoint() throws Exception;

    /**
     * Returns all {@link CompletedCheckpoint} instances.
     *
     * <p>The list is empty as long as no checkpoint has been added.
     */
    List<CompletedCheckpoint> getAllCheckpoints() throws Exception;

    /**
     * Returns how many checkpoints are currently retained.
     */
    int getNumberOfRetainedCheckpoints();

}

看下StandaloneCompletedCheckpointStore，其实就是一个用于记录CompletedCheckpoint的ArrayDeque

/**
 * {@link CompletedCheckpointStore} that keeps the completed checkpoints in an in-memory
 * {@link ArrayDeque}.
 *
 * <p>NOTE(review): only the backing field is shown in this excerpt; the constructor and the
 * interface methods are presumably elided.
 */
class StandaloneCompletedCheckpointStore implements CompletedCheckpointStore {

    /** The completed checkpoints. */
    private final ArrayDeque<CompletedCheckpoint> checkpoints;
}

ZooKeeperCompletedCheckpointStore，这个就是用zk来记录

/**
 * {@link CompletedCheckpointStore} for JobManagers running in {@link RecoveryMode#ZOOKEEPER}.
 *
 * <p>Checkpoints are added under a ZNode per job:
 * <pre>
 * +----O /flink/checkpoints/&lt;job-id&gt;  [persistent]
 * .    |
 * .    +----O /flink/checkpoints/&lt;job-id&gt;/1 [persistent]
 * .    .                                  .
 * .    .                                  .
 * .    .                                  .
 * .    +----O /flink/checkpoints/&lt;job-id&gt;/N [persistent]
 * </pre>
 *
 * <p>During recovery, the latest checkpoint is read from ZooKeeper. If there is more than one,
 * only the latest one is used and older ones are discarded (even if the maximum number
 * of retained checkpoints is greater than one).
 *
 * <p>If there is a network partition and multiple JobManagers run concurrent checkpoints for the
 * same program, it is OK to take any valid successful checkpoint as long as the "history" of
 * checkpoints is consistent. Currently, after recovery we start out with only a single
 * checkpoint to circumvent those situations.
 */
public class ZooKeeperCompletedCheckpointStore implements CompletedCheckpointStore {

做snapshot流程

StreamingJobGraphGenerator

配置checkpoint

/**
 * Copies the checkpoint configuration of the stream graph into the job graph.
 *
 * <p>Does nothing when checkpointing is disabled. Otherwise it sorts the job vertices into
 * the vertices to trigger (input vertices only), to acknowledge and to commit (all vertices),
 * stores them as {@code JobSnapshottingSettings} on the job graph and, if the number of
 * execution retries is still {@code -1}, raises it to {@code Integer.MAX_VALUE}.
 */
private void configureCheckpointing() {
    final CheckpointConfig checkpointConfig = streamGraph.getCheckpointConfig();

    // nothing to set up unless checkpointing was enabled
    if (!checkpointConfig.isCheckpointingEnabled()) {
        return;
    }

    final long checkpointInterval = checkpointConfig.getCheckpointInterval();

    // vertices that receive "trigger checkpoint" messages: currently all the sources
    final List<JobVertexID> verticesToTrigger = new ArrayList<JobVertexID>();
    // vertices that need to acknowledge the checkpoint: currently all vertices
    final List<JobVertexID> verticesToAck = new ArrayList<JobVertexID>(jobVertices.size());
    // vertices that receive "commit checkpoint" messages: currently all vertices
    final List<JobVertexID> verticesToCommit = new ArrayList<JobVertexID>();

    for (JobVertex jobVertex : jobVertices.values()) {
        final JobVertexID vertexId = jobVertex.getID();

        // a checkpoint only has to be triggered at the head of the topology
        if (jobVertex.isInputVertex()) {
            verticesToTrigger.add(vertexId);
        }

        // TODO: add check whether the user function implements the checkpointing interface
        verticesToCommit.add(vertexId);
        verticesToAck.add(vertexId);
    }

    jobGraph.setSnapshotSettings(new JobSnapshottingSettings(
            verticesToTrigger,
            verticesToAck,
            verticesToCommit,
            checkpointInterval,
            checkpointConfig.getCheckpointTimeout(),
            checkpointConfig.getMinPauseBetweenCheckpoints(),
            checkpointConfig.getMaxConcurrentCheckpoints()));

    // with checkpointing enabled, an unset retry count (-1) becomes Integer.MAX_VALUE
    final boolean retriesNotConfigured =
            streamGraph.getExecutionConfig().getNumberOfExecutionRetries() == -1;
    if (retriesNotConfigured) {
        streamGraph.getExecutionConfig().setNumberOfExecutionRetries(Integer.MAX_VALUE);
    }
}

JobManager

submitJob的时候，将JobGraph中的配置，放到ExecutionGraph中去

/**
 * Excerpt of the job submission: transfers the snapshot settings of the JobGraph to the
 * ExecutionGraph by enabling snapshot checkpointing on it.
 */
private def submitJob(jobGraph: JobGraph, jobInfo: JobInfo, isRecovery: Boolean = false): Unit = {

    // configure the state checkpointing (only if the JobGraph carries snapshot settings)
    val snapshotSettings = jobGraph.getSnapshotSettings
    if (snapshotSettings != null) {
        val jobId = jobGraph.getJobID()

        // maps a JobVertexID to its ExecutionJobVertex; an unknown ID fails the submission
        val resolveVertex: JobVertexID => ExecutionJobVertex = id =>
            Option(executionGraph.getJobVertex(id)).getOrElse(
                throw new JobSubmissionException(jobId,
                    "The snapshot checkpointing settings refer to non-existent vertex " + id))

        val toTrigger: java.util.List[ExecutionJobVertex] =
            snapshotSettings.getVerticesToTrigger().asScala.map(resolveVertex).asJava

        val toAcknowledge: java.util.List[ExecutionJobVertex] =
            snapshotSettings.getVerticesToAcknowledge().asScala.map(resolveVertex).asJava

        val toConfirm: java.util.List[ExecutionJobVertex] =
            snapshotSettings.getVerticesToConfirm().asScala.map(resolveVertex).asJava

        // the recovery factory supplies the store and the ID counter that fit the recovery mode
        val completedCheckpoints = checkpointRecoveryFactory
            .createCompletedCheckpoints(jobId, userCodeLoader)

        val checkpointIdCounter = checkpointRecoveryFactory.createCheckpointIDCounter(jobId)

        executionGraph.enableSnapshotCheckpointing(
            snapshotSettings.getCheckpointInterval,
            snapshotSettings.getCheckpointTimeout,
            snapshotSettings.getMinPauseBetweenCheckpoints,
            snapshotSettings.getMaxConcurrentCheckpoints,
            toTrigger,
            toAcknowledge,
            toConfirm,
            context.system,
            leaderSessionID.orNull,
            checkpointIdCounter,
            completedCheckpoints,
            recoveryMode,
            savepointStore)
    }
}

ExecutionGraph

创建checkpointCoordinator对象

public void enableSnapshotCheckpointing(
        long interval,
        long checkpointTimeout,
        long minPauseBetweenCheckpoints,
        int maxConcurrentCheckpoints,
        List<ExecutionJobVertex> verticesToTrigger,
        List<ExecutionJobVertex> verticesToWaitFor,
        List<ExecutionJobVertex> verticesToCommitTo,
        ActorSystem actorSystem,
        UUID leaderSessionID,
        CheckpointIDCounter checkpointIDCounter,
        CompletedCheckpointStore completedCheckpointStore,
        RecoveryMode recoveryMode,
        StateStore<Savepoint> savepointStore) throws Exception {

    ExecutionVertex[] tasksToTrigger = collectExecutionVertices(verticesToTrigger);
    ExecutionVertex[] tasksToWaitFor = collectExecutionVertices(verticesToWaitFor);
    ExecutionVertex[] tasksToCommitTo = collectExecutionVertices(verticesToCommitTo);
    
    // disable to make sure existing checkpoint coordinators are cleared
    disableSnaphotCheckpointing();

    if (isStatsDisabled) {
        checkpointStatsTracker = new DisabledCheckpointStatsTracker();
    }
    else {
        int historySize = jobConfiguration.getInteger(
                ConfigConstants.JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE,
                ConfigConstants.DEFAULT_JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE);

        checkpointStatsTracker = new SimpleCheckpointStatsTracker(historySize, tasksToWaitFor);
    }

    // create the coordinator that triggers and commits checkpoints and holds the state
    checkpointCoordinator = new CheckpointCoordinator(
            jobID,
            interval,
            checkpointTimeout,
            minPauseBetweenCheckpoints,
            maxConcurrentCheckpoints,
            tasksToTrigger,
            tasksToWaitFor,
            tasksToCommitTo,
            userClassLoader,
            checkpointIDCounter,
            completedCheckpointStore,
            recoveryMode,
            checkpointStatsTracker);
    
    // the periodic checkpoint scheduler is activated and deactivated as a result of
    // job status changes (running -> on, all other states -> off)
    registerJobStatusListener( //将checkpointCoordinator的actor注册到jobStatusListenerActors，这样当job状态变化时，可以通知checkpointCoordinator
            checkpointCoordinator.createActivatorDeactivator(actorSystem, leaderSessionID));

这里看到checkpointCoordinator 作为ExecutionGraph的成员，

接着会异步的提交ExecutionGraph，

// execute the recovery/writing the jobGraph into the SubmittedJobGraphStore asynchronously
// because it is a blocking operation
future {
    try {
      if (isRecovery) {
        executionGraph.restoreLatestCheckpointedState() //恢复CheckpointedState
      }
      else {
        //...... 
      }
        submittedJobGraphs.putJobGraph(new SubmittedJobGraph(jobGraph, jobInfo)) //把jobGraph放到submittedJobGraphs中track
      }
    
      jobInfo.client ! decorateMessage(JobSubmitSuccess(jobGraph.getJobID)) //告诉client，job提交成功
    
      if (leaderElectionService.hasLeadership) {
        executionGraph.scheduleForExecution(scheduler) //真正的调度executionGraph
      } else {
        //......
      }
    } catch {
      //.......
    }
}(context.dispatcher)

CheckpointCoordinatorDeActivator

/**
 * Actor that watches the JobStatus and switches the periodic checkpoint scheduler of its
 * {@link CheckpointCoordinator} on or off accordingly.
 */
public class CheckpointCoordinatorDeActivator extends FlinkUntypedActor {

    /** The coordinator whose scheduler is started and stopped. */
    private final CheckpointCoordinator coordinator;

    /** The leader session ID reported by {@link #getLeaderSessionID()}. */
    private final UUID leaderSessionID;

    /**
     * Starts the checkpoint scheduler when the job becomes RUNNING and stops it on any other
     * job status. Messages that are not job status changes are ignored.
     */
    @Override
    public void handleMessage(Object message) {
        // we ignore all other messages
        if (!(message instanceof ExecutionGraphMessages.JobStatusChanged)) {
            return;
        }

        final JobStatus newStatus =
                ((ExecutionGraphMessages.JobStatusChanged) message).newJobStatus();

        if (newStatus != JobStatus.RUNNING) {
            // anything else should stop the trigger for now
            coordinator.stopCheckpointScheduler();
        } else {
            // start the checkpoint scheduler
            coordinator.startCheckpointScheduler();
        }
    }

    @Override
    public UUID getLeaderSessionID() {
        return leaderSessionID;
    }
}

在job状态发生变化时，需要打开或关闭Checkpoint scheduler

CheckpointCoordinator

开启定时startCheckpointScheduler

/**
 * (Re)starts periodic checkpoint triggering: cancels whatever was scheduled before and
 * schedules a fresh {@code ScheduledTrigger} at a fixed rate of {@code baseInterval}, with
 * the first run after one {@code baseInterval}.
 */
public void startCheckpointScheduler() {
    synchronized (lock) {
        // make sure all prior timers are cancelled
        stopCheckpointScheduler();

        final ScheduledTrigger freshTrigger = new ScheduledTrigger();
        periodicScheduling = true;
        currentPeriodicTrigger = freshTrigger;

        // initial delay and period are both the base interval
        timer.scheduleAtFixedRate(freshTrigger, baseInterval, baseInterval);
    }
}

private class ScheduledTrigger extends TimerTask {

    @Override
    public void run() {
        try {
            triggerCheckpoint(System.currentTimeMillis());
        }
        catch (Exception e) {
            LOG.error("Exception while triggering checkpoint", e);
        }
    }
}

triggerCheckpoint，用于触发一次checkpoint

/**
 * Triggers a new checkpoint and uses the given timestamp as the checkpoint
 * timestamp.
 *
 * <p>The checkpoint is registered as pending, a timeout task is scheduled that discards it
 * if it has not completed within {@code checkpointTimeout}, and a trigger message is sent to
 * every task in {@code tasksToTrigger}.
 *
 * <p>NOTE(review): {@code ackTasks} and {@code triggerIDs} are not defined in this excerpt --
 * presumably they are computed by pre-checks that were elided; confirm against the full source.
 *
 * @param timestamp The timestamp for the checkpoint.
 * @param nextCheckpointId The checkpoint ID to use for this checkpoint or <code>-1</code> if
 *                         the checkpoint ID counter should be queried.
 * @return <code>true</code> if the trigger messages were sent, <code>false</code> if the
 *         checkpoint ID could not be obtained or triggering failed.
 */
public boolean triggerCheckpoint(long timestamp, long nextCheckpointId) throws Exception {

    // we will actually trigger this checkpoint!
    final long checkpointID;
    if (nextCheckpointId < 0) {
        try {
            // this must happen outside the locked scope, because it communicates
            // with external services (in HA mode) and may block for a while.
            checkpointID = checkpointIdCounter.getAndIncrement();
        }
        catch (Throwable t) {
            // without an ID there is no checkpoint: count the failure, report it and give up
            int numUnsuccessful = ++numUnsuccessfulCheckpointsTriggers;
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful +
                    " consecutive failed attempts so far)", t);
            return false;
        }
    }
    else {
        checkpointID = nextCheckpointId;
    }

    // A checkpoint that was triggered but is not yet acknowledged by everyone is a
    // PendingCheckpoint. It is created with all tasks that have to acknowledge it (ackTasks);
    // according to the article, it turns into a CompletedCheckpoint once all of them have acked.
    final PendingCheckpoint checkpoint = new PendingCheckpoint(job, checkpointID, timestamp, ackTasks);

    // schedule the timer that will clean up the expired checkpoints
    TimerTask canceller = new TimerTask() {
        @Override
        public void run() {
            try {
                synchronized (lock) {
                    // only do the work if the checkpoint is not discarded anyways
                    // note that checkpoint completion discards the pending checkpoint object
                    if (!checkpoint.isDiscarded()) {
                        LOG.info("Checkpoint " + checkpointID + " expired before completing.");

                        checkpoint.discard(userClassLoader);
                        pendingCheckpoints.remove(checkpointID);
                        rememberRecentCheckpointId(checkpointID);

                        onCancelCheckpoint(checkpointID);

                        triggerQueuedRequests();
                    }
                }
            }
            catch (Throwable t) {
                LOG.error("Exception while handling checkpoint timeout", t);
            }
        }
    };

    try {
        // re-acquire the lock
        synchronized (lock) {
            pendingCheckpoints.put(checkpointID, checkpoint); // track the pending checkpoint
            timer.schedule(canceller, checkpointTimeout);     // and arm its timeout
        }
        // end of lock scope

        // send the messages to the tasks that trigger their checkpoint
        // (per the article, these are only the source tasks)
        for (int i = 0; i < tasksToTrigger.length; i++) {
            ExecutionAttemptID id = triggerIDs[i];
            TriggerCheckpoint message = new TriggerCheckpoint(job, id, checkpointID, timestamp);
            tasksToTrigger[i].sendMessageToCurrentExecution(message, id);
        }

        numUnsuccessfulCheckpointsTriggers = 0;
        return true;
    }
    catch (Throwable t) {
        // triggering failed half-way: stop tracking the checkpoint, release what it holds
        // and report the failure to the caller instead of swallowing it

        // guard the map against concurrent modifications
        synchronized (lock) {
            pendingCheckpoints.remove(checkpointID);
        }

        int numUnsuccessful = ++numUnsuccessfulCheckpointsTriggers;
        LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful +
                " consecutive failed attempts so far)", t);

        if (!checkpoint.isDiscarded()) {
            checkpoint.discard(userClassLoader);
        }
        return false;
    }
}

---------上面只会给所有的source发checkpoint message，所以下面的流程只有source会走到-----------

TaskManager

sendMessageToCurrentExecution，发送的message最终会被TaskManager收到，

/**
   * Handler for messages related to checkpoints.
   *
   * A TriggerCheckpoint message makes the addressed task emit a checkpoint barrier, a
   * NotifyCheckpointComplete message is passed on to the addressed task. In both cases a
   * message for a task that is not (or no longer) running is logged at debug level and
   * otherwise ignored.
   *
   * @param actorMessage The checkpoint message.
   */
  private def handleCheckpointingMessage(actorMessage: AbstractCheckpointMessage): Unit = {

    actorMessage match {
      case message: TriggerCheckpoint =>
        val taskExecutionId = message.getTaskExecutionId
        val checkpointId = message.getCheckpointId
        val timestamp = message.getTimestamp

        // look up the task that is actually executing under this attempt ID
        val task = runningTasks.get(taskExecutionId)
        if (task != null) {
          task.triggerCheckpointBarrier(checkpointId, timestamp)
        } else {
          log.debug(
            s"TaskManager received a checkpoint request for unknown task $taskExecutionId.")
        }

      case message: NotifyCheckpointComplete =>
        val taskExecutionId = message.getTaskExecutionId
        val checkpointId = message.getCheckpointId

        val task = runningTasks.get(taskExecutionId)
        if (task != null) {
          task.notifyCheckpointComplete(checkpointId)
        } else {
          log.debug(
            s"TaskManager received a checkpoint confirmation for unknown task $taskExecutionId.")
        }

      // unknown checkpoint message
      case _ => unhandled(actorMessage)
    }
  }

Task

/**
 * Asks the task to take a checkpoint without blocking the caller.
 *
 * <p>Nothing happens unless the task is RUNNING, has an invokable and that invokable is a
 * {@code StatefulTask}. Otherwise the actual {@code triggerCheckpoint} call is handed to
 * {@code executeAsyncCallRunnable}; if it throws, the task is failed externally.
 *
 * @param checkpointID The ID of the checkpoint to trigger.
 * @param checkpointTimestamp The timestamp of the checkpoint.
 */
public void triggerCheckpointBarrier(final long checkpointID, final long checkpointTimestamp) {
    // read the field once, so the checks and the cast below see the same object
    final AbstractInvokable currentInvokable = this.invokable;

    if (executionState != ExecutionState.RUNNING || currentInvokable == null) {
        return;
    }
    if (!(currentInvokable instanceof StatefulTask)) {
        return;
    }

    // build a local closure
    final StatefulTask<?> statefulTask = (StatefulTask<?>) currentInvokable;
    final String taskName = taskNameWithSubtask;

    final Runnable checkpointCall = new Runnable() {
        @Override
        public void run() {
            try {
                // the key step; per the article, the task keeps running while this executes
                statefulTask.triggerCheckpoint(checkpointID, checkpointTimestamp);
            }
            catch (Throwable t) {
                failExternally(new RuntimeException("Error while triggering checkpoint for " + taskName, t));
            }
        }
    };

    executeAsyncCallRunnable(checkpointCall, "Checkpoint Trigger for " + taskName);
}

StreamTask

StreamTask就是实现了StatefulTask

所以最终调用到，

StreamTask.triggerCheckpoint,这里面会实际去做checkpoint工作

调用performCheckpoint(checkpointId, timestamp)

/**
 * Performs one checkpoint of this task: broadcasts the checkpoint barrier downstream,
 * snapshots the state of every operator in the chain exactly once and acknowledges the
 * checkpoint, either directly or from a background thread when some state still has to be
 * materialized asynchronously.
 *
 * @param checkpointId The ID of the checkpoint.
 * @param timestamp The timestamp of the checkpoint.
 * @return {@code true} if the task was running and the checkpoint was performed,
 *         {@code false} if the task is not running.
 * @throws Exception if broadcasting the barrier or snapshotting an operator fails
 */
protected boolean performCheckpoint(final long checkpointId, final long timestamp) throws Exception {

    // barrier emission and state snapshot both happen while holding the lock
    synchronized (lock) {
        if (isRunning) {

            // Since both state checkpointing and downstream barrier emission occurs in this
            // lock scope, they are an atomic operation regardless of the order in which they occur.
            // Given this, we immediately emit the checkpoint barriers, so the downstream operators
            // can start their checkpoint work as soon as possible
            operatorChain.broadcastCheckpointBarrier(checkpointId, timestamp);

            // now draw the state snapshot
            final StreamOperator<?>[] allOperators = operatorChain.getAllOperators();
            final StreamTaskState[] states = new StreamTaskState[allOperators.length];

            boolean hasAsyncStates = false;

            // Snapshot every operator once. While doing so, check whether any part of the
            // snapshot (operator state, function state, key/value state) is asynchronous and
            // therefore still has to be materialized.
            for (int i = 0; i < states.length; i++) {
                StreamOperator<?> operator = allOperators[i];
                if (operator != null) {
                    StreamTaskState state = operator.snapshotOperatorState(checkpointId, timestamp);
                    if (state.getOperatorState() instanceof AsynchronousStateHandle) {
                        hasAsyncStates = true;
                    }
                    if (state.getFunctionState() instanceof AsynchronousStateHandle) {
                        hasAsyncStates = true;
                    }
                    if (state.getKvStates() != null) {
                        for (KvStateSnapshot<?, ?, ?, ?, ?> kvSnapshot: state.getKvStates().values()) {
                            if (kvSnapshot instanceof AsynchronousKvStateSnapshot) {
                                hasAsyncStates = true;
                            }
                        }
                    }

                    // operators without state are recorded as null
                    states[i] = state.isEmpty() ? null : state;
                }
            }

            StreamTaskStateList allStates = new StreamTaskStateList(states);

            // acknowledge the checkpoint, synchronously or asynchronously
            if (allStates.isEmpty()) {
                // no state at all: acknowledge without a state handle
                getEnvironment().acknowledgeCheckpoint(checkpointId);
            } else if (!hasAsyncStates) {
                // synchronous case: the snapshot is complete, acknowledge right away
                this.lastCheckpointSize = allStates.getStateSize();
                getEnvironment().acknowledgeCheckpoint(checkpointId, allStates);
            } else {
                // asynchronous case:
                // start a Thread that does the asynchronous materialization and
                // then sends the checkpoint acknowledge
                String threadName = "Materialize checkpoint state " + checkpointId + " - " + getName();
                AsyncCheckpointThread checkpointThread = new AsyncCheckpointThread(
                        threadName, this, cancelables, states, checkpointId);

                synchronized (cancelables) {
                    cancelables.add(checkpointThread);
                }
                checkpointThread.start();
            }
            return true;
        } else {
            return false;
        }
    }
}

这里是对于source而言的checkpoint的调用逻辑，对于中间节点或sink，是要根据barrier情况，通过onEvent来触发triggerCheckpoint的

StreamTask.triggerCheckpoint最关键的步骤是，会对task中每个operator完成state snapshot
最终生成StreamTaskStateList allStates，保存所有的state的list

最终同步或异步的调用

getEnvironment().acknowledgeCheckpoint(checkpointId, allStates);

把state snapshot发送到JobManager去，后面就看看JobManager怎么处理的

同步的方式比较简单，但是一般都是需要异步的做snapshot的，

看看异步的AsyncCheckpointThread

AsyncCheckpointThread

@Override
public void run() {
    try {
        for (StreamTaskState state : states) {
            if (state != null) {
                if (state.getFunctionState() instanceof AsynchronousStateHandle) {
                    AsynchronousStateHandle<Serializable> asyncState = (AsynchronousStateHandle<Serializable>) state.getFunctionState();
                    state.setFunctionState(asyncState.materialize());
                }
                if (state.getOperatorState() instanceof AsynchronousStateHandle) {
                    AsynchronousStateHandle<?> asyncState = (AsynchronousStateHandle<?>) state.getOperatorState();
                    state.setOperatorState(asyncState.materialize());
                }
                if (state.getKvStates() != null) {
                    Set<String> keys = state.getKvStates().keySet();
                    HashMap<String, KvStateSnapshot<?, ?, ?, ?, ?>> kvStates = state.getKvStates();
                    for (String key: keys) {
                        if (kvStates.get(key) instanceof AsynchronousKvStateSnapshot) {
                            AsynchronousKvStateSnapshot<?, ?, ?, ?, ?> asyncHandle = (AsynchronousKvStateSnapshot<?, ?, ?, ?, ?>) kvStates.get(key);
                            kvStates.put(key, asyncHandle.materialize()); //可以看到把真正的存储，delay到这里的materialize去做
                        }
                    }
                }

            }
        }
        StreamTaskStateList allStates = new StreamTaskStateList(states);
        owner.lastCheckpointSize = allStates.getStateSize();
        owner.getEnvironment().acknowledgeCheckpoint(checkpointId, allStates);

        LOG.debug("Finished asynchronous checkpoints for checkpoint {} on task {}", checkpointId, getName());
    }

RuntimeEnvironment

package org.apache.flink.runtime.taskmanager;

/**
 * An implementation of the {@link Environment}.
 */
public class RuntimeEnvironment implements Environment {

    /**
     * Acknowledges the given checkpoint to the JobManager.
     *
     * <p>The state handle is serialized and its size is determined; both are sent along in an
     * {@code AcknowledgeCheckpoint} message. A {@code null} state is acknowledged with a
     * {@code null} serialized state and a size of 0.
     *
     * @param checkpointId The ID of the acknowledged checkpoint.
     * @param state The state handle of the checkpoint, may be {@code null}.
     * @throws RuntimeException if the state handle cannot be serialized or its size cannot
     *                          be fetched
     */
    @Override
    public void acknowledgeCheckpoint(long checkpointId, StateHandle<?> state) {
        final SerializedValue<StateHandle<?>> serializedHandle;
        final long handleSize;

        if (state != null) {
            // try and create a serialized version of the state handle
            try {
                serializedHandle = new SerializedValue<StateHandle<?>>(state);
            } catch (Exception e) {
                throw new RuntimeException("Failed to serialize state handle during checkpoint confirmation", e);
            }

            try {
                handleSize = state.getStateSize();
            }
            catch (Exception e) {
                throw new RuntimeException("Failed to fetch state handle size", e);
            }
        } else {
            serializedHandle = null;
            handleSize = 0;
        }

        // the acknowledgement goes to the job manager
        jobManager.tell(new AcknowledgeCheckpoint(
                jobId,
                executionId,
                checkpointId,
                serializedHandle,
                handleSize));
    }
}

所以可以看到，是把这个ack发送到job manager的，

JobManager

handleCheckpointMessage

/**
* Dedicated handler for checkpoint messages.
*
* For an AcknowledgeCheckpoint message the job's execution graph is looked up in
* currentJobs and the ack is offered first to the checkpoint coordinator and, if that
* coordinator does not accept it, to the savepoint coordinator.
*
* NOTE(review): this excerpt is abridged; the remaining cases and closing braces are
* not shown here.
*
* @param actorMessage The checkpoint actor message.
*/
private def handleCheckpointMessage(actorMessage: AbstractCheckpointMessage): Unit = {
actorMessage match {
  case ackMessage: AcknowledgeCheckpoint =>
    val jid = ackMessage.getJob()
    currentJobs.get(jid) match {
      case Some((graph, _)) =>
        val checkpointCoordinator = graph.getCheckpointCoordinator()
        val savepointCoordinator = graph.getSavepointCoordinator()

        // the ack is only processed when both coordinators exist
        if (checkpointCoordinator != null && savepointCoordinator != null) {
          future {  // process the ack asynchronously on context.dispatcher (see end of block)
            try {
              if (checkpointCoordinator.receiveAcknowledgeMessage(ackMessage)) { // the periodic checkpoint coordinator accepted the ack
                // OK, this is the common case
              }
              else {
                // Try the savepoint coordinator if the message was not addressed
                // to the periodic checkpoint coordinator.
                if (!savepointCoordinator.receiveAcknowledgeMessage(ackMessage)) {
                  log.info("Received message for non-existing checkpoint " +
                    ackMessage.getCheckpointId)
                }
              }
            }
            catch {
              // any failure inside the coordinators is logged rather than propagated
              case t: Throwable =>
                log.error(s"Error in CheckpointCoordinator while processing $ackMessage", t)
            }
          }(context.dispatcher)
        }

CheckpointCoordinator

receiveAcknowledgeMessage

/**
 * Receives an AcknowledgeCheckpoint message and returns whether the
 * message was associated with a pending checkpoint.
 *
 * <p>Under the coordinator lock the matching {@link PendingCheckpoint} is looked up and
 * the task's acknowledgement is recorded on it. Once every task has acknowledged, the
 * pending checkpoint is converted into a {@link CompletedCheckpoint}, added to the
 * completed checkpoint store and removed from the pending map. After the lock has been
 * released, a {@link NotifyCheckpointComplete} message is sent to every task in
 * {@code tasksToCommitTo} that has a current execution attempt.
 *
 * @param message Checkpoint ack from the task manager
 *
 * @return Flag indicating whether the ack'd checkpoint was associated
 * with a pending checkpoint. {@code false} when no pending checkpoint with that ID is
 * registered or when the registered one is already discarded.
 *
 * @throws Exception If the checkpoint cannot be added to the completed checkpoint store.
 */
public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception {

    final long checkpointId = message.getCheckpointId();

    CompletedCheckpoint completed = null;

    // Flag indicating whether the ack message was for a known pending checkpoint.
    // It has to start out as false: the only assignment below sits inside a branch, and
    // without an initial value the variable is not definitely assigned at the return.
    boolean isPendingCheckpoint = false;

    synchronized (lock) {

        // look up the pending checkpoint this ack belongs to
        final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId);

        if (checkpoint != null && !checkpoint.isDiscarded()) {
            isPendingCheckpoint = true;

            // record the ack; continue only if it was accepted and was the last one missing
            final boolean accepted = checkpoint.acknowledgeTask(
                    message.getTaskExecutionId(), message.getState(), message.getStateSize());

            if (accepted && checkpoint.isFullyAcknowledged()) {
                completed = checkpoint.toCompletedCheckpoint();

                // persist the completed checkpoint before forgetting the pending one
                completedCheckpointStore.addCheckpoint(completed);

                pendingCheckpoints.remove(checkpointId);
                rememberRecentCheckpointId(checkpointId);

                dropSubsumedCheckpoints(completed.getTimestamp());

                onFullyAcknowledgedCheckpoint(completed);

                triggerQueuedRequests();
            }
        }
    }

    // send the confirmation messages to the necessary targets. we do this here
    // to be outside the lock scope
    if (completed != null) {
        final long timestamp = completed.getTimestamp();

        for (ExecutionVertex ev : tasksToCommitTo) {
            Execution ee = ev.getCurrentExecutionAttempt();
            if (ee != null) {
                // the same attempt ID addresses the message and is carried inside it
                ExecutionAttemptID attemptId = ee.getAttemptId();
                NotifyCheckpointComplete notifyMessage = new NotifyCheckpointComplete(job, attemptId, checkpointId, timestamp);
                ev.sendMessageToCurrentExecution(notifyMessage, attemptId);
            }
        }

        statsTracker.onCompletedCheckpoint(completed);
    }

    return isPendingCheckpoint;
}

PendingCheckpoint

在acknowledgeTask中，

只是把state，cache在collectedStates中，

/**
 * Records the acknowledgement of one task for this pending checkpoint.
 *
 * <p>The task is removed from the set of not-yet-acknowledged tasks. If it reported state,
 * that state is appended to {@code collectedStates} together with its size, the job vertex
 * ID, the subtask index and the time elapsed since the checkpoint timestamp.
 *
 * @param attemptID execution attempt that sends the acknowledgement
 * @param state serialized state handle reported by the task, may be {@code null}
 * @param stateSize size of the reported state
 * @return {@code true} if the acknowledgement was counted; {@code false} if this checkpoint
 *         is discarded or the attempt was not among the tasks still awaited
 */
public boolean acknowledgeTask(
        ExecutionAttemptID attemptID,
        SerializedValue<StateHandle<?>> state,
        long stateSize) {

    synchronized (lock) {
        if (discarded) {
            return false;
        }

        // removing doubles as the membership test for "was this ack still expected?"
        final ExecutionVertex acknowledgedVertex = notYetAcknowledgedTasks.remove(attemptID);
        if (acknowledgedVertex == null) {
            return false;
        }

        if (state != null) {
            final StateForTask reported = new StateForTask(
                    state,
                    stateSize,
                    acknowledgedVertex.getJobvertexId(),
                    acknowledgedVertex.getParallelSubtaskIndex(),
                    System.currentTimeMillis() - checkpointTimestamp);
            collectedStates.add(reported);
        }

        numAcknowledgedTasks++;
        return true;
    }
}

接着在收到所有的task的ack后，会调用toCompletedCheckpoint

/**
 * Turns this pending checkpoint into a {@link CompletedCheckpoint}.
 *
 * <p>The completed checkpoint receives a copy of the states collected so far and the
 * current time as its completion timestamp. Afterwards {@code dispose(null, false)} is
 * invoked on this pending checkpoint.
 *
 * @return the completed checkpoint
 * @throws IllegalStateException if this checkpoint is discarded, or if some tasks have not
 *         acknowledged yet
 */
public CompletedCheckpoint toCompletedCheckpoint() {
    synchronized (lock) {
        if (discarded) {
            throw new IllegalStateException("pending checkpoint is discarded");
        }
        if (!notYetAcknowledgedTasks.isEmpty()) {
            throw new IllegalStateException("Cannot complete checkpoint while not all tasks are acknowledged");
        }

        final long completionTimestamp = System.currentTimeMillis();
        // hand out a copy so the completed checkpoint does not share this object's list
        final ArrayList<StateForTask> statesCopy = new ArrayList<StateForTask>(collectedStates);

        final CompletedCheckpoint result = new CompletedCheckpoint(
                jobId, checkpointId, checkpointTimestamp, completionTimestamp, statesCopy);

        dispose(null, false);

        return result;
    }
}

把collectedStates封装在CompletedCheckpoint中，返回

最后调用completedCheckpointStore.addCheckpoint，存储这个checkpoint，可以参考

ZooKeeperCompletedCheckpointStore

NotifyCheckpointComplete

通过这个NotifyCheckpointComplete消息，会依次经过TaskManager、Task，最终调到StreamTask.notifyCheckpointComplete

/**
 * Forwards the "checkpoint complete" notification to the state backend and the operators.
 *
 * <p>Nothing is forwarded while the task is not running. Otherwise the state backend is
 * notified first (if it implements {@link CheckpointNotifier}), followed by every non-null
 * operator of the operator chain.
 *
 * @param checkpointId ID of the checkpoint that has been completed
 * @throws Exception propagated from the state backend or from an operator
 */
@Override
public void notifyCheckpointComplete(long checkpointId) throws Exception {
    synchronized (lock) {
        if (!isRunning) {
            LOG.debug("Ignoring notification of complete checkpoint for not-running task {}", getName());
            return;
        }

        LOG.debug("Notification of complete checkpoint for task {}", getName());

        // the state backend comes first, and only if it wants to be notified
        if (stateBackend instanceof CheckpointNotifier) {
            final CheckpointNotifier backendNotifier = (CheckpointNotifier) stateBackend;
            backendNotifier.notifyCheckpointComplete(checkpointId);
        }

        for (StreamOperator<?> chainedOperator : operatorChain.getAllOperators()) {
            if (chainedOperator == null) {
                continue;
            }
            chainedOperator.notifyOfCompletedCheckpoint(checkpointId);
        }
    }
}

这个就是checkpoint的完整的过程

再看看restore的过程

Restore过程

可以看到，在提交job的时候，会调用

executionGraph.restoreLatestCheckpointedState()

/**
 * Restores the state of the latest checkpoint into the vertices of this graph.
 *
 * <p>The work is delegated to the checkpoint coordinator, with both the
 * "error if no checkpoint" and the "all or nothing state" flags switched off. Without a
 * checkpoint coordinator the call does nothing.
 *
 * <p>The recovery of checkpoints might block. Make sure that calls to this method don't
 * block the job manager actor and run asynchronously.
 *
 * @throws Exception propagated from the checkpoint coordinator
 */
public void restoreLatestCheckpointedState() throws Exception {
    synchronized (progressLock) {
        if (checkpointCoordinator == null) {
            return;
        }
        checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false);
    }
}

restoreLatestCheckpointedState

/**
 * Restores the state of the latest completed checkpoint into the given tasks.
 *
 * <p>The completed checkpoint store is recovered first, then the latest checkpoint is
 * fetched from it. Every state entry of that checkpoint is assigned as initial state to
 * the current execution attempt of the matching subtask.
 *
 * @param tasks the job vertices to restore, keyed by job vertex ID
 * @param errorIfNoCheckpoint if {@code true}, a missing checkpoint is an error; if
 *        {@code false}, the method returns without restoring anything
 * @param allOrNothingState if {@code true}, every vertex that received state must have
 *        received it for all of its parallel subtasks
 * @throws IllegalStateException if no checkpoint is available and
 *         {@code errorIfNoCheckpoint} is set, if the checkpoint refers to a job vertex
 *         that is not part of {@code tasks}, or if the all-or-nothing validation fails
 * @throws Exception propagated from the completed checkpoint store
 */
public void restoreLatestCheckpointedState(
        Map<JobVertexID, ExecutionJobVertex> tasks,
        boolean errorIfNoCheckpoint,
        boolean allOrNothingState) throws Exception {

    synchronized (lock) {

        // Recover the checkpoints, then pick the most recent one
        completedCheckpointStore.recover();
        final CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

        // Without this check the loop below fails with a NullPointerException whenever
        // the store holds no checkpoint (e.g. on a fresh job submission).
        if (latest == null) {
            if (errorIfNoCheckpoint) {
                throw new IllegalStateException("No completed checkpoint available");
            }
            return;
        }

        final long recoveryTimestamp = System.currentTimeMillis();

        // number of restored subtask states per vertex; only filled in all-or-nothing mode
        final Map<ExecutionJobVertex, Integer> stateCounts = new HashMap<ExecutionJobVertex, Integer>();

        for (StateForTask state : latest.getStates()) {
            final ExecutionJobVertex vertex = tasks.get(state.getOperatorId());
            if (vertex == null) {
                throw new IllegalStateException(
                        "There is no execution job vertex for the job vertex ID " + state.getOperatorId());
            }

            final Execution exec = vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt();
            exec.setInitialState(state.getState(), recoveryTimestamp);

            if (allOrNothingState) {
                final Integer count = stateCounts.get(vertex);
                stateCounts.put(vertex, count == null ? 1 : count + 1);
            }
        }

        if (allOrNothingState) {
            // validate that either all task vertices have state, or none
            for (Map.Entry<ExecutionJobVertex, Integer> entry : stateCounts.entrySet()) {
                final ExecutionJobVertex vertex = entry.getKey();
                // fewer restored states than parallel subtasks means some subtask was left out
                if (entry.getValue() != vertex.getParallelism()) {
                    throw new IllegalStateException(
                            "The checkpoint contained state only for a subset of tasks for vertex " + vertex);
                }
            }
        }
    }
}

Execution

/**
 * Stores the state this execution attempt should start from.
 *
 * <p>Only the references are stored here; nothing is deserialized or applied.
 *
 * @param initialState serialized state handle to start from
 * @param recoveryTimestamp timestamp associated with the recovery
 * @throws IllegalArgumentException if the attempt is not in state {@code CREATED}
 */
public void setInitialState(SerializedValue<StateHandle<?>> initialState, long recoveryTimestamp) {
    final boolean inCreatedState = (state == ExecutionState.CREATED);
    if (!inCreatedState) {
        throw new IllegalArgumentException("Can only assign operator state when execution attempt is in CREATED");
    }

    this.recoveryTimestamp = recoveryTimestamp;
    this.operatorState = initialState;
}

可以看到这里的recovery，只是把我们从zk中获取的checkpoint中的状态赋值给operatorState

然后再deployToSlot，会把初始state，封装到deployment中去，提交给taskManager

/**
 * Builds the deployment descriptor for this attempt and submits it through the gateway.
 *
 * <p>The descriptor carries the initial operator state and the recovery timestamp of this
 * attempt. NOTE(review): this excerpt is abridged; the returned future is not used in the
 * lines shown.
 *
 * @param slot the slot to deploy into
 * @throws JobException declared by the method signature
 */
public void deployToSlot(final SimpleSlot slot) throws JobException {
    final TaskDeploymentDescriptor deployment =
            vertex.createDeploymentDescriptor(attemptId, slot, operatorState, recoveryTimestamp, attemptNumber);

    final SubmitTask submitMessage = new SubmitTask(deployment);
    final Future<Object> deployAction = gateway.ask(submitMessage, timeout);
}

在TaskManager中的submitTask里面，会创建Task，并执行该task，

Task.run()

// the very last thing before the actual execution starts running is to inject
// the state into the task. the state is non-empty if this is an execution
// of a task that failed but had backuped state from a checkpoint

// get our private reference onto the stack (be safe against concurrent changes)
SerializedValue<StateHandle<?>> operatorState = this.operatorState; // the state to restore, if any
long recoveryTs = this.recoveryTs;

if (operatorState != null) {
    if (invokable instanceof StatefulTask) { // only a stateful task can take restored state
        try {
            StateHandle<?> state = operatorState.deserializeValue(userCodeClassLoader); // deserialize with the user-code class loader
            StatefulTask<?> op = (StatefulTask<?>) invokable;
            StateUtils.setOperatorState(op, state, recoveryTs); // hand the deserialized state to the task
        }
        catch (Exception e) {
            throw new RuntimeException("Failed to deserialize state handle and setup initial operator state.", e);
        }
    }
    else {
        // state was shipped for an invokable that cannot accept it
        throw new IllegalStateException("Found operator state for a non-stateful task invokable");
    }
}

// be memory and GC friendly - since the code stays in invoke() for a potentially long time,
// we clear the reference to the state handle
//noinspection UnusedAssignment
operatorState = null;
this.operatorState = null;

StateUtils

/**
 * Passes a state handle to a stateful task as its initial state.
 *
 * <p>Both arguments arrive with wildcard types; they are cast to a common type
 * parameter so that {@code setInitialState} can be called.
 *
 * @param op the task that receives the state
 * @param state the state handle to pass on
 * @param recoveryTimestamp timestamp associated with the recovery
 * @param <T> state handle type shared by the task and the handle
 * @throws Exception propagated from the task's {@code setInitialState}
 */
public static <T extends StateHandle<?>> void setOperatorState(StatefulTask<?> op,
        StateHandle<?> state, long recoveryTimestamp) throws Exception {
    @SuppressWarnings("unchecked")
    final T handle = (T) state;
    @SuppressWarnings("unchecked")
    final StatefulTask<T> task = (StatefulTask<T>) op;

    task.setInitialState(handle, recoveryTimestamp);
}

StreamTask

/**
 * Remembers the state to restore; the state is only stored here, not applied.
 *
 * @param initialState the states to restore later
 * @param recoveryTimestamp timestamp associated with the recovery
 */
@Override
public void setInitialState(StreamTaskStateList initialState, long recoveryTimestamp) {
    this.recoveryTimestamp = recoveryTimestamp;
    this.lazyRestoreState = initialState;
}

// StreamTask.invoke() calls restoreStateLazy(), which is where the state is actually restored.

/**
 * Applies the state stored in {@code lazyRestoreState} to the operators of the chain.
 *
 * <p>State entry {@code i} is restored into operator {@code i}; a pair is skipped when
 * either the state or the operator is {@code null}. Does nothing when no state is pending.
 *
 * @throws Exception if the state cannot be obtained or an operator fails to restore it
 */
public void restoreStateLazy() throws Exception {
    if (lazyRestoreState == null) {
        return;
    }

    try {
        final StreamOperator<?>[] chainedOperators = operatorChain.getAllOperators();
        final StreamTaskState[] operatorStates = lazyRestoreState.getState(userClassLoader);

        // be GC friendly
        lazyRestoreState = null;

        int position = 0;
        for (StreamTaskState operatorState : operatorStates) {
            final StreamOperator<?> operator = chainedOperators[position++];

            if (operatorState == null || operator == null) {
                continue;
            }
            operator.restoreState(operatorState, recoveryTimestamp);
        }
    }
    catch (Exception e) {
        throw new Exception("Could not restore checkpointed state to operators and functions", e);
    }
}

posted on 2016-11-19 00:11 fxjwind 阅读(4943) 评论(0) 收藏举报

刷新页面返回顶部

fxjwind