混合慢启动Hybrid Slow Start

参考论文(并非RFC文档):Taming the elephants: New TCP slow start

 标准的慢启动在长肥管道--带宽延迟乘积(BDP)较大网络环境下表现不好,不好的原因主要有两个:

    1) 标准慢启动的拥塞窗口指数式的增长方式过于激进容易导致大量丢包,丢包恢复性能损耗太大。

    2) 某些操作系统为了减轻系统负载而对慢启动做了私有优化,这些优化恰好会拖慢慢启动之后的丢包恢复过程,导致数据包恢复慢。

    总结:这些问题都源于慢启动过程过于盲目,不能及时预测拥塞,从而导致大量丢包。因此混合慢启动机制的主要作用,是在慢启动阶段找到一个“合理”的时机(“safe” exit point)退出慢启动,进入拥塞避免状态。
混合慢启动是怎么找到“safe” exit point的?

在Hybrid Slow Start算法中给出了两种判断机制用来退出慢启动进入拥塞避免,分别是ACKs train length和Increase in packet delays。

  •  通过ACK train     ACK Train的长度为在一个RTT周期内,紧密相邻的ACK报文到达的时间间隔之和,内核默认间隔不大于2ms的一系列ACK报文为ACK Train。 每个RTT周期,计算一次ACK Train长度,与估算的最小路径发送延迟进行对比 
    The ACK train length is measured by calculating the sum of inter-arrival times of all the closely spaced ACKs within an RTT round. 
    The train length is strongly affected by the bottleneck bandwidth, routing delays and buffer sizes along the path, 
    and is easily stretched out by congestion caused by cross traffic in the path, 
    so by estimating the train length we can reliably find a safe exit point of Slow Start.
    
  •  通过RTT delay   Increase in packet delays的测量会受到bursty transmission的影响,所以只测一个RTT中刚开始的几个数据包的往返时间来避免bursty transmission的影响
    Increase in packet delays during Slow Start may indicate the possibility of the bottleneck router being congested.
    

混合慢启动(Hybrid Slow Start)目前在cubic拥塞控制算法中引入

数据结构:

#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start (bit flags tested against hystart_detect) */
#define HYSTART_ACK_TRAIN	0x1// slow-start exit detector 1: ACK-train length
#define HYSTART_DELAY		0x2// slow-start exit detector 2: increase in delay

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8// samples counted per round before the delay test is evaluated
#define HYSTART_DELAY_MIN	(4000U)	/* 4 ms */
#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
/* Clamp x (usec) into [HYSTART_DELAY_MIN, HYSTART_DELAY_MAX]. */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

/*
 * CUBIC tunables. The first five are not referenced by the code shown in
 * this excerpt, so only their declared defaults are documented here.
 */
static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

/* Non-zero enables hybrid slow start (checked in bictcp_cong_avoid/bictcp_acked). */
static int hystart __read_mostly = 1;
/* Bitmask of enabled detectors: HYSTART_ACK_TRAIN and/or HYSTART_DELAY. */
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
/* hystart_update() is only invoked once snd_cwnd >= this value. */
static int hystart_low_window __read_mostly = 16;
/* Largest gap (usec) between two ACKs that still extends an ACK train. */
static int hystart_ack_delta_us __read_mostly = 2000;
/* BIC TCP Parameters */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when updated last_cwnd */
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point
				   from the beginning of the current epoch */
	u32	delay_min;	/* min delay (usec): lowest RTT sample seen so far,
				   0 means no sample has been recorded yet */
	u32	epoch_start;	/* beginning of an epoch; bictcp_acked() treats
				   0 as "not set" */
	u32	ack_cnt;	/* number of acks */
	u32	tcp_cwnd;	/* estimated tcp cwnd */
	u16	unused;
	u8	sample_cnt;	/* number of samples to decide curr_rtt */
	u8	found;		/* the exit point is found? */
	u32	round_start;	/* beginning of each round */
	u32	end_seq;	/* end_seq of the round */
	u32	last_ack;	/* last time when the ACK spacing is close */
	u32	curr_rtt;	/* the minimum rtt of current round, i.e. the
				   smallest RTT among this round's samples */
};

混合慢启动位于内核的Cubic拥塞算法模块中,默认情况下此功能是开启状态(hystart=1),同时采用以上介绍的ACK Train和报文延时两种方法进行检查(hystart_detect)。只有拥塞窗口不小于16(hystart_low_window)时才会执行混合慢启动检测。

默认组成ACK-Train的报文间隔不超过2毫秒(hystart_ack_delta_us = 2000,单位为微秒)。

函数实现

1、在Cubic的初始化函数bictcp_init中,如果启用了Hystart(hystart为真),初始化Hystart相关参数。

2、其次,当套接口进入TCP_CA_Loss拥塞状态时,表明之后要开始SlowStart恢复阶段,此时调用bictcp_hystart_reset函数重设hystart参数,开启新一轮Hystart检测。注意这里还会先调用bictcp_reset,其中会将found设置为0,表明还未发现SlowStart的退出点。

/*
 * Congestion-state change hook.
 *
 * Only a transition into TCP_CA_Loss is acted upon: the CUBIC private
 * state is reset first, then a fresh HyStart round is started. Every
 * other state is ignored.
 */
static void bictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state != TCP_CA_Loss)
		return;

	bictcp_reset(inet_csk_ca(sk));
	bictcp_hystart_reset(sk);
}
/*
 * Begin a new HyStart measurement round for @sk.
 *
 * Records the start time of the round and of the current ACK train,
 * marks the sequence number that ends the round (snd_nxt), and clears
 * the per-round RTT sampling state.
 */
static inline void bictcp_hystart_reset(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);

    /*
     * hystart_update() compares these stamps against bictcp_clock_us(sk)
     * and the microsecond tunable hystart_ack_delta_us, so they must be
     * taken from the same microsecond clock.
     */
    ca->round_start = ca->last_ack = bictcp_clock_us(sk);
    ca->end_seq = tp->snd_nxt;
    /*
     * hystart_update() only ever lowers curr_rtt ("if (curr_rtt > delay)"),
     * so the round must start from the largest value; starting from 0
     * would pin curr_rtt at 0 and disable the delay-increase detector.
     */
    ca->curr_rtt = ~0U;
    ca->sample_cnt = 0; /* no RTT samples counted yet in this round */
}

3、在SlowStart阶段,ACK确认序号在hystart的结束序号(end_seq)之后,表明当前的检测已经完成,开始新一轮的Hystart检测,重设hystart参数。如下bictcp_cong_avoid函数。

/*
 * Congestion-avoidance hook: grow the congestion window for newly
 * acknowledged data.
 *
 * @sk:    socket being processed
 * @ack:   acknowledged sequence number
 * @acked: number of newly acknowledged packets
 *
 * Does nothing unless the flow is cwnd-limited. While in slow start, a
 * new HyStart round is started once @ack passes the round's end_seq, and
 * tcp_slow_start() consumes the ACKs; whatever it returns as left over
 * is then fed to the CUBIC update and additive-increase step.
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct bictcp *cubic = inet_csk_ca(sk);
	struct tcp_sock *tcp = tcp_sk(sk);
	u32 leftover = acked;

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tcp)) {
		bool round_done = hystart && after(ack, cubic->end_seq);

		if (round_done)
			bictcp_hystart_reset(sk);

		leftover = tcp_slow_start(tcp, leftover);
		if (leftover == 0)
			return;
	}

	bictcp_update(cubic, tcp->snd_cwnd, leftover);
	tcp_cong_avoid_ai(tcp, cubic->cnt, leftover);
}

Hybrid Slow Start实现的核心部分

Hystart检测
在tcp_ack处理ACK报文的过程中,将根据ACK报文确认的数据情况,清理重传队列,之后调用拥塞控制的pkts_acked函数指针,对于Cubic而言,其为bictcp_acked。目前,Cubic和Hystart仅使用到了ack_sample结构中的rtt_us字段。

static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
                   u32 prior_snd_una, struct tcp_sacktag_state *sack)
{
    ...
    if (icsk->icsk_ca_ops->pkts_acked) {
        struct ack_sample sample = { .pkts_acked = pkts_acked,
                         .rtt_us = sack->rate->rtt_us,
                         .in_flight = last_in_flight };

        icsk->icsk_ca_ops->pkts_acked(sk, &sample);
    }

bictcp_acked,变量delay_min保存最小的RTT值。只有在拥塞窗口不小于hystart定义的最低窗口值16(hystart_low_window)时,hystart才开始执行。

/*
 * pkts_acked hook: track the minimum RTT and drive HyStart.
 *
 * @sk:     socket being processed
 * @sample: ACK sample; only its rtt_us field is used here
 *
 * Samples with a negative RTT are ignored, as are samples arriving less
 * than HZ jiffies after a non-zero epoch_start. A zero RTT is rounded up
 * to 1 so that 0 can keep meaning "no sample yet" in delay_min.
 */
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tcp = tcp_sk(sk);
	struct bictcp *cubic = inet_csk_ca(sk);
	u32 rtt;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (cubic->epoch_start != 0 &&
	    (s32)(tcp_jiffies32 - cubic->epoch_start) < HZ)
		return;

	rtt = sample->rtt_us ? sample->rtt_us : 1;

	/* first sample ever, or the path delay went down */
	if (cubic->delay_min == 0 || rtt < cubic->delay_min)
		cubic->delay_min = rtt;

	/* HyStart only runs while searching for an exit point in slow start */
	if (cubic->found)
		return;
	if (!tcp_in_slow_start(tcp))
		return;
	if (!hystart)
		return;
	/* ... and only once cwnd has reached the configured threshold */
	if (tcp->snd_cwnd < hystart_low_window)
		return;

	hystart_update(sk, rtt);
}
/*
 * Run the enabled HyStart detectors for one RTT sample.
 *
 * @sk:    socket being processed
 * @delay: RTT sample in usec; the caller bictcp_acked() passes
 *         sample->rtt_us, rounded up to at least 1
 *
 * When a detector fires, ca->found is set and snd_ssthresh is set to the
 * current snd_cwnd.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		/* Current time; presumably the arrival time of the ACK being
		 * processed, since this runs from the ACK path - TODO confirm.
		 */
		u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		/* Only an ACK arriving within hystart_ack_delta_us of the
		 * previous closely spaced ACK extends the train.
		 */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			/* remember when the latest ACK of the train arrived */
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 *
			 * The train is measured from round_start. Once it
			 * spans more than the threshold (minimum RTT plus ACK
			 * delay, halved when the socket is not paced), the
			 * exit point is considered found.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;
			/* Time elapsed since round_start versus the threshold
			 * computed above.
			 */
			if ((s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
					 now - ca->round_start, threshold,
					 ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		/* NOTE(review): curr_rtt is only ever lowered here, so the
		 * per-round reset has to start it above any real sample; a
		 * reset value of 0 would leave it stuck at 0.
		 */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		/* Still collecting: count the sample until
		 * HYSTART_MIN_SAMPLES (8) have been seen this round.
		 */
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			/* Enough samples: decide whether to leave slow start.
			 * If the round's minimum RTT exceeds the overall
			 * minimum RTT by more than delay_min/8, clamped to
			 * [HYSTART_DELAY_MIN, HYSTART_DELAY_MAX] (4..16 ms),
			 * the delay increase is taken as a sign of congestion.
			 * Only when delay_min/8 falls inside the clamp range
			 * does this equal 9/8 * delay_min.
			 */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tp->snd_cwnd);
				/* leave slow start: set ssthresh to the current cwnd */
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}

Hybrid Slow Start for High-Bandwidth and Long-Distance Networks。 

 

posted @ 2024-02-22 21:24  codestacklinuxer  阅读(18)  评论(0编辑  收藏  举报