TCP拥塞控制算法内核实现剖析(一)
主要源文件:linux-2.6.37/net/ipv4/tcp_cong.c
本文主要分析RENO及TCP拥塞控制基础的实现
======================================================================================================
struct sock *sk 和 struct tcp_sock *tp 的转换
在include/linux/tcp.h中,
/*
 * Convert a generic socket pointer into the TCP-specific socket pointer.
 *
 * This is a plain pointer cast: the same address is returned, retyped as
 * struct tcp_sock * (the const qualifier of the argument is dropped by
 * the cast).
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;

	return tp;
}
给出struct sock *sk,
struct tcp_sock *tp = tcp_sk(sk) ;
tcp_sock结构
/*
 * Excerpt of struct tcp_sock: only the window and congestion-control
 * fields discussed in this article are shown ("..." marks elided members).
 */
struct tcp_sock
{
...
u32 window_clamp ; /* Maximal window to advertise */
u32 rcv_ssthresh ; /* Current window clamp */
u32 rcv_wnd ; /* Current receiver window */
...
/* snd_wll records, at the time the send window is updated, the first
 * sequence number of the segment that caused that update.
 * It is mainly used the next time to decide whether the send window
 * needs to be updated.
 */
u32 snd_wll ; /* Sequence for window update */
u32 snd_wnd ; /* Size of the send window, taken directly from the TCP header of segments received from the peer */
/* max_window: Maximal window ever seen from peer (largest window the peer has advertised) */
/* snd_una: First byte we want an ack for (left edge of the send window) */
u32 max_window ; u32 snd_una ;
...
/*
 * Slow start and congestion control
 */
u32 snd_ssthresh ; /* Slow start size threshold */
u32 snd_cwnd ; /* Sending congestion window */
/* NOTE(review): the original note describes this as the number of segments
 * already sent within the current congestion window; the functions in this
 * file use it as an accumulator that triggers snd_cwnd++ - confirm wording.
 */
u32 snd_cwnd_cnt ; /* Linear increase counter */
u32 snd_cwnd_clamp ; /* Do not allow snd_cwnd to grow above this */
...
u32 mss_cache ; /* cached effective mss , not including SACKS */
u32 bytes_acked ; /* Appropriate Byte Counting - RFC3465 */
...
}
拥塞避免算法关键部分
/*
 * Additive-increase step of congestion avoidance.
 *
 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w).
 * In practice snd_cwnd_cnt is bumped on every call; once it has reached
 * @w it is reset and snd_cwnd grows by one, capped by snd_cwnd_clamp.
 */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
	/* Threshold not reached yet: just account for this call. */
	if (tp->snd_cwnd_cnt < w) {
		tp->snd_cwnd_cnt++;
		return;
	}

	/* Counter reached w: restart it and open the window if allowed. */
	tp->snd_cwnd_cnt = 0;
	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
慢启动算法
/*
 * Slow start: grow the congestion window in response to an incoming ACK.
 *
 * @tp: TCP socket whose snd_cwnd / snd_cwnd_cnt / bytes_acked are updated.
 *
 * The per-ACK credit (cnt) is added to snd_cwnd_cnt; every time the
 * accumulated credit reaches snd_cwnd, the window is opened by one
 * segment, never beyond snd_cwnd_clamp.
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	int cnt; /* increase in packets */

	/* RFC3465: ABC slow start
	 * Increase only after a full MSS of bytes is acked
	 *
	 * TCP sender SHOULD increase cwnd by the number of
	 * previously unacknowledged bytes ACKed by each incoming
	 * acknowledgment, provided the increase is not more than L
	 */
	/* Less than one MSS of data has been acked so far: do nothing yet. */
	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
		return;

	/* Above sysctl_tcp_max_ssthresh the growth is throttled rather than
	 * stopped; the sender stays in slow start.
	 * NOTE(review): this appears to be RFC 3742 limited slow start - confirm.
	 */
	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start */
	else
		cnt = tp->snd_cwnd;			/* exponential increase */

	/* RFC3465: ABC
	 * We MAY increase by 2 if discovered delayed ack
	 */
	/* At least two MSS were acked at once (e.g. the receiver delays ACKs),
	 * so this ACK is credited twice.
	 */
	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2 * tp->mss_cache)
		cnt <<= 1;
	tp->bytes_acked = 0;

	/* Add this ACK's credit, then convert each full snd_cwnd worth of
	 * credit into one extra segment of window.
	 */
	tp->snd_cwnd_cnt += cnt;
	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd_cnt -= tp->snd_cwnd;
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	}
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
代表拥塞算法的结构体
#define TCP_CA_NAME_MAX 16struct tcp_congestion_ops {
struct list_head list ;
unsigned long flags ;
/* initialize private data (optional) */
void (*init) (struct sock *sk) ;
/* cleanup private data (optional) */
void (*release) (struct sock *sk) ;
/* return slow start threshold (required) */
u32 (*ssthresh) (struct sock *sk) ;
/* lower bound for congestion window (optional) */
u32 (*min_cwnd) (const struct sock *sk) ;
/* do new cwnd calculation (required) */
void (*cong_avoid) (struct sock *sk , u32 ack , u32 in_flight ) ;
/* call before changing ca_state (optional) */
void (*set_state) (struct sock *sk , u8 new_state) ;
/* call when cwnd event occurs (optional) */
void (*cwnd_event) (struct sock *sk , enum tcp_ca_event ev) ;
/* new value of cwnd after loss (optional) */
u32 (*undo_cwnd) (struct sock *sk) ;
/* hook for packet ack accounting (optional) */
void (*pkts_acked) (struct sock *sk , u32 num_acked , s32 rtt_us) ;
/* get info for inet_diag (optional) */
void (*get_info) (struct sock *sk , u32 ext , struct sk_buff *skb) ;
char name[TCP_CA_NAME_MAX] ;
struct module *owner ;
}
在tcp_cong.c中,有如下全局变量:
/* Threshold read by tcp_slow_start() for limited slow start; the check there
 * requires a value > 0, so the initial 0 leaves that branch disabled.
 */
int sysctl_tcp_max_ssthresh = 0 ;
/* define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) */
/* Held by tcp_register_congestion_control() around its lookup and insertion on tcp_cong_list. */
static DEFINE_SPINLOCK( tcp_cong_list_lock ) ;
static LIST_HEAD( tcp_cong_list ) ; // list of TCP congestion control algorithms; its elements are struct tcp_congestion_ops
/*
BUG_ON(cond) ; 如果BUG_ON中的条件为真就调用BUG(),它输出一些调试信息,然后调用panic函数挂起系统。
char *strncpy( char *dest , const char *src , size_t n ) ;
它与strcpy的不同之处在于最多只复制n个字符,而不是一直复制到结尾的'\0'(含'\0')为止。
当src的长度小于n时,dest中剩余的空间用'\0'填充;否则只复制n个字符到dest,并且不会在末尾添加'\0'。
因此当src的长度大于等于n时,需要调用者自己保证dest以'\0'结尾。
rcu_read_lock() // 读者在读取由RCU保护的共享数据之前调用该函数,标记进入读端临界区。
rcu_read_unlock() // 该函数与rcu_read_lock配对使用,标记读者退出读端临界区。
*/
对拥塞控制算法的一些操作(读写增减注册等)
/*
 * Get current default congestion control.
 *
 * Copies the name of the first entry on tcp_cong_list into @name.
 * Up to TCP_CA_NAME_MAX bytes are written, so @name must be at least
 * that large; strncpy() adds no terminating NUL when the stored name
 * fills all TCP_CA_NAME_MAX bytes.
 */
void tcp_get_default_congestion_control(char *name)
{
	struct tcp_congestion_ops *first;

	/* We will always have reno */
	BUG_ON(list_empty(&tcp_cong_list));

	rcu_read_lock();
	first = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
	strncpy(name, first->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}
struct sock——representation of sockets
struct inet_sock——representation of INET sockets
struct inet_connection_sock——INET connection oriented sockets
struct tcp_sock——tcp sockets
以上几种socket越分越细,比如inet_connection_sock是在inet_sock上的扩展,具有自己特有的属性。
tcp_sock是TCP协议专用的socket表示,它在struct inet_connection_sock的基础上进行扩展,主要增加了滑动窗口协议、拥塞避免算法等TCP专有的属性。
/*
 * Excerpt of struct inet_connection_sock: only the congestion-control
 * members are shown ("..." marks elided members).
 */
struct inet_connection_sock {
...
// Pluggable congestion control hook: the tcp_congestion_ops in use for this connection
const struct tcp_congestion_ops *icsk_ca_ops ;
...
/* 16 u32 words of storage.
 * NOTE(review): presumably private state of the attached congestion
 * control algorithm - confirm against the full header.
 */
u32 icsk_ca_priv[16] ;
/* Size of icsk_ca_priv in bytes */
#define ICSK_CA_PRIV_SIZE (16*sizeof(u32))
}
举例://假设有一个已经初始化了的struct sock *sk
/* Obtain the inet_connection_sock view of sk via inet_csk(). */
struct inet_connection_sock *icsk = inet_csk( sk ) ;
printk(KERN_INFO "%s" , icsk->icsk_ca_ops->name) ; // name of the congestion control algorithm used by the current connection
/*
 * Excerpt of struct inet_sock: only the addressing fields are shown
 * ("..." marks elided members).
 * NOTE(review): the __be32/__be16 types suggest these values are kept in
 * big-endian (network) byte order - confirm against the full header.
 */
struct inet_sock {
...
/* Socket demultiplex comparisons on incoming packets */
__be32 inet_daddr ;
__be16 inet_dport ;
__be32 inet_saddr ;
__be16 inet_sport ;
__be16 inet_num ; // local port
__be32 inet_rcv_saddr ; // Bound local IPv4 addr
...
}
/*
 * Build list of non-restricted congestion control values.
 *
 * @buf:    output buffer; receives the names of all registered algorithms
 *          whose flags contain TCP_CONG_NON_RESTRICTED, separated by a
 *          single space.
 * @maxlen: size of @buf in bytes.
 *
 * The result is always NUL-terminated when @maxlen > 0; names that do not
 * fit are truncated or dropped.
 * NOTE(review): "non-restricted" presumably means the algorithm may be
 * selected without extra privilege - confirm against the setsockopt path.
 */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	/* No room even for the terminator: nothing may be written. */
	if (maxlen == 0)
		return;

	/* Needed: if no entry matches, the loop writes nothing and the
	 * caller must still get a valid empty string.
	 */
	*buf = '\0';

	rcu_read_lock();
	/* The list is modified with list_add_tail_rcu(), so readers must use
	 * the RCU iterator, as tcp_ca_find() does.
	 */
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		/* Skip algorithms that are not flagged as non-restricted. */
		if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
			continue;

		offs += snprintf(buf + offs, maxlen - offs, "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		/* snprintf() returns the length it wanted to write; once the
		 * buffer is full, stop so that maxlen - offs cannot wrap.
		 */
		if (offs >= maxlen)
			break;
	}
	rcu_read_unlock();
}
/*
 * Simple linear search, don't expect many entries!
 *
 * Walks tcp_cong_list and returns the first entry whose name compares
 * equal to @name, or NULL when there is none.
 */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
	struct tcp_congestion_ops *ca;

	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (strcmp(ca->name, name) != 0)
			continue;
		return ca;
	}

	return NULL;
}
/*
 * Attach new congestion control algorithm to the list
 * of available options.
 *
 * Returns 0 on success, -EINVAL when @ca lacks the ssthresh or
 * cong_avoid callback, and -EEXIST when an algorithm with the same
 * name is already on tcp_cong_list.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int err = 0;

	/* all algorithms must implement ssthresh and cong_avoid ops */
	if (!(ca->ssthresh && ca->cong_avoid)) {
		printk(KERN_ERR "TCP %s does not implement required ops\n",
		       ca->name);
		return -EINVAL;
	}

	spin_lock(&tcp_cong_list_lock);

	if (tcp_ca_find(ca->name)) {
		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
		/* Must not return here: the lock is still held and has to be
		 * released on the common exit path, otherwise it would deadlock.
		 */
		err = -EEXIST;
		goto unlock;
	}

	list_add_tail_rcu(&ca->list, &tcp_cong_list);
	printk(KERN_INFO "TCP %s registered\n", ca->name);

unlock:
	spin_unlock(&tcp_cong_list_lock);
	return err;
}
======================================================================================================
浙公网安备 33010602011771号