DPVS Study Notes: synproxy Implementation

Three-way handshake diagram (figure omitted).

A normal synproxy request goes through four stages:

  1. The client sends a SYN. The LB answers this first handshake itself and does not forward it to the RS. In the SYN+ACK it sends back, the seq is generated by the syn-cookie algorithm, and (when the clamp-window option is enabled) the window is set to 0 so the handshake cannot carry data; as a consequence TCP Fast Open is not supported.
  2. When the client returns the ACK, the LB decodes the acknowledged seq; if it matches the syn cookie, the traffic is legitimate. The LB then performs its own three-way handshake with the backend RS and passes the window size through. Because the LB proxied the handshake, it must also record the seq difference (delta) between the cookie it issued and the RS's real initial sequence number.
  3. During data transfer, besides its normal full-NAT work, the LB compensates seq/ack numbers with this delta (see the sketch after this list).
  4. On connection close, the session is cleaned up as usual.
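
A minimal sketch of the seq/ack compensation from stage 3, for illustration only: the struct and helper names (seq_adjust, adjust_seq_outbound, adjust_ack_inbound) are assumptions, not DPVS's actual code, which keeps the delta in cp->syn_proxy_seq and also fixes up the TCP checksum afterwards.

#include <stdint.h>
#include <arpa/inet.h>       /* ntohl / htonl */
#include <netinet/tcp.h>     /* struct tcphdr */

/* Assumed per-connection state: delta = isn_cookie - isn_rs, i.e. the cookie
 * ISN the proxy gave the client minus the ISN the RS actually chose
 * (DPVS keeps this in cp->syn_proxy_seq.delta). */
struct seq_adjust {
    uint32_t delta;
};

/* RS -> client: shift the RS's seq into the cookie sequence space the client
 * has been using since the proxied handshake. */
static void adjust_seq_outbound(struct tcphdr *th, const struct seq_adjust *adj)
{
    th->seq = htonl(ntohl(th->seq) + adj->delta);
    /* the TCP checksum must be updated afterwards as well */
}

/* client -> RS: the client acknowledges cookie-space sequence numbers, so
 * subtract the delta before forwarding the packet to the RS. */
static void adjust_ack_inbound(struct tcphdr *th, const struct seq_adjust *adj)
{
    th->ack_seq = htonl(ntohl(th->ack_seq) - adj->delta);
}

Unsigned 32-bit wraparound makes the add/subtract correct even when the cookie is numerically smaller than the RS's ISN.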

Packet processing flow for the six handshakes:

1. First handshake between client and synproxy: client ——> synproxy (SYN packet)

__dp_vs_pre_routing --> dp_vs_synproxy_syn_rcv

__dp_vs_pre_routing

static int __dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf,
                    const struct inet_hook_state *state, int af)
{
   ...

    /* Synproxy: defence synflood */
    // if the transport protocol is TCP, handle the client's first handshake
    if (IPPROTO_TCP == iph.proto) {
        int v = INET_ACCEPT;
        if (0 == dp_vs_synproxy_syn_rcv(af, mbuf, &iph, &v))
            return v;
    }

    return INET_ACCEPT;
}

dp_vs_synproxy_syn_rcv

int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf,
        const struct dp_vs_iphdr *iph, int *verdict)
{
    int ret;
    struct dp_vs_service *svc = NULL;
    struct tcphdr *th, _tcph;
    struct dp_vs_synproxy_opt tcp_opt;
    struct netif_port *dev;
    struct rte_ether_hdr *eth;
    struct rte_ether_addr ethaddr;

    // th points to the TCP header
    th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
    if (unlikely(NULL == th))
        goto syn_rcv_out;

    // pure SYN of the first handshake, a matching service exists, and synproxy is enabled on it
    if (th->syn && !th->ack && !th->rst && !th->fin &&
            (svc = dp_vs_service_lookup(af, iph->proto, &iph->daddr, th->dest, 0,
                NULL, NULL, rte_lcore_id())) && (svc->flags & DP_VS_SVC_F_SYNPROXY)) {
        /* if service's weight is zero (non-active realserver),
         * do nothing and drop the packet */
        // if the service weight is 0 there is no available backend: return INET_DROP
        if (svc->weight == 0) {
            dp_vs_estats_inc(SYNPROXY_NO_DEST);
            goto syn_rcv_out;
        }

        /* drop packet from blacklist */
        // source address is blacklisted: return INET_DROP
        if (dp_vs_blklst_lookup(iph->af, iph->proto, &iph->daddr,
                    th->dest, &iph->saddr)) {
            goto syn_rcv_out;
        }

        /* drop packet if not in whitelist */
        // source address is not whitelisted: return INET_DROP
        if (!dp_vs_whtlst_allow(iph->af, iph->proto, &iph->daddr, th->dest, &iph->saddr)) {
            goto syn_rcv_out;
        }
    } else {
        return 1;
    }

    /* mbuf will be reused and ether header will be set.
     * FIXME: to support non-ether packets. */
    if (mbuf->l2_len != sizeof(struct rte_ether_hdr))
        goto syn_rcv_out;

    /* update statistics */
    dp_vs_estats_inc(SYNPROXY_SYN_CNT);

    /* set tx offload flags */
    // sanity check on the port id
    assert(mbuf->port <= NETIF_MAX_PORTS);
    // look up the netif device for this port and validate it
    dev = netif_port_get(mbuf->port);
    if (unlikely(!dev)) {
        RTE_LOG(ERR, IPVS, "%s: device eth%d not found\n",
                __func__, mbuf->port);
        goto syn_rcv_out;
    }
    // set mbuf offload flags according to the NIC's hardware checksum offload capability
    if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) {
        if (af == AF_INET)
            mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
        else
            mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6);
    }

    ...

syn_rcv_out:
    /* drop and destroy the packet */
    *verdict = INET_DROP;
    return 0;
}

2. Second handshake between client and synproxy: synproxy ——> client (SYN+ACK packet)

dp_vs_synproxy_syn_rcv rewrites the incoming SYN (the first handshake packet) in place, adjusting its fields and options, and sends it back to the client as the SYN+ACK.

dp_vs_synproxy_syn_rcv

int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf,
        const struct dp_vs_iphdr *iph, int *verdict)
{
    ...

    /* reuse mbuf */
    // reuse the mbuf: it is modified in place and sent straight back to the client as the SYN+ACK
    syn_proxy_reuse_mbuf(af, mbuf, th, &tcp_opt);

    /* set L2 header and send the packet out
     * It is noted that "ipv4_xmit" should not used here,
     * because mbuf is reused. */
    // set up the L2 (Ethernet) header
    eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len);
    if (unlikely(!eth)) {
        RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__);
        goto syn_rcv_out;
    }
    // swap the original source and destination MAC addresses
    memcpy(&ethaddr, &eth->s_addr, sizeof(struct rte_ether_addr));
    memcpy(&eth->s_addr, &eth->d_addr, sizeof(struct rte_ether_addr));
    memcpy(&eth->d_addr, &ethaddr, sizeof(struct rte_ether_addr));

    // send the packet back to the client via netif_xmit
    if (unlikely(EDPVS_OK != (ret = netif_xmit(mbuf, dev)))) {
        RTE_LOG(ERR, IPVS, "%s: netif_xmit failed -- %s\n",
                __func__, dpvs_strerror(ret));
    /* should not set verdict to INET_DROP since netif_xmit
     * always consume the mbuf while INET_DROP means mbuf'll
     * be free in INET_HOOK.*/
    }

    *verdict = INET_STOLEN;
    return 0;

syn_rcv_out:
    /* drop and destroy the packet */
    *verdict = INET_DROP;
    return 0;
}

syn_proxy_reuse_mbuf

static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf,
                                 struct tcphdr *th,
                                 struct dp_vs_synproxy_opt *opt)
{
    uint32_t isn;
    uint16_t tmpport;
    int iphlen;

    // get the IP header length
    if (AF_INET6 == af)
        iphlen = sizeof(struct ip6_hdr);
    else
        iphlen = ip4_hdrlen(mbuf);

    // length check: make sure the whole IP + TCP header is present
    if (mbuf_may_pull(mbuf, iphlen + (th->doff << 2)) != 0)
        return;

    /* deal with tcp options */
    // parse and set the TCP options: MSS, window scale, timestamp, SACK
    syn_proxy_parse_set_opts(mbuf, th, opt);

    /* get cookie */
    // generate the cookie (ISN) with the syn-cookie algorithm
    if (AF_INET6 == af)
        isn = syn_proxy_cookie_v6_init_sequence(mbuf, th, opt);
    else
        isn = syn_proxy_cookie_v4_init_sequence(mbuf, th, opt);

    /* set syn-ack flag */
    // set the SYN|ACK flags (byte 13 of the TCP header)
    ((uint8_t *)th)[13] = 0x12;

    /* exchage ports */
    // swap the dest and source ports
    tmpport = th->dest;
    th->dest = th->source;
    th->source = tmpport;
    /* set window size to zero if enabled */
    // set the advertised window to 0 so no data can be carried during the handshake
    if (dp_vs_synproxy_ctrl_clwnd && !dp_vs_synproxy_ctrl_defer)
        th->window = 0;
    /* set seq(cookie) and ack_seq */
    // set seq and ack_seq: ack_seq is the client's seq plus 1, and the returned seq is the syn cookie just computed
    th->ack_seq = htonl(ntohl(th->seq) + 1);
    th->seq = htonl(isn);

    /* exchage addresses */
    // swap the source and destination IP addresses and recompute the checksums
    if (AF_INET6 == af) {
        struct in6_addr tmpaddr;
        struct ip6_hdr *ip6h = ip6_hdr(mbuf);

        tmpaddr = ip6h->ip6_src;
        ip6h->ip6_src = ip6h->ip6_dst;
        ip6h->ip6_dst = tmpaddr;
        ip6h->ip6_hlim = dp_vs_synproxy_ctrl_synack_ttl;

        if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) {
            mbuf->l3_len = (void *)th - (void *)ip6h;
            mbuf->l4_len = (th->doff << 2);
            th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, mbuf->l3_len, IPPROTO_TCP);
        } else {
            if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
                return;
            tcp6_send_csum((struct rte_ipv6_hdr*)ip6h, th);
        }
    } else {
        uint32_t tmpaddr;
        struct iphdr *iph = (struct iphdr*)ip4_hdr(mbuf);

        tmpaddr = iph->saddr;
        iph->saddr = iph->daddr;
        iph->daddr = tmpaddr;
        iph->ttl = dp_vs_synproxy_ctrl_synack_ttl;
        iph->tos = 0;

        /* compute checksum */
        if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) {
            mbuf->l3_len = iphlen;
            mbuf->l4_len = (th->doff << 2);
            th->check = rte_ipv4_phdr_cksum((struct rte_ipv4_hdr*)iph, mbuf->ol_flags);
        } else {
            if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
                return;
            tcp4_send_csum((struct rte_ipv4_hdr*)iph, th);
        }

        // if the NIC cannot offload the IP checksum, compute it in software via ip4_send_csum
        if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM))
            iph->check = 0;
        else
            ip4_send_csum((struct rte_ipv4_hdr*)iph);
    }
}
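
The cookie itself is produced by syn_proxy_cookie_v4_init_sequence / syn_proxy_cookie_v6_init_sequence (and verified later by the matching *_cookie_check functions), which are not shown here. For intuition only, a grossly simplified stand-alone encode/check pair follows; the hash, the layout and every name in it are assumptions, not DPVS's real algorithm, which additionally folds in a time counter and encodes the negotiated MSS/wscale/SACK options into the cookie bits.

#include <stdint.h>

static uint32_t cookie_secret;      /* would be filled with random bytes at startup */

/* toy mix function standing in for a real keyed hash (e.g. SipHash) */
static uint32_t cookie_hash(uint32_t saddr, uint32_t daddr,
                            uint16_t sport, uint16_t dport, uint32_t seed)
{
    uint32_t h = seed ^ saddr;
    h = (h * 0x9e3779b1u) ^ daddr;
    h = (h * 0x9e3779b1u) ^ ((uint32_t)sport << 16 | dport);
    return h * 0x9e3779b1u;
}

/* Encode: the ISN placed in the SYN+ACK; data carries e.g. an MSS table index. */
static uint32_t cookie_encode(uint32_t saddr, uint32_t daddr,
                              uint16_t sport, uint16_t dport,
                              uint32_t isn_client, uint8_t data)
{
    return cookie_hash(saddr, daddr, sport, dport, cookie_secret)
           + isn_client + data;
}

/* Check: on the final ACK, ack_seq - 1 must decode back to a small valid index. */
static int cookie_check(uint32_t saddr, uint32_t daddr,
                        uint16_t sport, uint16_t dport,
                        uint32_t isn_client, uint32_t cookie, uint8_t *data)
{
    uint32_t diff = cookie
                  - cookie_hash(saddr, daddr, sport, dport, cookie_secret)
                  - isn_client;
    if (diff > 7)                   /* only a handful of MSS table entries exist */
        return 0;
    *data = (uint8_t)diff;
    return 1;
}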

3. Third handshake between client and synproxy: client ——> synproxy (ACK packet)

__dp_vs_in --> tcp_conn_sched --> dp_vs_synproxy_ack_rcv

The INET_HOOK_PRE_ROUTING hook point runs its registered callbacks dp_vs_pre_routing and dp_vs_in in order. When the client's ACK arrives, dp_vs_pre_routing returns INET_ACCEPT, so processing continues into dp_vs_in.
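
The verdicts seen throughout this path (INET_ACCEPT, INET_STOLEN, INET_DROP) are what the hook framework acts on. Below is a simplified model of how such a hook chain consumes them; the names hook_verdict, hook_fn and run_hooks are assumptions for illustration, not the real INET_HOOK implementation.

#include <rte_mbuf.h>

/* Simplified hook chain: the real DPVS code iterates registered hook ops
 * in priority order; dp_vs_pre_routing runs before dp_vs_in. */
enum hook_verdict { HOOK_ACCEPT, HOOK_DROP, HOOK_STOLEN };

typedef enum hook_verdict (*hook_fn)(struct rte_mbuf *mbuf);

static int run_hooks(hook_fn *hooks, int nhooks, struct rte_mbuf *mbuf)
{
    int i;

    for (i = 0; i < nhooks; i++) {
        switch (hooks[i](mbuf)) {
        case HOOK_ACCEPT:               /* keep going to the next hook */
            continue;
        case HOOK_STOLEN:               /* the hook took ownership of the mbuf
                                         * (e.g. synproxy sent it back as SYN+ACK
                                         * or cached it), so do not free it here */
            return 0;
        case HOOK_DROP:
        default:
            rte_pktmbuf_free(mbuf);     /* drop: the framework frees the mbuf */
            return -1;
        }
    }
    return 0;                           /* every hook accepted: continue processing */
}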

__dp_vs_in

static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
    ...

    /* packet belongs to existing connection ? */
    // look up the connection in the session table
    conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop, &peer_cid);

    if (unlikely(drop)) {
        RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.\n", __func__);
        return INET_DROP;
    }

    /*
     * The connection is not locally found, however the redirect is found so
     * forward the packet to the remote redirect owner core.
     */
    if (cid != peer_cid) {
        /* recover mbuf.data_off to outer Ether header */
        rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr));

        return dp_vs_redirect_pkt(mbuf, peer_cid);
    }

    if (unlikely(!conn)) {
        /* try schedule RS and create new connection */
        // no session found: conn_sched picks a backend rs for this request and creates a connection
        if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
            /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
            return verdict;
        }

        /* only SNAT triggers connection by inside-outside traffic. */
        // SNAT mode means an internal server visiting an external service (internal server --> dpvs --> external server), so dir = DPVS_CONN_DIR_OUTBOUND
        if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            dir = DPVS_CONN_DIR_OUTBOUND;
        else
            dir = DPVS_CONN_DIR_INBOUND;
    } else {
        /* assert(conn->dest != NULL); */
        if (prot->conn_expire_quiescent && (conn->flags & DPVS_CONN_F_EXPIRE_QUIESCENT) &&
                conn->dest && (!dp_vs_dest_is_avail(conn->dest) ||
                    rte_atomic16_read(&conn->dest->weight) == 0)) {
            RTE_LOG(INFO, IPVS, "%s: the conn is quiescent, expire it right now,"
                    " and drop the packet!\n", __func__);
            prot->conn_expire_quiescent(conn);
            dp_vs_conn_put(conn);
            return INET_DROP;
        }
    }

    ...
}

tcp_conn_sched

static int tcp_conn_sched(struct dp_vs_proto *proto,
                          const struct dp_vs_iphdr *iph,
                          struct rte_mbuf *mbuf,
                          struct dp_vs_conn **conn,
                          int *verdict)
{
    struct tcphdr *th, _tcph;
    struct dp_vs_service *svc;

    assert(proto && iph && mbuf && conn && verdict);

    // get the TCP header; only pointer arithmetic, no data is copied
    th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
    if (unlikely(!th)) {
        *verdict = INET_DROP;
        return EDPVS_INVPKT;
    }

    /* Syn-proxy step 2 logic: receive client's 3-handshake ack packet */
    /* When synproxy disabled, only SYN packets can arrive here.
     * So don't judge SYNPROXY flag here! If SYNPROXY flag judged, and syn_proxy
     * got disbled and keepalived reloaded, SYN packets for RS may never be sent. */
    if (dp_vs_synproxy_ack_rcv(iph->af, mbuf, th, proto, conn, iph, verdict) == 0) {
        /* Attention: First ACK packet is also stored in conn->ack_mbuf */
        return EDPVS_PKTSTOLEN;
    }

    /* only TCP-SYN without other flag can be scheduled */
    // only a pure SYN may schedule a new connection; anything else is rejected
    if (!th->syn || th->ack || th->fin || th->rst) {
#ifdef CONFIG_DPVS_IPVS_DEBUG
        char dbuf[64], sbuf[64];
        const char *daddr, *saddr;

        daddr = inet_ntop(iph->af, &iph->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::";
        saddr = inet_ntop(iph->af, &iph->saddr, sbuf, sizeof(sbuf)) ? sbuf : "::";
        RTE_LOG(DEBUG, IPVS,
                "%s: [%d] try sched non-SYN packet: [%c%c%c%c] %s/%d->%s/%d\n",
                __func__, rte_lcore_id(),
                th->syn ? 'S' : '.', th->fin ? 'F' : '.',
                th->ack ? 'A' : '.', th->rst ? 'R' : '.',
                saddr, ntohs(th->source), daddr, ntohs(th->dest));
#endif

        /* Drop tcp packet which is send to vip and !vport */
        if (g_defence_tcp_drop &&
                (svc = dp_vs_vip_lookup(iph->af, iph->proto,
                                    &iph->daddr, rte_lcore_id()))) {
            dp_vs_estats_inc(DEFENCE_TCP_DROP);
            *verdict = INET_DROP;
            return EDPVS_INVPKT;
        }

        *verdict = INET_ACCEPT;
        return EDPVS_INVAL;
    }

    // look up the service by destination address and port; if none is found the packet is not scheduled
    svc = dp_vs_service_lookup(iph->af, iph->proto, &iph->daddr, th->dest,
                               0, mbuf, NULL, rte_lcore_id());
    if (!svc) {
        /* Drop tcp packet which is send to vip and !vport */
        if (g_defence_tcp_drop &&
                (svc = dp_vs_vip_lookup(iph->af, iph->proto,
                                   &iph->daddr, rte_lcore_id()))) {
            dp_vs_estats_inc(DEFENCE_TCP_DROP);
            *verdict = INET_DROP;
            return EDPVS_INVPKT;
        }
        *verdict = INET_ACCEPT;
        return EDPVS_NOSERV;
    }

    // pick an rs for the service and create the connection
    *conn = dp_vs_schedule(svc, iph, mbuf, false);
    if (!*conn) {
        *verdict = INET_DROP;
        return EDPVS_RESOURCE;
    }

    return EDPVS_OK;
}

dp_vs_synproxy_ack_rcv

/* Syn-proxy step 2 logic: receive client's Ack
 * Receive client's 3-handshakes ack packet, do cookie check and then
 * send syn to rs after creating a session */
int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf,
        struct tcphdr *th, struct dp_vs_proto *pp,
        struct dp_vs_conn **cpp,
        const struct dp_vs_iphdr *iph, int *verdict)
{
    int res;
    struct dp_vs_synproxy_opt opt;
    struct dp_vs_service *svc;
    int res_cookie_check;

    /* Do not check svc syn-proxy flag, as it may be changed after syn-proxy step 1. */
    if (!th->syn && th->ack && !th->rst && !th->fin &&
            (svc = dp_vs_service_lookup(af, iph->proto, &iph->daddr,
                           th->dest, 0, NULL, NULL, rte_lcore_id()))) {
        if (dp_vs_synproxy_ctrl_defer &&
                !syn_proxy_ack_has_data(mbuf, iph, th)) {
            /* Update statistics */
            dp_vs_estats_inc(SYNPROXY_NULL_ACK);
            /* We get a pure ack when expecting ack packet with payload, so
             * have to drop it */
            *verdict = INET_DROP;
            return 0;
        }

        // syn cookie check: a mismatch means attack or invalid traffic, so the packet is dropped
        if (AF_INET6 == af)
            res_cookie_check = syn_proxy_v6_cookie_check(mbuf,
                    ntohl(th->ack_seq) - 1, &opt);
        else
            res_cookie_check = syn_proxy_v4_cookie_check(mbuf,
                    ntohl(th->ack_seq) - 1, &opt);
        if (!res_cookie_check) {
            /* Update statistics */
            dp_vs_estats_inc(SYNPROXY_BAD_ACK);
            /* Cookie check failed, drop the packet */
            RTE_LOG(DEBUG, IPVS, "%s: syn_cookie check failed seq=%u\n", __func__,
                    ntohl(th->ack_seq) - 1);
            if (EDPVS_OK == syn_proxy_send_tcp_rst(af, mbuf)) {
                *verdict = INET_STOLEN;
            } else {
                *verdict = INET_DROP;
            }
            return 0;
        }

        ...
}

4. First handshake between synproxy and rs: synproxy ——> rs (SYN packet)

If the syn-cookie check succeeds, synproxy enters its second stage: the LB calls dp_vs_schedule to pick a backend rs and then initiates the handshake with it.

dp_vs_synproxy_ack_rcv --> syn_proxy_send_rs_syn

dp_vs_synproxy_ack_rcv

int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf,
        struct tcphdr *th, struct dp_vs_proto *pp,
        struct dp_vs_conn **cpp,
        const struct dp_vs_iphdr *iph, int *verdict)
{
    ...

        /* Update statistics */
        dp_vs_estats_inc(SYNPROXY_OK_ACK);

        /* Let the virtual server select a real server for the incoming connetion,
         * and create a connection entry */
        // dp_vs_schedule does backend scheduling for the new connection and picks an rs
        *cpp = dp_vs_schedule(svc, iph, mbuf, 1);
        if (unlikely(!*cpp)) {
            RTE_LOG(WARNING, IPVS, "%s: ip_vs_schedule failed\n", __func__);
            /* FIXME: What to do when virtual service is available but no destination
             * available for a new connetion: send an icmp UNREACHABLE ? */
            *verdict = INET_DROP;
            return 0;
        }

        if (opt.wscale_ok)
            (*cpp)->wscale_vs = dp_vs_synproxy_ctrl_wscale;

        /* Do nothing but print a error msg when fail, because session will be
         * correctly freed in dp_vs_conn_expire */
        // syn_proxy_send_rs_syn starts the lb --> rs handshake
        if (EDPVS_OK != (res = syn_proxy_send_rs_syn(af, th, *cpp, mbuf, pp, &opt))) {
            RTE_LOG(ERR, IPVS, "%s: syn_proxy_send_rs_syn failed -- %s\n",
                    __func__, dpvs_strerror(res));
        }

        /* Count in the ack packet (STOLEN by synproxy) */
        dp_vs_stats_in(*cpp, mbuf);

        /* Active session timer, and dec refcnt.
         * Also steal the mbuf, and let caller return immediately */
        dp_vs_conn_put(*cpp);
        *verdict = INET_STOLEN;
        return 0;
    }

    return 1;
}

syn_proxy_send_rs_syn

/* Create syn packet and send it to rs.
 * We also store syn mbuf in cp if syn retransmition is turned on. */
static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
        struct dp_vs_conn *cp, struct rte_mbuf *mbuf,
        struct dp_vs_proto *pp, struct dp_vs_synproxy_opt *opt)
{
    int tcp_hdr_size;
    struct rte_mbuf *syn_mbuf, *syn_mbuf_cloned;
    struct rte_mempool *pool;
    struct tcphdr *syn_th;

    if (!cp->packet_xmit) {
        RTE_LOG(WARNING, IPVS, "%s: packet_xmit is null\n", __func__);
        return EDPVS_INVAL;
    }

    /* Allocate mbuf from device mempool */
    pool = get_mbuf_pool(cp, DPVS_CONN_DIR_INBOUND);
    if (unlikely(!pool)) {
        //RTE_LOG(WARNING, IPVS, "%s: %s\n", __func__, dpvs_strerror(EDPVS_NOROUTE));
        return EDPVS_NOROUTE;
    }

    // allocate syn_mbuf from the mempool; it will be sent to the backend rs
    syn_mbuf = rte_pktmbuf_alloc(pool);
    if (unlikely(!syn_mbuf)) {
        //RTE_LOG(WARNING, IPVS, "%s: %s\n", __func__, dpvs_strerror(EDPVS_NOMEM));
        return EDPVS_NOMEM;
    }
    // reset mbuf userdata so no stale route info is carried
    mbuf_userdata_reset(syn_mbuf);  /* make sure "no route info" */

    /* Reserve space for tcp header */
    // reserve space for the TCP header and options by prepending into the mbuf headroom
    tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MAXSEG
            + (opt->tstamp_ok ? TCPOLEN_TSTAMP_APPA : 0)
            + (opt->wscale_ok ? TCP_OLEN_WSCALE_ALIGNED : 0)
            /* SACK_PERM is in the palce of NOP NOP of TS */
            + ((opt->sack_ok && !opt->tstamp_ok) ? TCP_OLEN_SACKPERMITTED_ALIGNED : 0));
    syn_th = (struct tcphdr *)rte_pktmbuf_prepend(syn_mbuf, tcp_hdr_size);
    if (!syn_th) {
        rte_pktmbuf_free(syn_mbuf);
        //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM));
        return EDPVS_NOROOM;
    }

    /* Set up tcp header */
    // build the TCP header
    memset(syn_th, 0, tcp_hdr_size);
    syn_th->source = th->source;
    syn_th->dest = th->dest;
    syn_th->seq = htonl(ntohl(th->seq) - 1);
    syn_th->ack_seq = 0;
    *(((uint16_t *) syn_th) + 6) = htons(((tcp_hdr_size >> 2) << 12) | /*TH_SYN*/ 0x02);
    /* FIXME: what window should we use */
    syn_th->window = htons(5000);
    syn_th->check = 0;
    syn_th->urg_ptr = 0;
    syn_th->urg = 0;
    // build the TCP options of the SYN packet
    syn_proxy_syn_build_options((uint32_t *)(syn_th + 1), opt);

    // build the IP header
    if (AF_INET6 == af) {
        struct ip6_hdr *ack_ip6h;
        struct ip6_hdr *syn_ip6h;

        /* Reserve space for ipv6 header */
        syn_ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(syn_mbuf,
                sizeof(struct ip6_hdr));
        if (!syn_ip6h) {
            rte_pktmbuf_free(syn_mbuf);
            //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM));
            return EDPVS_NOROOM;
        }

        ack_ip6h = (struct ip6_hdr *)ip6_hdr(mbuf);

        syn_ip6h->ip6_vfc = 0x60;  /* IPv6 */
        syn_ip6h->ip6_src = ack_ip6h->ip6_src;
        syn_ip6h->ip6_dst = ack_ip6h->ip6_dst;
        syn_ip6h->ip6_plen = htons(tcp_hdr_size);
        syn_ip6h->ip6_nxt = NEXTHDR_TCP;
        syn_ip6h->ip6_hlim = IPV6_DEFAULT_HOPLIMIT;

        syn_mbuf->l3_len = sizeof(*syn_ip6h);
    } else {
        struct iphdr *ack_iph;
        struct iphdr *syn_iph;

        /* Reserve space for ipv4 header */
        syn_iph = (struct iphdr *)rte_pktmbuf_prepend(syn_mbuf, sizeof(struct rte_ipv4_hdr));
        if (!syn_iph) {
            rte_pktmbuf_free(syn_mbuf);
            //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM));
            return EDPVS_NOROOM;
        }

        ack_iph = (struct iphdr *)ip4_hdr(mbuf);
        *((uint16_t *) syn_iph) = htons((4 << 12) | (5 << 8) | (ack_iph->tos & 0x1E));
        syn_iph->tot_len = htons(syn_mbuf->pkt_len);
        syn_iph->frag_off = htons(RTE_IPV4_HDR_DF_FLAG);
        syn_iph->ttl = 64;
        syn_iph->protocol = IPPROTO_TCP;
        syn_iph->saddr = ack_iph->saddr;
        syn_iph->daddr = ack_iph->daddr;

        syn_mbuf->l3_len = sizeof(*syn_iph);

        /* checksum is done by fnat_in_handler */
        syn_iph->check = 0;
    }

    /* Save syn_mbuf if syn retransmission is on */
    // dp_vs_synproxy_ctrl_syn_retry is the SYN retransmission count for the proxy-initiated connection; if greater than 0, cache a copy of the SYN for retransmission
    if (dp_vs_synproxy_ctrl_syn_retry > 0) {
        syn_mbuf_cloned = mbuf_copy(syn_mbuf, pool);
        if (unlikely(!syn_mbuf_cloned)) {
            rte_pktmbuf_free(syn_mbuf);
            //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOMEM));
            return EDPVS_NOMEM;
        }

        mbuf_userdata_reset(syn_mbuf_cloned);
        cp->syn_mbuf = syn_mbuf_cloned;
        sp_dbg_stats32_inc(sp_syn_saved);
        rte_atomic32_set(&cp->syn_retry_max, dp_vs_synproxy_ctrl_syn_retry);
    }

    /* TODO: Save info for fast_response_xmit */

    /* Count in the syn packet */
    dp_vs_stats_in(cp, mbuf);

    /* If xmit failed, syn_mbuf will be freed correctly */
    cp->packet_xmit(pp, cp, syn_mbuf);

    return EDPVS_OK;
}

5. Second handshake between synproxy and rs: rs ——> synproxy (SYN+ACK packet)

__dp_vs_in --> dp_vs_synproxy_synack_rcv

__dp_vs_in

static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
  ...

    if (conn->flags & DPVS_CONN_F_SYNPROXY) {
        if (dir == DPVS_CONN_DIR_INBOUND) {
            /* Filter out-in ack packet when cp is at SYN_SENT state.
             * Drop it if not a valid packet, store it otherwise */
            if (0 == dp_vs_synproxy_filter_ack(mbuf, conn, prot,
                                               &iph, &verdict)) {
                dp_vs_stats_in(conn, mbuf);
                dp_vs_conn_put(conn);
                return verdict;
            }

            /* "Reuse" synproxy sessions.
             * "Reuse" means update syn_proxy_seq struct
             * and clean ack_mbuf etc. */
            if (0 != dp_vs_synproxy_ctrl_conn_reuse) {
                if (0 == dp_vs_synproxy_reuse_conn(af, mbuf, conn, prot,
                                                   &iph, &verdict)) {
                    dp_vs_stats_in(conn, mbuf);
                    dp_vs_conn_put(conn);
                    return verdict;
                }
            }
        } else {
            /* Syn-proxy 3 logic: receive syn-ack from rs */
            if (dp_vs_synproxy_synack_rcv(mbuf, conn, prot,
                                          iph.len, &verdict) == 0) {
                dp_vs_stats_out(conn, mbuf);
                dp_vs_conn_put(conn);
                return verdict;
            }
        }
    }

    // TCP state transition
    if (prot->state_trans) {
        err = prot->state_trans(prot, conn, mbuf, dir);
        if (err != EDPVS_OK)
            RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
    }
    conn->old_state = conn->state;

    /* holding the conn, need a "put" later. */
    // transmit according to the traffic direction
    if (dir == DPVS_CONN_DIR_INBOUND)
        return xmit_inbound(mbuf, prot, conn);
    else
        return xmit_outbound(mbuf, prot, conn);
}

dp_vs_synproxy_synack_rcv

/* Syn-proxy step 3 logic: receive rs's Syn/Ack.
 * Update syn_proxy_seq.delta and send stored ack mbufs to rs. */
int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp,
        struct dp_vs_proto *pp, int th_offset, int *verdict)
{
    struct tcphdr _tcph, *th;
    struct dp_vs_synproxy_ack_pakcet *tmbuf, *tmbuf2;
    struct list_head save_mbuf;
    struct dp_vs_dest *dest = cp->dest;

    // th points to the start of the TCP header
    th = mbuf_header_pointer(mbuf, th_offset, sizeof(_tcph), &_tcph);
    if (unlikely(!th)) {
        *verdict = INET_DROP;
        return 0;
    }

#ifdef CONFIG_DPVS_IPVS_DEBUG
    RTE_LOG(DEBUG, IPVS, "%s: seq = %u ack_seq = %u %c%c%c cp->is_synproxy = %u "
            "cp->state = %u\n", __func__, ntohl(th->seq), ntohl(th->ack_seq),
            (th->syn) ? 'S' : '-',
            (th->ack) ? 'A' : '-',
            (th->rst) ? 'R' : '-',
            cp->flags & DPVS_CONN_F_SYNPROXY, cp->state);
#endif

    INIT_LIST_HEAD(&save_mbuf);

    // the reply must be SYN+ACK without RST, synproxy must be enabled on the connection,
    // and the connection must still be in DPVS_TCP_S_SYN_SENT
    if ((th->syn) && (th->ack) && (!th->rst) &&
            (cp->flags & DPVS_CONN_F_SYNPROXY) &&
            (cp->state == DPVS_TCP_S_SYN_SENT)) {
        cp->wscale_rs = syn_proxy_parse_wscale_opt(mbuf, th);
        // record the sequence-number difference in syn_proxy_seq.delta
        cp->syn_proxy_seq.delta = ntohl(cp->syn_proxy_seq.isn) - ntohl(th->seq);
        // move the connection state to DPVS_TCP_S_ESTABLISHED
        cp->state = DPVS_TCP_S_ESTABLISHED;
        // set the connection timeout
        dp_vs_conn_set_timeout(cp, pp);
        dpvs_time_rand_delay(&cp->timeout, 1000000);
        // update the connection counters on dest
        if (dest) {
            rte_atomic32_inc(&dest->actconns);
            rte_atomic32_dec(&dest->inactconns);
            cp->flags &= ~DPVS_CONN_F_INACTIVE;
            dp_vs_dest_detected_alive(dest);
        }

        /* Save tcp sequence for fullnat/nat, inside to outside */
        // save rs_end_seq and rs_end_ack for fullnat/nat
        if (DPVS_FWD_MODE_NAT == cp->dest->fwdmode ||
                DPVS_FWD_MODE_FNAT == cp->dest->fwdmode) {
            cp->rs_end_seq = htonl(ntohl(th->seq) + 1);
            cp->rs_end_ack = th->ack_seq;
#ifdef CONFIG_DPVS_IPVS_DEBUG
            RTE_LOG(DEBUG, IPVS, "%s: packet from rs, seq = %u, ack_seq = %u, port %u => %u\n",
                    __func__, ntohl(th->seq), ntohl(th->ack_seq),
                    ntohs(th->source), ntohs(th->dest));
#endif
        }

        /* TODO: ip_vs_synproxy_save_fast_xmit_info ? */

        /* Free stored syn mbuf, no need for retransmition any more */
        // syn_mbuf holds the cached lb --> rs SYN; the handshake is complete so it can be freed
        if (cp->syn_mbuf) {
            rte_pktmbuf_free(cp->syn_mbuf);
            cp->syn_mbuf = NULL;
            sp_dbg_stats32_dec(sp_syn_saved);
        }

        // the client's cached ACK should be on cp->ack_mbuf; an empty list here is unexpected
        if (list_empty(&cp->ack_mbuf)) {
            /*
             * FIXME: Maybe a bug here, print err msg and go.
             * Attention: cp->state has been changed and we
             * should still DROP the syn/ack mbuf.
             */
            RTE_LOG(ERR, IPVS, "%s: got ack_mbuf NULL pointer: ack-saved = %u\n",
                    __func__, cp->ack_num);
            *verdict = INET_DROP;
            return 0;
        }

        /* Window size has been set to zero in the syn-ack packet to Client.
         * If get more than one ack packet here,
         * it means client has sent a window probe after one RTO.
         * The probe will be forward to RS and RS will respond a window update.
         * So DPVS has no need to send a window update.
         */
        // send a window update to the client (its window was clamped to 0 in the SYN+ACK)
        if (dp_vs_synproxy_ctrl_clwnd && !dp_vs_synproxy_ctrl_defer && cp->ack_num <= 1)
            syn_proxy_send_window_update(tuplehash_out(cp).af, mbuf, cp, pp, th);

        list_for_each_entry_safe(tmbuf, tmbuf2, &cp->ack_mbuf, list) {
            list_del_init(&tmbuf->list);
            cp->ack_num--;
            list_add_tail(&tmbuf->list, &save_mbuf);
        }
        assert(cp->ack_num == 0);

        // use packet_xmit to send the cached mbufs to the rs, including the ACK of the third handshake
        list_for_each_entry_safe(tmbuf, tmbuf2, &save_mbuf, list) {
            list_del_init(&tmbuf->list);
            /* syn_mbuf will be freed correctly if xmit failed */
            cp->packet_xmit(pp, cp, tmbuf->mbuf);
            /* free dp_vs_synproxy_ack_pakcet */
            rte_mempool_put(this_ack_mbufpool, tmbuf);
            sp_dbg_stats32_dec(sp_ack_saved);
        }

        // the rs's SYN+ACK must not be forwarded to the client, so return INET_DROP here
        *verdict = INET_DROP;
        return 0;
    } else if ((th->rst) &&
            (cp->flags & DPVS_CONN_F_SYNPROXY) &&
            (cp->state == DPVS_TCP_S_SYN_SENT)) {
        RTE_LOG(DEBUG, IPVS, "%s: get rst from rs, seq = %u ack_seq = %u\n",
                __func__, ntohl(th->seq), ntohl(th->ack_seq));
        dp_vs_dest_detected_dead(dest);

        /* Count the delta of seq */
        // RST from the rs: record the seq delta and set the connection state to DPVS_TCP_S_CLOSE
        cp->syn_proxy_seq.delta = ntohl(cp->syn_proxy_seq.isn) - ntohl(th->seq);
        cp->state = DPVS_TCP_S_CLOSE;
        cp->timeout.tv_sec = pp->timeout_table[cp->state];
        dpvs_time_rand_delay(&cp->timeout, 1000000);
        th->seq = htonl(ntohl(th->seq) + 1);
        //syn_proxy_seq_csum_update ?

        return 1;
    }
    return 1;
}

6. Third handshake between synproxy and rs: synproxy ——> rs (ACK packet)

When the conn is created in __dp_vs_in --> tcp_conn_sched --> dp_vs_synproxy_ack_rcv, the client's third-handshake ACK is stored on the connection's ack_mbuf list;

__dp_vs_in --> dp_vs_synproxy_synack_rcv then takes the ACK packets off ack_mbuf and sends them to the rs:

int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp,
        struct dp_vs_proto *pp, int th_offset, int *verdict)
{
    ...

        // use packet_xmit to send the cached mbufs to the rs, including the ACK of the third handshake
        list_for_each_entry_safe(tmbuf, tmbuf2, &save_mbuf, list) {
            list_del_init(&tmbuf->list);
            /* syn_mbuf will be freed correctly if xmit failed */
            cp->packet_xmit(pp, cp, tmbuf->mbuf);
            /* free dp_vs_synproxy_ack_pakcet */
            rte_mempool_put(this_ack_mbufpool, tmbuf);
            sp_dbg_stats32_dec(sp_ack_saved);
        }
	...
}

References:

https://www.jianshu.com/p/c303d0cf0cdd

https://blog.51cto.com/u_13959518/7126121

https://blog.csdn.net/zjx345438858/article/details/108106143
