代码改变世界

七 vxlan发包流程

2017-03-23 15:01  yrpapa  阅读(748)  评论(0)    收藏  举报

vxlan发包流程

发包处理函数最终会调用ovs_vport_send函数,该函数再通过vport->ops->send调用对应vport_ops的send函数。

 

/*
 * Transmit @skb on the net_device backing @vport.
 *
 * A packet longer than the device MTU that is not GSO is logged
 * (rate-limited), counted in tx_errors and freed.  Any other packet is
 * handed to the vport's send operation.
 */
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
    int mtu = vport->dev->mtu;

    /* Over-MTU and not segmentable: account for it and drop. */
    if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
        net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
                     vport->dev->name,
                     packet_length(skb), mtu);
        vport->dev->stats.tx_errors++;
        kfree_skb(skb);
        return;
    }

    skb->dev = vport->dev;
    /* For a VXLAN vport this is the vxlan_xmit handler of
     * ovs_vxlan_netdev_vport_ops (per the original author's note).
     */
    vport->ops->send(skb);
}

 

/*
 * Transmit entry point for a VXLAN device.
 *
 * The packet is encapsulated only when the device has
 * VXLAN_F_COLLECT_METADATA set and the skb carries tunnel info marked
 * IP_TUNNEL_INFO_TX.  Otherwise it is counted in tx_dropped and freed.
 * NETDEV_TX_OK is returned in both cases.
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    struct vxlan_dev *vxlan = netdev_priv(dev);
    const struct ip_tunnel_info *info = skb_tunnel_info(skb);

    skb_reset_mac_header(skb);

    /* Anything lacking usable TX tunnel metadata is dropped. */
    if (!(vxlan->flags & VXLAN_F_COLLECT_METADATA) ||
        !info || !(info->mode & IP_TUNNEL_INFO_TX)) {
        dev->stats.tx_dropped++;
        kfree_skb(skb);
        return NETDEV_TX_OK;
    }

    vxlan_xmit_one(skb, dev, NULL, false);
    return NETDEV_TX_OK;
}

 

/*
 * Return the tunnel info stored in the metadata dst referenced from the
 * skb's OVS GSO control block, or NULL when no such dst is attached.
 */
static inline struct ip_tunnel_info *ovs_skb_tunnel_info(struct sk_buff *skb)
{
    struct ip_tunnel_info *tun_info = NULL;

    if (likely(OVS_GSO_CB(skb)->tun_dst))
        tun_info = &OVS_GSO_CB(skb)->tun_dst->u.tun_info;

    return tun_info;
}
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
               struct vxlan_rdst *rdst, bool did_rsc)
{
    struct dst_cache *dst_cache;
    struct ip_tunnel_info *info;
    struct vxlan_dev *vxlan = netdev_priv(dev);
    struct sock *sk;
    struct rtable *rt = NULL;
    const struct iphdr *old_iph;
    union vxlan_addr *dst;
    union vxlan_addr remote_ip, local_ip;
    union vxlan_addr *src;
    struct vxlan_metadata _md;
    struct vxlan_metadata *md = &_md;
    __be16 src_port = 0, dst_port;
    __be32 vni, label;
    __be16 df = 0;
    __u8 tos, ttl;
    int err;
    u32 flags = vxlan->flags;
    bool udp_sum = false;
    bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));

    info = skb_tunnel_info(skb); // 通道信息

    if (rdst) { //不进入此分支
        dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
        vni = rdst->remote_vni;
        dst = &rdst->remote_ip;
        src = &vxlan->cfg.saddr;
        dst_cache = &rdst->dst_cache;
    } else {
        if (!info) {
            WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
                  dev->name);
            goto drop;
        }
        dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; //目的端口优先从tunnel info中获取
        vni = vxlan_tun_id_to_vni(info->key.tun_id); //VNI(通道号)信息从tunnel info中获取,取32位
        remote_ip.sa.sa_family = ip_tunnel_info_af(info); // 区分IP V4/V6
        if (remote_ip.sa.sa_family == AF_INET) {
            remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; // 源地址
            local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src; // 目的地址
        } else {
            remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
            local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
        }
        dst = &remote_ip;
        src = &local_ip;
        dst_cache = &info->dst_cache;
    }

    if (vxlan_addr_any(dst)) { // 目的IP地址为全零,当前不支持 
if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan); return; } goto drop; } old_iph = ip_hdr(skb); ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(dst)) ttl = 1; tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); label = vxlan->cfg.label; src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); //计算源端口 if (info) { ttl = info->key.ttl; tos = info->key.tos; label = info->key.label; udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); if (info->options_len) md = ip_tunnel_info_opts(info); } else { md->gbp = skb->mark; } if (dst->sa.sa_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); if (!sock4) goto drop; sk = sock4->sock->sk; rt = vxlan_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, dst->sin.sin_addr.s_addr, &src->sin.sin_addr.s_addr, dst_cache, info); //路由表查找 if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &dst->sin.sin_addr.s_addr); dev->stats.tx_carrier_errors++; goto tx_error; } if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to %pI4\n", &dst->sin.sin_addr.s_addr); dev->stats.collisions++; goto rt_tx_error; } /* Bypass encapsulation if the destination is local */ if (!info && rt->rt_flags & RTCF_LOCAL && !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { //正常场景不进入此分支 struct vxlan_dev *dst_vxlan; ip_rt_put(rt); dst_vxlan = vxlan_find_vni(vxlan->net, vni, dst->sa.sa_family, dst_port, vxlan->flags); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); return; } if (!info) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) df = htons(IP_DF); tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? 
: ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr), vni, md, flags, udp_sum); if (err < 0) goto xmit_tx_error; udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr, dst->sin.sin_addr.s_addr, tos, ttl, df, src_port, dst_port, xnet, !udp_sum); // 最终调用内核的ip_local_out发出 #if IS_ENABLED(CONFIG_IPV6) } else { // ipv6 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct dst_entry *ndst; u32 rt6i_flags; if (!sock6) goto drop; sk = sock6->sock->sk; ndst = vxlan6_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, label, &dst->sin6.sin6_addr, &src->sin6.sin6_addr, dst_cache, info); if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); dev->stats.tx_carrier_errors++; goto tx_error; } if (ndst->dev == dev) { netdev_dbg(dev, "circular route to %pI6\n", &dst->sin6.sin6_addr); dst_release(ndst); dev->stats.collisions++; goto tx_error; } /* Bypass encapsulation if the destination is local */ rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; if (!info && rt6i_flags & RTF_LOCAL && !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan; dst_release(ndst); dst_vxlan = vxlan_find_vni(vxlan->net, vni, dst->sa.sa_family, dst_port, vxlan->flags); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); return; } if (!info) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), vni, md, flags, udp_sum); if (err < 0) { dst_release(ndst); return; } udp_tunnel6_xmit_skb(ndst, sk, skb, dev, &src->sin6.sin6_addr, &dst->sin6.sin6_addr, tos, ttl, label, src_port, dst_port, !udp_sum); #endif } return; drop: dev->stats.tx_dropped++; goto tx_free; xmit_tx_error: /* skb is already freed. */ skb = NULL; rt_tx_error: ip_rt_put(rt); tx_error: dev->stats.tx_errors++; tx_free: dev_kfree_skb(skb); }
/*
 * Look up the IPv4 output route for an encapsulated packet.
 *
 * @saddr is in/out: it seeds the flow key and is rewritten with the
 * source address chosen by the lookup.  When the dst cache is usable the
 * cached route is returned directly, and a fresh lookup result is stored
 * back into @dst_cache.  Returns the route, or the ERR_PTR value produced
 * by ip_route_output_key().
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
                      struct sk_buff *skb, int oif, u8 tos,
                      __be32 daddr, __be32 *saddr,
                      struct dst_cache *dst_cache,
                      const struct ip_tunnel_info *info)
{
    bool cacheable = ip_tunnel_dst_cache_usable(skb, info);
    struct rtable *route = NULL;
    struct flowi4 key;

    /* A non-zero tos without tunnel metadata bypasses the cache. */
    if (!info && tos)
        cacheable = false;

    if (cacheable) {
        route = dst_cache_get_ip4(dst_cache, saddr);
        if (route)
            return route;
    }

    /* Build the flow key for a full routing lookup. */
    memset(&key, 0, sizeof(key));
    key.daddr = daddr;
    key.saddr = *saddr;
    key.flowi4_proto = IPPROTO_UDP;
    key.flowi4_mark = skb->mark;
    key.flowi4_tos = RT_TOS(tos);
    key.flowi4_oif = oif;

    route = ip_route_output_key(vxlan->net, &key);
    if (IS_ERR(route))
        return route;

    /* Report the selected source address and remember the route. */
    *saddr = key.saddr;
    if (cacheable)
        dst_cache_set_ip4(dst_cache, &route->dst, key.saddr);

    return route;
}
/*
 * IPv4 routing result as used by the transmit path shown in this file:
 * vxlan_xmit_one() reads rt->dst.dev and rt->rt_flags, and passes
 * &rt->dst on to the encapsulation and hop-limit helpers.
 */
struct rtable {
    /* Embedded generic destination entry (see struct dst_entry). */
    struct dst_entry    dst;

    int         rt_genid;
    /* Tested against RTCF_LOCAL / RTCF_BROADCAST / RTCF_MULTICAST by
     * vxlan_xmit_one() to decide whether encapsulation can be bypassed.
     */
    unsigned int        rt_flags;
    __u16           rt_type;
    __u8            rt_is_input;
    __u8            rt_uses_gateway;

    int         rt_iif;

    /* Info on neighbour */
    __be32          rt_gateway;

    /* Miscellaneous cached information */
    u32         rt_pmtu;

    u32         rt_table_id;

    struct list_head    rt_uncached;
    struct uncached_list    *rt_uncached_list;
};
/*
 * Generic destination entry embedded in struct rtable as its 'dst'
 * member.  Field order and the explicit padding members are deliberate
 * (see the alignment comments inside); do not reorder.
 */
struct dst_entry {
    struct rcu_head     rcu_head;
    struct dst_entry    *child;
    /* Device of this entry; vxlan_xmit_one() compares it with the tunnel
     * device to detect a circular route.
     */
    struct net_device       *dev;
    struct  dst_ops         *ops;
    unsigned long       _metrics;
    unsigned long           expires;
    struct dst_entry    *path;
    struct dst_entry    *from;
#ifdef CONFIG_XFRM
    struct xfrm_state   *xfrm;
#else
    void            *__pad1;
#endif
    /* Per-entry input/output handlers. */
    int         (*input)(struct sk_buff *);
    int         (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

    /* Bit mask built from the DST_* values defined just below. */
    unsigned short      flags;
#define DST_HOST        0x0001
#define DST_NOXFRM      0x0002
#define DST_NOPOLICY        0x0004
#define DST_NOHASH      0x0008
#define DST_NOCACHE     0x0010
#define DST_NOCOUNT     0x0020
#define DST_FAKE_RTABLE     0x0040
#define DST_XFRM_TUNNEL     0x0080
#define DST_XFRM_QUEUE      0x0100
#define DST_METADATA        0x0200

    unsigned short      pending_confirm;

    short           error;


    /* A non-zero value of dst->obsolete forces by-hand validation
     * of the route entry.  Positive values are set by the generic
     * dst layer to indicate that the entry has been forcefully
     * destroyed.
     *
     * Negative values are used by the implementation layer code to
     * force invocation of the dst_ops->check() method.
     */
    short           obsolete;
#define DST_OBSOLETE_NONE   0
#define DST_OBSOLETE_DEAD   2
#define DST_OBSOLETE_FORCE_CHK  -1
#define DST_OBSOLETE_KILL   -2
    unsigned short      header_len; /* more space at head required */
    unsigned short      trailer_len;    /* space to reserve at tail */
#ifdef CONFIG_IP_ROUTE_CLASSID
    __u32           tclassid;
#else
    __u32           __pad2;
#endif

#ifdef CONFIG_64BIT
    /*
     * Align __refcnt to a 64 bytes alignment
     * (L1_CACHE_SIZE would be too much)
     */
    long            __pad_to_align_refcnt[2];
#endif
    /*
     * __refcnt wants to be on a different cache line from
     * input/output/ops or performance tanks badly
     */
    atomic_t        __refcnt;   /* client references    */
    int         __use;
    unsigned long       lastuse;
    struct lwtunnel_state   *lwtstate;
    /* Link to the next entry, typed per protocol family. */
    union {
        struct dst_entry    *next;
        struct rtable __rcu *rt_next;
        struct rt6_info     *rt6_next;
        struct dn_route __rcu   *dn_next;
    };
};
/*
 * IPv4 route lookup for the VXLAN outer header.
 *
 * Tries @dst_cache first when ip_tunnel_dst_cache_usable() allows it
 * (and not when a tos is given without tunnel info).  Otherwise performs
 * ip_route_output_key() on a flow built from @oif, @tos, skb->mark, UDP,
 * @daddr and *@saddr.  On success *@saddr receives the source address
 * from the flow and, if caching is enabled, the route is stored.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
                      struct sk_buff *skb, int oif, u8 tos,
                      __be32 daddr, __be32 *saddr,
                      struct dst_cache *dst_cache,
                      const struct ip_tunnel_info *info)
{
    struct flowi4 flow;
    struct rtable *found = NULL;
    bool cache_ok = ip_tunnel_dst_cache_usable(skb, info);

    if (!info && tos)
        cache_ok = false;

    /* Fast path: reuse the cached route when there is one. */
    if (cache_ok) {
        found = dst_cache_get_ip4(dst_cache, saddr);
        if (found)
            return found;
    }

    memset(&flow, 0, sizeof(flow));
    flow.saddr = *saddr;
    flow.daddr = daddr;
    flow.flowi4_oif = oif;
    flow.flowi4_mark = skb->mark;
    flow.flowi4_proto = IPPROTO_UDP;
    flow.flowi4_tos = RT_TOS(tos);

    found = ip_route_output_key(vxlan->net, &flow);
    if (!IS_ERR(found)) {
        /* Hand the chosen source address back to the caller. */
        *saddr = flow.saddr;
        if (cache_ok)
            dst_cache_set_ip4(dst_cache, &found->dst, flow.saddr);
    }

    return found;
}
static inline struct rtable *rpl_ip_route_output_key(struct net *net, struct flowi4 *flp)
{
    struct rtable *rt;
    /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
     * router expect RT_TOS bits only.
     */ 
    
    struct flowi fl = { .nl_u = { .ip4_u = {
                    .daddr = flp->daddr,
                    .saddr = flp->saddr,
                    .tos   = RT_TOS(flp->flowi4_tos) } },
                    .mark = flp->flowi4_mark,
                    .proto = flp->flowi4_proto };
    
    if (unlikely(ip_route_output_key(net, &rt, &fl))) // 调用内核的ip_route_output_key查找路由,返回rtable
        return ERR_PTR(-EADDRNOTAVAIL);
    flp->saddr = fl.nl_u.ip4_u.saddr;
    return rt;
}