七 vxlan发包流程
2017-03-23 15:01 yrpapa 阅读(748) 评论(0) 收藏 举报vxlan发包流程
发包处理函数最终会调用到ovs_vport_send函数,该函数最终会调用vport_ops的send函数。
/* Transmit entry point for an OVS vport.
 *
 * Drops (and counts as tx_errors) any non-GSO packet larger than the
 * underlying device MTU, otherwise hands the skb to the vport's send
 * operation.  Consumes the skb on every path.
 */
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	int mtu = vport->dev->mtu;

	/* GSO packets are segmented later, so only check MTU for non-GSO. */
	if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
				     vport->dev->name,
				     packet_length(skb), mtu);
		vport->dev->stats.tx_errors++;
		goto drop;
	}

	skb->dev = vport->dev;
	/* For a VXLAN vport this resolves to ovs_vxlan_netdev_vport_ops'
	 * send op, i.e. vxlan_xmit(). */
	vport->ops->send(skb);
	return;

drop:
	kfree_skb(skb);
}
/* VXLAN netdev transmit handler (OVS compat "rpl_" replacement).
 *
 * In COLLECT_METADATA mode the encapsulation parameters travel with the
 * skb as tunnel metadata; if TX-direction metadata is present the packet
 * is encapsulated via vxlan_xmit_one(), otherwise it is dropped and
 * counted in tx_dropped.  Always returns NETDEV_TX_OK (skb consumed).
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ip_tunnel_info *info;

	info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if (vxlan->flags & VXLAN_F_COLLECT_METADATA) {
		/* Only transmit when the flow attached TX tunnel metadata
		 * (set by the OVS datapath's set_tunnel action). */
		if (info && info->mode & IP_TUNNEL_INFO_TX) {
			/* rdst == NULL: all parameters come from the
			 * per-skb tunnel info, not an FDB entry. */
			vxlan_xmit_one(skb, dev, NULL, false);
			return NETDEV_TX_OK;
		}
	}

	dev->stats.tx_dropped++;
	kfree_skb(skb);
	return NETDEV_TX_OK;
}
/* Return the tunnel info attached to @skb's OVS GSO control block,
 * or NULL when no tunnel destination has been set. */
static inline struct ip_tunnel_info *ovs_skb_tunnel_info(struct sk_buff *skb)
{
	/* Guard clause: no tun_dst means no metadata to report. */
	if (unlikely(!OVS_GSO_CB(skb)->tun_dst))
		return NULL;

	return &OVS_GSO_CB(skb)->tun_dst->u.tun_info;
}
/* Encapsulate one skb in VXLAN and transmit it.
 *
 * @rdst:    FDB remote destination; NULL in COLLECT_METADATA mode, in
 *           which case all parameters are taken from the skb's tunnel
 *           info (the OVS path always passes NULL here).
 * @did_rsc: route short-circuit already performed (bypass re-encap).
 *
 * Consumes the skb on every path.  Handles both IPv4 and IPv6 outer
 * headers; the IPv6 branch is compiled only with CONFIG_IPV6.
 */
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
			   struct vxlan_rdst *rdst, bool did_rsc)
{
	struct dst_cache *dst_cache;
	struct ip_tunnel_info *info;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct sock *sk;
	struct rtable *rt = NULL;
	const struct iphdr *old_iph;
	union vxlan_addr *dst;
	union vxlan_addr remote_ip, local_ip;
	union vxlan_addr *src;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 src_port = 0, dst_port;
	__be32 vni, label;
	__be16 df = 0;
	__u8 tos, ttl;
	int err;
	u32 flags = vxlan->flags;
	bool udp_sum = false;
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));

	info = skb_tunnel_info(skb); /* per-skb tunnel metadata */

	if (rdst) {
		/* FDB-driven path; not taken when called from OVS
		 * (rdst is NULL in COLLECT_METADATA mode). */
		dst_port = rdst->remote_port ? rdst->remote_port :
					       vxlan->cfg.dst_port;
		vni = rdst->remote_vni;
		dst = &rdst->remote_ip;
		src = &vxlan->cfg.saddr;
		dst_cache = &rdst->dst_cache;
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
		/* Destination UDP port: tunnel info first, then device
		 * configuration as fallback. */
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		/* VNI comes from the 64-bit tunnel id; low 32 bits used. */
		vni = vxlan_tun_id_to_vni(info->key.tun_id);
		remote_ip.sa.sa_family = ip_tunnel_info_af(info); /* v4 vs v6 */
		if (remote_ip.sa.sa_family == AF_INET) {
			/* outer destination address */
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
			/* outer source address */
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
		dst = &remote_ip;
		src = &local_ip;
		dst_cache = &info->dst_cache;
	}

	if (vxlan_addr_any(dst)) { /* all-zero destination address */
		if (did_rsc) {
			/* short-circuited back to local bridge */
			vxlan_encap_bypass(skb, vxlan, vxlan);
			return;
		}
		goto drop;
	}

	old_iph = ip_hdr(skb);

	ttl = vxlan->cfg.ttl;
	if (!ttl && vxlan_addr_multicast(dst))
		ttl = 1;

	tos = vxlan->cfg.tos;
	if (tos == 1)
		tos = ip_tunnel_get_dsfield(old_iph, skb);

	label = vxlan->cfg.label;
	/* Hash the inner flow into the outer UDP source port so that
	 * intermediate routers can load-balance per flow. */
	src_port = udp_flow_src_port(dev_net(dev), skb,
				     vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);

	if (info) {
		/* Tunnel metadata overrides the device defaults. */
		ttl = info->key.ttl;
		tos = info->key.tos;
		label = info->key.label;
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);

		if (info->options_len)
			md = ip_tunnel_info_opts(info);
	} else {
		md->gbp = skb->mark;
	}

	if (dst->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);

		if (!sock4)
			goto drop;
		sk = sock4->sock->sk;

		/* Route lookup for the outer header (cached when possible). */
		rt = vxlan_get_route(vxlan, skb,
				     rdst ? rdst->remote_ifindex : 0, tos,
				     dst->sin.sin_addr.s_addr,
				     &src->sin.sin_addr.s_addr,
				     dst_cache, info);
		if (IS_ERR(rt)) {
			netdev_dbg(dev, "no route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (rt->dst.dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.collisions++;
			goto rt_tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		if (!info && rt->rt_flags & RTCF_LOCAL &&
		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			/* Not taken in the normal (metadata) transmit case. */
			struct vxlan_dev *dst_vxlan;

			ip_rt_put(rt);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
			return;
		}

		if (!info)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
			df = htons(IP_DF);

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		/* GNU ?: extension — keep ttl if non-zero, else derive
		 * it from the route. */
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
		err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
				      vni, md, flags, udp_sum);
		if (err < 0)
			goto xmit_tx_error;

		/* Push the outer UDP/IP headers and hand the packet to
		 * the IP stack (eventually ip_local_out()). */
		udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr,
				    dst->sin.sin_addr.s_addr, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		/* IPv6 outer header: mirrors the IPv4 branch above. */
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
		struct dst_entry *ndst;
		u32 rt6i_flags;

		if (!sock6)
			goto drop;
		sk = sock6->sock->sk;

		ndst = vxlan6_get_route(vxlan, skb,
					rdst ? rdst->remote_ifindex : 0, tos,
					label, &dst->sin6.sin6_addr,
					&src->sin6.sin6_addr,
					dst_cache, info);
		if (IS_ERR(ndst)) {
			netdev_dbg(dev, "no route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (ndst->dev == dev) {
			netdev_dbg(dev, "circular route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dst_release(ndst);
			dev->stats.collisions++;
			goto tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
		if (!info && rt6i_flags & RTF_LOCAL &&
		    !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			dst_release(ndst);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
			return;
		}

		if (!info)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
				      vni, md, flags, udp_sum);
		if (err < 0) {
			dst_release(ndst);
			return;
		}
		udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
				     &src->sin6.sin6_addr,
				     &dst->sin6.sin6_addr, tos, ttl,
				     label, src_port, dst_port, !udp_sum);
#endif
	}

	return;

drop:
	dev->stats.tx_dropped++;
	goto tx_free;

xmit_tx_error:
	/* skb is already freed. */
	skb = NULL;
rt_tx_error:
	ip_rt_put(rt);
tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
}
/* Look up (or fetch from the per-destination cache) the IPv4 route for
 * the outer VXLAN header.
 *
 * @saddr is in/out: on input a hint (may be 0), on success it is set to
 * the source address chosen by the routing lookup.  Returns the route
 * or an ERR_PTR on failure.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb,
				      int oif, u8 tos, __be32 daddr,
				      __be32 *saddr,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct rtable *rt = NULL;
	struct flowi4 fl4;

	/* A non-default TOS without per-flow metadata would make the
	 * cached route wrong for other flows — skip the cache. */
	if (tos && !info)
		use_cache = false;
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = oif;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.daddr = daddr;
	fl4.saddr = *saddr;

	rt = ip_route_output_key(vxlan->net, &fl4);
	if (!IS_ERR(rt)) {
		/* Report the source address the kernel selected and
		 * remember the route for the next packet. */
		*saddr = fl4.saddr;
		if (use_cache)
			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
	}
	return rt;
}
/* IPv4 routing table entry returned by route lookups.  Embeds a
 * struct dst_entry as its first member so a struct rtable * can be
 * used wherever a struct dst_entry * is expected. */
struct rtable {
	struct dst_entry	dst;		/* must be first: generic dst part */

	int			rt_genid;	/* generation id for cache invalidation */
	unsigned int		rt_flags;	/* RTCF_* flags (LOCAL, BROADCAST, ...) */
	__u16			rt_type;	/* RTN_* route type */
	__u8			rt_is_input;	/* input (vs output) route */
	__u8			rt_uses_gateway;

	int			rt_iif;		/* input interface index */

	/* Info on neighbour */
	__be32			rt_gateway;

	/* Miscellaneous cached information */
	u32			rt_pmtu;	/* cached path MTU */
	u32			rt_table_id;

	struct list_head	rt_uncached;
	struct uncached_list	*rt_uncached_list;
};
/* Protocol-independent destination cache entry: the generic part of a
 * cached route (IPv4 struct rtable, IPv6 struct rt6_info, ...).
 * Reference-counted via __refcnt and freed through RCU. */
struct dst_entry {
	struct rcu_head		rcu_head;
	struct dst_entry	*child;
	struct net_device	*dev;		/* output device for this route */
	struct dst_ops		*ops;		/* per-family operations */
	unsigned long		_metrics;	/* pointer/flags word for route metrics */
	unsigned long		expires;
	struct dst_entry	*path;
	struct dst_entry	*from;
#ifdef CONFIG_XFRM
	struct xfrm_state	*xfrm;
#else
	void			*__pad1;
#endif
	int			(*input)(struct sk_buff *);	/* RX handler */
	int			(*output)(struct net *net, struct sock *sk,
					  struct sk_buff *skb);	/* TX handler */

	unsigned short		flags;
#define DST_HOST		0x0001
#define DST_NOXFRM		0x0002
#define DST_NOPOLICY		0x0004
#define DST_NOHASH		0x0008
#define DST_NOCACHE		0x0010
#define DST_NOCOUNT		0x0020
#define DST_FAKE_RTABLE		0x0040
#define DST_XFRM_TUNNEL		0x0080
#define DST_XFRM_QUEUE		0x0100
#define DST_METADATA		0x0200

	unsigned short		pending_confirm;

	short			error;

	/* A non-zero value of dst->obsolete forces by-hand validation
	 * of the route entry.  Positive values are set by the generic
	 * dst layer to indicate that the entry has been forcefully
	 * destroyed.
	 *
	 * Negative values are used by the implementation layer code to
	 * force invocation of the dst_ops->check() method.
	 */
	short			obsolete;
#define DST_OBSOLETE_NONE	0
#define DST_OBSOLETE_DEAD	2
#define DST_OBSOLETE_FORCE_CHK	-1
#define DST_OBSOLETE_KILL	-2
	unsigned short		header_len;	/* more space at head required */
	unsigned short		trailer_len;	/* space to reserve at tail */
#ifdef CONFIG_IP_ROUTE_CLASSID
	__u32			tclassid;
#else
	__u32			__pad2;
#endif

#ifdef CONFIG_64BIT
	/*
	 * Align __refcnt to a 64 bytes alignment
	 * (L1_CACHE_SIZE would be too much)
	 */
	long			__pad_to_align_refcnt[2];
#endif
	/*
	 * __refcnt wants to be on a different cache line from
	 * input/output/ops or performance tanks badly
	 */
	atomic_t		__refcnt;	/* client references */
	int			__use;
	unsigned long		lastuse;
	struct lwtunnel_state	*lwtstate;	/* lightweight tunnel state */
	union {
		struct dst_entry	*next;
		struct rtable __rcu	*rt_next;
		struct rt6_info		*rt6_next;
		struct dn_route __rcu	*dn_next;
	};
};
/* Resolve the IPv4 route used for the outer VXLAN header, preferring
 * the per-destination cache when it is safe to do so.  On success,
 * *saddr is updated with the source address picked by the lookup. */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb,
				      int oif, u8 tos, __be32 daddr,
				      __be32 *saddr,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	/* The cache is unusable when a non-default TOS is applied
	 * without per-flow tunnel metadata. */
	bool cacheable = ip_tunnel_dst_cache_usable(skb, info) &&
			 !(tos && !info);
	struct rtable *route;
	struct flowi4 key;

	if (cacheable) {
		route = dst_cache_get_ip4(dst_cache, saddr);
		if (route)
			return route;
	}

	/* Build the flow key for the routing lookup. */
	memset(&key, 0, sizeof(key));
	key.flowi4_oif = oif;
	key.flowi4_tos = RT_TOS(tos);
	key.flowi4_mark = skb->mark;
	key.flowi4_proto = IPPROTO_UDP;
	key.daddr = daddr;
	key.saddr = *saddr;

	route = ip_route_output_key(vxlan->net, &key);
	if (IS_ERR(route))
		return route;

	*saddr = key.saddr;
	if (cacheable)
		dst_cache_set_ip4(dst_cache, &route->dst, key.saddr);
	return route;
}
/* Compat wrapper mapping the modern ip_route_output_key(net, flowi4)
 * signature onto the older kernel API that takes a struct flowi and an
 * output-parameter rtable.  Returns the route or ERR_PTR(-EADDRNOTAVAIL)
 * when no route exists. */
static inline struct rtable *rpl_ip_route_output_key(struct net *net, struct flowi4 *flp)
{
	struct rtable *rt;
	/* Tunnel configuration keeps DSCP part of TOS bits, But Linux
	 * router expect RT_TOS bits only.
	 */
	struct flowi fl = {
		.nl_u = {
			.ip4_u = {
				.daddr = flp->daddr,
				.saddr = flp->saddr,
				.tos = RT_TOS(flp->flowi4_tos)
			}
		},
		.mark = flp->flowi4_mark,
		.proto = flp->flowi4_proto
	};

	/* Call the kernel's (old-style) ip_route_output_key() to look up
	 * the route; the rtable is returned through &rt. */
	if (unlikely(ip_route_output_key(net, &rt, &fl)))
		return ERR_PTR(-EADDRNOTAVAIL);
	/* Propagate the source address chosen by the lookup back to the
	 * caller's flowi4. */
	flp->saddr = fl.nl_u.ip4_u.saddr;
	return rt;
}
浙公网安备 33010602011771号