fib 结构以及路由 fib 创建

见之前的文章fib数据结构

 路由fib创建

当通过netlink,操作类型为RTM_NEWROUTE时,调用inet_rtm_newroute函数添加路由。

功能:a)、将用户空间配置内容传过来------rtm_to_fib_config(保存路由表项的 ip 类型  tos  出接口 网关 metric 目的ip等信息)

         b)、路由表的创建------------------fib_new_table(根据table id 查找创建路由表)

         c)、路由表项的添加-----------------fib_table_insert

 

/*
 * rtm_to_fib_config - translate a netlink route request into a fib_config.
 *
 * Validates the message in @nlh against rtm_ipv4_policy, zeroes @cfg and
 * copies the fixed rtmsg header fields into it.  The remainder of the
 * function is elided in this excerpt (see the dashed line below).
 *
 * NOTE(review): the result of nlmsg_validate() stored in err is not checked
 * in the visible lines; the error handling was presumably elided together
 * with the rest of the body - confirm against the full source.
 */
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
                 struct nlmsghdr *nlh, struct fib_config *cfg)
{
    struct nlattr *attr;
    int err, remaining;
    struct rtmsg *rtm;

    err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);

    memset(cfg, 0, sizeof(*cfg));
    // Step past the netlink message header: rtm points at the payload of nlh
    rtm = nlmsg_data(nlh);
    // Copy the rtmsg header fields into cfg.  The numeric values quoted below
    // are the original author's observations, not guarantees.
    cfg->fc_dst_len = rtm->rtm_dst_len;// prefix (mask) length
    cfg->fc_tos = rtm->rtm_tos; // appears to default to 0
    cfg->fc_table = rtm->rtm_table;// routing table id: observed 0 for connected routes, 255 for kernel routes; an id of 0 is later replaced by the main table (254)
    cfg->fc_protocol = rtm->rtm_protocol;// routing protocol: observed 11 for both connected and kernel routes
    cfg->fc_scope = rtm->rtm_scope;// scope: observed 253 for connected routes, 254 for kernel routes
    cfg->fc_type = rtm->rtm_type;// route type: observed 1 for connected routes, 2 for kernel routes
    cfg->fc_flags = rtm->rtm_flags;// observed 1024 for both connected and kernel routes
    cfg->fc_nlflags = nlh->nlmsg_flags;
-------------------------------------------------
}

  在添加表项时,如果没有指定路由表ID,或者指定的表ID等于0,内核使用main表RT_TABLE_MAIN。若该表已存在,fib_new_table直接返回它;否则由函数fib_trie_table分配一个新的fib_table结构,代表一个新的路由表。对于main表,将其赋值给命名空间中的fib_main成员(default表则赋值给fib_default成员)。最后将其链接到哈希桶fib_table_hash的对应链表中。

如果指定的路由表ID等于RT_TABLE_LOCAL,但是此命名空间中没有配置过IPv4策略路由,也使用main路由表,作为alias;也就是local表也用main表

  • 对于main路由表,以及其它路由表,fib_trie_table的参数alias为空;
  • 但是对于local路由表(未配置策略路由时),alias指向main表结构,就不用重新分配trie结构了。
  • 对于所有的路由表,都需要分配一个fib_table结构。
/*
 * fib_new_table - return routing table @id of namespace @net, creating it
 * if it does not exist yet.
 *
 * A table id of 0 selects RT_TABLE_MAIN.  When the local table is requested
 * and the namespace has no custom rules, the main table is looked up (or
 * created) first and handed to fib_trie_table() as the alias for the new
 * table.  A freshly created main or default table is also published through
 * net->ipv4.fib_main / net->ipv4.fib_default, and every new table is linked
 * into its bucket of net->ipv4.fib_table_hash.
 *
 * Returns the table, or NULL when fib_trie_table() fails.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
    struct fib_table *main_alias = NULL;
    struct fib_table *table;
    unsigned int bucket;

    /* No explicit table means the main table. */
    if (!id)
        id = RT_TABLE_MAIN;

    /* Nothing to create when the table is already registered. */
    table = fib_get_table(net, id);
    if (table)
        return table;

    if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
        main_alias = fib_new_table(net, RT_TABLE_MAIN);

    table = fib_trie_table(id, main_alias);
    if (!table)
        return NULL;

    /* Publish the well-known tables through their dedicated pointers. */
    if (id == RT_TABLE_MAIN) {
        rcu_assign_pointer(net->ipv4.fib_main, table);
    } else if (id == RT_TABLE_DEFAULT) {
        rcu_assign_pointer(net->ipv4.fib_default, table);
    }

    bucket = id & (FIB_TABLE_HASHSZ - 1);
    hlist_add_head_rcu(&table->tb_hlist, &net->ipv4.fib_table_hash[bucket]);
    return table;
}

 

 

 

/*
 * fib_table_insert - add the route described by @cfg to routing table @tb.
 *
 * Obtains a fib_info for the configuration, locates the trie leaf and the
 * alias position for (prefix, tos, priority), then either replaces an
 * existing alias (NLM_F_REPLACE) or inserts a new one (NLM_F_CREATE).
 *
 * Returns 0 on success.  Visible error codes: -EINVAL (bad prefix length or
 * host bits set in the key), -EEXIST (duplicate route), -ENOENT (no matching
 * route and NLM_F_CREATE not set), -ENOBUFS (alias allocation failed), or
 * whatever fib_create_info(), switchdev_fib_ipv4_add() or fib_insert_alias()
 * report.
 *
 * Caller must hold RTNL.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
    struct trie *t = (struct trie *)tb->tb_data;
    struct fib_alias *fa, *new_fa;
    struct key_vector *l, *tp;
    unsigned int nlflags = 0;
    struct fib_info *fi;
    u8 plen = cfg->fc_dst_len;
    u8 slen = KEYLENGTH - plen;
    u8 tos = cfg->fc_tos;
    u32 key;
    int err;

    if (plen > KEYLENGTH)// the prefix length cannot exceed the key width (32 bits for IPv4 per the original note)
        return -EINVAL;

    key = ntohl(cfg->fc_dst);

    pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

    /* Reject a destination that has bits set beyond the prefix length. */
    if ((plen < KEYLENGTH) && (key << plen))
        return -EINVAL;
     // Look up or create the fib_info object described by the route configuration
    fi = fib_create_info(cfg);
    if (IS_ERR(fi)) {
        err = PTR_ERR(fi);
        goto err;
    }
     // Find the leaf for the network key.  Without a leaf there cannot be an
    // alias either; otherwise search the leaf's aliases by tos and priority.
    l = fib_find_node(t, &tp, key);
    fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
                tb->tb_id) : NULL;

    /* Now fa, if non-NULL, points to the first fib alias
     * with the same keys [prefix,tos,priority], if such key already
     * exists or to the node before which we will insert new one.
     *
     * If fa is NULL, we will need to allocate a new one and
     * insert to the tail of the section matching the suffix length
     * of the new alias.
     */
    // As the comment above says, prefix, tos and priority together decide
    // whether an existing fib_alias has the same key as the new route.
    if (fa && fa->fa_tos == tos &&
        fa->fa_info->fib_priority == fi->fib_priority) {
        struct fib_alias *fa_first, *fa_match;

        err = -EEXIST;
        if (cfg->fc_nlflags & NLM_F_EXCL)
            goto out;

        /* We have 2 goals:
         * 1. Find exact match for type, scope, fib_info to avoid
         * duplicate routes
         * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
         */
        fa_match = NULL;
        fa_first = fa;
        hlist_for_each_entry_from(fa, fa_list) {
            if ((fa->fa_slen != slen) ||
                (fa->tb_id != tb->tb_id) ||
                (fa->fa_tos != tos))
                break;
            if (fa->fa_info->fib_priority != fi->fib_priority)
                break;
            if (fa->fa_type == cfg->fc_type &&
                fa->fa_info == fi) {
                fa_match = fa;
                break;
            }
        }
        // With NLM_F_REPLACE the first alias of the group is replaced by a
        // new one, unless an exact duplicate of the new route already exists.

        if (cfg->fc_nlflags & NLM_F_REPLACE) {
            struct fib_info *fi_drop;
            u8 state;

            fa = fa_first;
            if (fa_match) {
                /* An exact duplicate exists: report success only when it
                 * is the first alias of the group, otherwise err stays
                 * -EEXIST.  Nothing is replaced in either case.
                 */
                if (fa == fa_match)
                    err = 0;
                goto out;
            }
            err = -ENOBUFS;
            new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
            if (!new_fa)
                goto out;

            /* Build the replacement from the old alias plus the new info. */
            fi_drop = fa->fa_info;
            new_fa->fa_tos = fa->fa_tos;
            new_fa->fa_info = fi;
            new_fa->fa_type = cfg->fc_type;
            state = fa->fa_state;
            new_fa->fa_state = state & ~FA_S_ACCESSED;
            new_fa->fa_slen = fa->fa_slen;
            new_fa->tb_id = tb->tb_id;
            new_fa->fa_default = -1;

            err = switchdev_fib_ipv4_add(key, plen, fi,
                             new_fa->fa_tos,
                             cfg->fc_type,
                             cfg->fc_nlflags,
                             tb->tb_id);
            if (err) {
                switchdev_fib_ipv4_abort(fi);
                kmem_cache_free(fn_alias_kmem, new_fa);
                goto out;
            }

            /* Swap the aliases in the list, then dispose of the old one. */
            hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);

            alias_free_mem_rcu(fa);

            fib_release_info(fi_drop);
            /* Flush the cache only if the old alias had been accessed. */
            if (state & FA_S_ACCESSED)
                rt_cache_flush(cfg->fc_nlinfo.nl_net);
            rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

            goto succeeded;
        }
        /* Error if we find a perfect match which
         * uses the same scope, type, and nexthop
         * information.
         */
        if (fa_match)
            goto out;

        /* With NLM_F_APPEND keep fa where the scan stopped; otherwise
         * insert in front of the first alias of the group.
         */
        if (cfg->fc_nlflags & NLM_F_APPEND)
            nlflags = NLM_F_APPEND;
        else
            fa = fa_first;
    }
    err = -ENOENT; // no identical route exists; a new alias is created only when NLM_F_CREATE is set
    if (!(cfg->fc_nlflags & NLM_F_CREATE))
        goto out;
  
    err = -ENOBUFS;
    new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
    if (!new_fa)
        goto out;

    new_fa->fa_info = fi;
    new_fa->fa_tos = tos;
    new_fa->fa_type = cfg->fc_type;
    new_fa->fa_state = 0;
    new_fa->fa_slen = slen;
    new_fa->tb_id = tb->tb_id;
    new_fa->fa_default = -1;

    /* (Optionally) offload fib entry to switch hardware. */
    // Add/modify switch IPv4 route entry
    err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
                     cfg->fc_nlflags, tb->tb_id);
    if (err) {
        switchdev_fib_ipv4_abort(fi);
        goto out_free_new_fa;
    }

    /* Insert new entry to the list. */
    err = fib_insert_alias(t, tp, l, new_fa, fa, key);
    if (err)
        goto out_sw_fib_del;

    /* A zero-length prefix is counted in tb_num_default. */
    if (!plen)
        tb->tb_num_default++;
    // Flush the routing cache
    rt_cache_flush(cfg->fc_nlinfo.nl_net);
     // Announce the new route with an RTM_NEWROUTE message
    rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
          &cfg->fc_nlinfo, nlflags);
succeeded:
    return 0;

    /* Error unwinding: each label undoes one more step, in reverse order. */
out_sw_fib_del:
    switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
    kmem_cache_free(fn_alias_kmem, new_fa);
out:
    /* Drop the fib_info obtained from fib_create_info(). */
    fib_release_info(fi);
err:
    return err;
}

 

其核心代码为fib_table_insert

主要逻辑有:

  • fib_create_info
  • fib_find_alias
  • switchdev_fib_ipv4_add
  • fib_insert_alias
  • rt_cache_flush

 目前路由表采用trie(LC-trie,层压缩前缀树)实现。trie是一种支持最长前缀匹配的数据结构,在路由数量很大的情况下查找效率高于hash,但会消耗更多的内存,算法也更加复杂;这里不展开分析其代码。

 fib_create_info

内核数组fib_props指定了路由类型对应的最小的合法scope值,内核根据此数组进行判定。例如路由本地类型RTN_LOCAL的scope不能小于RT_SCOPE_HOST,否则就成为外部路由了。

之后,配置的路由标志不能包含RTNH_F_DEAD或者RTNH_F_LINKDOWN,这两个标志由内核自身使用

 

/*
 * fib_create_info - build (or reuse) the fib_info described by @cfg.
 *
 * Validates the route type/scope combination, allocates a fib_info with room
 * for all next hops, fills it from @cfg, validates the next hops and the
 * preferred source address, and finally either returns an already existing
 * equivalent fib_info (with its fib_treeref incremented) or links the new
 * one into the global hash tables.
 *
 * Returns the fib_info on success or an ERR_PTR() carrying -EINVAL,
 * -ENOBUFS, -ENODEV or an error reported by fib_get_nhs()/fib_check_nh().
 */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
    int err;
    struct fib_info *fi = NULL;
    struct fib_info *ofi;
    int nhs = 1; // without multipath routing every route has exactly one next hop
    struct net *net = cfg->fc_nlinfo.nl_net;

    // Check that type and scope are consistent.  For example RTN_LOCAL means
    // the destination is a local address, so combining it with
    // RT_SCOPE_UNIVERSE is contradictory.  The check rejects a configured
    // scope value that is smaller than the one fib_props lists for the type
    // (a larger scope value denotes a narrower scope, i.e. closer to the host).
    if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
        goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
    // With multipath routing a route may carry several next hops
    if (cfg->fc_mp) {
        nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
        if (nhs == 0)
            goto err_inval;
    }
#endif

    err = -ENOBUFS;
    if (fib_info_cnt >= fib_hash_size) {
        // Grow the hash tables (double the size, starting from 1)
        unsigned int new_size = fib_hash_size << 1;
        struct hlist_head *new_info_hash;
        struct hlist_head *new_laddrhash;
        unsigned int bytes;

        if (!new_size)
            new_size = 1;
        bytes = new_size * sizeof(struct hlist_head *);
        new_info_hash = fib_hash_alloc(bytes);
        new_laddrhash = fib_hash_alloc(bytes);
        if (!new_info_hash || !new_laddrhash) {
            fib_hash_free(new_info_hash, bytes);
            fib_hash_free(new_laddrhash, bytes);
        } else
            fib_hash_move(new_info_hash, new_laddrhash, new_size);

        /* A failed resize is fatal only when there is no table at all yet.
         * NOTE(review): presumably fib_hash_move() updates fib_hash_size -
         * confirm against its definition.
         */
        if (!fib_hash_size)
            goto failure;
    }

    // Allocate the fib_info followed by nhs next-hop structures
    fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
    if (fi == NULL)
        goto failure;
    fib_info_cnt++; // count the allocated fib_info objects
    // Copy the configuration fields
    fi->fib_net = hold_net(net);
    fi->fib_protocol = cfg->fc_protocol;
    fi->fib_flags = cfg->fc_flags;
    fi->fib_priority = cfg->fc_priority;
    fi->fib_prefsrc = cfg->fc_prefsrc;

    // Link every next hop back to its owning fib_info
    fi->fib_nhs = nhs;
    change_nexthops(fi) {
        nh->nh_parent = fi;
    } endfor_nexthops(fi)

    if (cfg->fc_mx) {
        // Extract the route metrics from the nested attributes
        struct nlattr *nla;
        int remaining;

        nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
            int type = nla_type(nla);
            if (type) {
                if (type > RTAX_MAX)
                    goto err_inval;
                /* Metric types are 1-based, the array is 0-based. */
                fi->fib_metrics[type - 1] = nla_get_u32(nla);
            }
        }
    }

    // Fill in the next-hop data depending on whether multipath is configured
    if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
        if (err != 0)
            goto failure;
        /* Top-level oif/gw/flow, when given, must agree with the first
         * next hop.
         */
        if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
            goto err_inval;
        if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
            goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
        if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
            goto err_inval;
#endif
#else
        goto err_inval;
#endif
    } else {
        // Single next hop case
        struct fib_nh *nh = fi->fib_nh;

        nh->nh_oif = cfg->fc_oif;
        nh->nh_gw = cfg->fc_gw;
        nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
        nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        nh->nh_weight = 1;
#endif
    }

    /* Route types that carry an error code must not specify a gateway,
     * an output interface or multipath data; they skip next-hop checks.
     */
    if (fib_props[cfg->fc_type].error) {
        if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
            goto err_inval;
        goto link_it;
    }

    if (cfg->fc_scope > RT_SCOPE_HOST) // a scope narrower than host (closer than the local machine) makes no sense
        goto err_inval;

    // Validate the next-hop configuration
    if (cfg->fc_scope == RT_SCOPE_HOST) {
        // Route to a local address: exactly one next hop, no gateway, and
        // the output device must exist
        struct fib_nh *nh = fi->fib_nh;

        /* Local address is added. */
        if (nhs != 1 || nh->nh_gw)
            goto err_inval;
        nh->nh_scope = RT_SCOPE_NOWHERE; // the next hop itself gets scope NOWHERE
        nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); // look up the output device; NULL means it does not exist
        err = -ENODEV;
        if (nh->nh_dev == NULL)
            goto failure;
    } else {
        // Let fib_check_nh() validate every next hop of a non-host route
        change_nexthops(fi) {
            if ((err = fib_check_nh(cfg, fi, nh)) != 0)
                goto failure;
        } endfor_nexthops(fi)
    }
    // Validate the preferred source address
    if (fi->fib_prefsrc) {
        // Unless this is a local route whose destination equals the preferred
        // source, the preferred source must be classified as RTN_LOCAL
        if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fi->fib_prefsrc != cfg->fc_dst)
            if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
                goto err_inval;
    }

link_it:
    // Check whether an equivalent fib_info already exists and can be reused
    if ((ofi = fib_find_info(fi)) != NULL) {
        // Reuse it: free the freshly built object, take a tree reference on
        // the existing one and return that instead
        fi->fib_dead = 1;
        free_fib_info(fi);
        ofi->fib_treeref++;
        return ofi;
    }
    
    // From here on the fib_info is genuinely new

    fi->fib_treeref++;
    atomic_inc(&fi->fib_clntref);
    // Add the new fib_info to the global fib_info_hash table
    spin_lock_bh(&fib_info_lock);
    hlist_add_head(&fi->fib_hash, &fib_info_hash[fib_info_hashfn(fi)]);
    if (fi->fib_prefsrc) {
        // With a preferred source address, also add it to fib_info_laddrhash
        struct hlist_head *head;
        head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
        hlist_add_head(&fi->fib_lhash, head);
    }
    // Add every next hop that has a device to the global fib_info_devhash
    change_nexthops(fi) {
        struct hlist_head *head;
        unsigned int hash;

        if (!nh->nh_dev)
            continue;
        hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
        head = &fib_info_devhash[hash];
        hlist_add_head(&nh->nh_hash, head);
    } endfor_nexthops(fi)
    spin_unlock_bh(&fib_info_lock);
    return fi;

err_inval:
    err = -EINVAL;

failure:
    /* Release a partially built fib_info, if one was allocated. */
    if (fi) {
        fi->fib_dead = 1;
        free_fib_info(fi);
    }
    return ERR_PTR(err);
}
  1. 对配置的参数进行一系列的合法性检查;
  2. 参数都ok,那么如果该fib_info实例已经有了,则复用已有的,如果没有则用新的;
  3. 维护fib_info和fib_nh在全局链表中的数据结构;

查找路由项信息: fib_find_info()

该函数通过查找全局的fib_info_hash哈希表,判断其中是否已经有和参数相同的路由项信息对象,如果有那么返回哈希表中fib_info对象指针。

/*
 * fib_find_info - search fib_info_hash for an entry equivalent to @nfi.
 *
 * Only the bucket selected by fib_info_hashfn(@nfi) is scanned.  An entry
 * matches when it has the same namespace, next-hop count, protocol,
 * preferred source, priority and metrics, when its flags differ from
 * @nfi's at most in RTNH_F_DEAD, and when nh_comp() reports the next hops
 * as equal (skipped if there are no next hops).
 *
 * Returns the matching entry, or NULL if the bucket holds none.
 */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
    struct hlist_head *bucket = &fib_info_hash[fib_info_hashfn(nfi)];
    struct hlist_node *pos;
    struct fib_info *cand;

    /* Walk the collision chain, discarding candidates at the first mismatch. */
    hlist_for_each_entry(cand, pos, bucket, fib_hash) {
        if (cand->fib_net != nfi->fib_net)
            continue;
        if (cand->fib_nhs != nfi->fib_nhs)
            continue;
        if (nfi->fib_protocol != cand->fib_protocol)
            continue;
        if (nfi->fib_prefsrc != cand->fib_prefsrc)
            continue;
        if (nfi->fib_priority != cand->fib_priority)
            continue;
        if (memcmp(nfi->fib_metrics, cand->fib_metrics,
                   sizeof(cand->fib_metrics)) != 0)
            continue;
        /* Flags must agree except for RTNH_F_DEAD. */
        if (((nfi->fib_flags ^ cand->fib_flags) & ~RTNH_F_DEAD) != 0)
            continue;
        /* Compare the next hops last, and only when there are any. */
        if (nfi->fib_nhs != 0 && nh_comp(cand, nfi) != 0)
            continue;

        return cand;
    }

    return NULL;
}

 

 下一跳地址检查: fib_check_nh()

  • * a) gateway can be actually local interface address, so that gatewayed route is direct.
  • 网关可以是本地接口上的地址,所以目的网络是直连的。
  • * b) gateway must be on-link address, possibly described not by an ifaddr, but also by a direct route.
  • 网关必须是链路可达(on-link)地址:它既可以由接口地址(ifaddr)所在的网段描述,也可以由一条直连路由描述。
  • * c) If both gateway and interface are specified, they should not contradict.
  • 如果同时指定了网关和出接口,两者不能冲突。
  • * d) If we use tunnel routes, gateway could be not on-link.
  • 如果使用隧道路由,网关可以不是链路可达地址(需指定onlink命令字)。

 

/*
   Picture
   -------

   Semantics of nexthop is very messy by historical reasons.
   We have to take into account, that:
   a) gateway can be actually local interface address,
      so that gatewayed route is direct.
   b) gateway must be on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) If both gateway and interface are specified, they should not
      contradict.
   d) If we use tunnel routes, gateway could be not on-link.

   Attempt to reconcile all of these (alas, self-contradictory) conditions
   results in pretty ugly and hairy code with obscure logic.

   I chose to generalized it instead, so that the size
   of code does not increase practically, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is local address,
   "link" is direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   Code is still hairy, but now it is apparently logically
   consistent and very flexible. F.e. as by-product it allows
   to co-exists in peace independent exterior and interior
   routing processes.

   Normally it looks as following.

   {universe prefix}  -> (gw, oif) [scope link]
              |
              |-> {link prefix} -> (gw, oif) [scope local]
                        |
                        |-> {local prefix} (terminal node)
 */
/*
 * fib_check_nh - validate next hop @nh of @fi and resolve its device and
 * scope.
 *
 * With a gateway: an ONLINK next hop only needs a unicast gateway and an
 * existing, UP output device; otherwise the gateway itself is looked up with
 * fib_lookup() and the next hop inherits scope, output interface and device
 * from the result.  Without a gateway the output interface must name an
 * existing, UP device.
 *
 * On success a reference is taken on nh->nh_dev (dev_hold) and 0 is
 * returned.  Visible errors: -EINVAL, -ENODEV, -ENETDOWN, or the error
 * reported by fib_lookup().
 */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh)
{
    int err;
    struct net *net;

    net = cfg->fc_nlinfo.nl_net;
    // Two cases, depending on whether a gateway was given.  Normally a gateway
    // is only specified for routes to non-local networks, where the routing
    // subsystem must be told the IP address of the next hop explicitly.
    if (nh->nh_gw) {
        struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
        if (nh->nh_flags&RTNH_F_PERVASIVE)
            return 0;
#endif
        if (nh->nh_flags&RTNH_F_ONLINK) {
            struct net_device *dev;

            /* The gateway is declared on-link: no route lookup is done.
             * The route's scope value must be below RT_SCOPE_LINK and the
             * gateway must be a unicast address.
             */
            if (cfg->fc_scope >= RT_SCOPE_LINK)
                return -EINVAL;
            if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
                return -EINVAL;
            if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
                return -ENODEV;
            if (!(dev->flags&IFF_UP))
                return -ENETDOWN;
            nh->nh_dev = dev;
            dev_hold(dev);
            nh->nh_scope = RT_SCOPE_LINK;
            return 0;
        }
        {
            /* Look the gateway up with a scope value one above the
             * route's own scope, but never below RT_SCOPE_LINK.
             */
            struct flowi fl = {
                .nl_u = {
                    .ip4_u = {
                        .daddr = nh->nh_gw,
                        .scope = cfg->fc_scope + 1,
                    },
                },
                .oif = nh->nh_oif,
            };

            /* It is not necessary, but requires a bit of thinking */
            if (fl.fl4_scope < RT_SCOPE_LINK)
                fl.fl4_scope = RT_SCOPE_LINK;
            if ((err = fib_lookup(net, &fl, &res)) != 0)
                return err;
        }
        /* The gateway must resolve to a unicast or local route. */
        err = -EINVAL;
        if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
            goto out;
        /* Take scope, output interface and device from the lookup result. */
        nh->nh_scope = res.scope;
        nh->nh_oif = FIB_RES_OIF(res);
        if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
            goto out;
        dev_hold(nh->nh_dev);
        err = -ENETDOWN;
        if (!(nh->nh_dev->flags & IFF_UP))
            goto out;
        err = 0;
out:
        /* Release the lookup result on every path that obtained one. */
        fib_res_put(&res);
        return err;
    } else {
        // Next-hop check when no gateway was specified
        struct in_device *in_dev;

        /* PERVASIVE and ONLINK are rejected when there is no gateway. */
        if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
            return -EINVAL;
        // The output device must exist and be UP
        in_dev = inetdev_by_index(net, nh->nh_oif);
        if (in_dev == NULL)
            return -ENODEV;
        if (!(in_dev->dev->flags&IFF_UP)) {
            in_dev_put(in_dev);
            return -ENETDOWN;
        }
        nh->nh_dev = in_dev->dev;
        dev_hold(nh->nh_dev);
        nh->nh_scope = RT_SCOPE_HOST;
        /* The in_device reference is dropped; the net_device one is kept. */
        in_dev_put(in_dev);
    }
    return 0;
}

 

查找路由项: fib_find_alias()

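调用者(fib_table_insert)在使用查找结果时,还要再判断tos和priority是否都相等,那么为何查找时不直接做精确匹配?原因是:返回值除了用于判断是否已存在相同键值的路由项,还被用作新fib_alias的插入位置(新项插入到返回项之前)。因此即使没有精确匹配的项,也需要返回第一个应当排在新项之后的项。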

从实现看,该函数返回的是fib_alias对象需要满足:

  在给定tries树节点的fib_alias链表(fah)中查找匹配的项,匹配项的后缀长度需要与slen相等,表ID也要相等。后缀长度fa_slen在链表中是由小到大排列的。反过来讲,fib_alias的前缀长度由大到小排列,即最长匹配的项在最前。相反的,tb_id在链表中是按照由大到小排列的。另外,tos值也是按照由大到小排列。
  如果设置了查找第一个fib_alias(对应较新内核版本中的find_first参数,下面所示的版本没有该参数),返回此时正在遍历的项。否则,返回第一个TOS值小于给定tos值的项,或者tos值相等但优先级priority大于等于给定prio的项。
  新的fib_alias要插入到找到的fib_alias之前,参见函数fib_insert_alias。

/*
 * fib_find_alias - locate the insertion/match position in an alias list.
 *
 * Scans @fah for the section whose suffix length equals @slen and whose
 * table id equals @tb_id.  Within that section, returns the first alias
 * whose tos is lower than @tos, or whose tos equals @tos and whose
 * fib_info priority is greater than or equal to @prio.
 *
 * The scan gives up as soon as it meets an alias with a larger suffix
 * length, or with the same suffix length and a smaller table id.
 *
 * Returns NULL when @fah is NULL or no such alias exists.
 */
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
                    u8 tos, u32 prio, u32 tb_id)
{
    struct fib_alias *cur;

    if (!fah)
        return NULL;

    hlist_for_each_entry(cur, fah, fa_list) {
        /* Suffix length: stop once past the wanted section, skip before it. */
        if (cur->fa_slen > slen)
            break;
        if (cur->fa_slen < slen)
            continue;

        /* Table id: stop once past the wanted table, skip before it. */
        if (cur->tb_id < tb_id)
            break;
        if (cur->tb_id > tb_id)
            continue;

        /* Aliases with a higher tos come before the position we want. */
        if (cur->fa_tos > tos)
            continue;

        if (cur->fa_info->fib_priority >= prio || cur->fa_tos < tos)
            return cur;
    }

    return NULL;
}

添加fib_alias:

fib_alias用于连接tries树节点和基础的fib_info结构。

  如果在tries树中没有找到可插入fib_alias的叶子节点,即节点l为空,使用函数fib_insert_node添加节点以及相关联的新的fib_alias结构(参见 IPv4路由tries树节点添加与查找 https://redwingz.blog.csdn.net/article/details/121002339 )。否则,如果在tries树中找到匹配的叶子节点,并且在此叶子节点的fib_alias链表中遍历找到了合适的插入位置,即参数fa(参见上文的函数fib_find_alias),将新的fib_alias插入到匹配项fa之前。
  如果没有合适的插入点(fa为空),放宽查找条件,再次遍历叶子节点l的fib_alias链表,查找合适的节点,但是新的fib_alias要插入到找到节点之后。匹配条件变为:1)新fib_alias的后缀长度要大于等于遍历项的后缀长度fa_slen,反过来讲,就是在此链表中前缀长度按照由大到小排列;2)并且如果两者后缀长度相等,新的fib_alias的表ID要小于等于遍历项的表ID的项,也就是tb_id在链表中是按照由大到小排列。
如果以上遍历找到了合适的fib_alias,将新的fib_alias添加到此项之后;否则,没有匹配项时,将新的fib_alias添加到叶子节点l的链表首部。

/*
 * fib_insert_alias - link alias @new into the trie for @key.
 *
 * @l is the leaf found for @key (NULL if there is none) and @fa the alias
 * in front of which @new should go (NULL if no such position was found).
 *
 * Without a leaf the work is delegated to fib_insert_node().  With a leaf,
 * @new is placed before @fa when @fa is given; otherwise the leaf's list is
 * scanned for the last alias that must stay ahead of @new, and @new goes
 * right behind it, or to the head of the list if there is none.  Finally
 * the leaf's suffix length is raised if @new has a longer one.
 *
 * Returns 0, or the result of fib_insert_node() when no leaf exists.
 */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
                struct key_vector *l, struct fib_alias *new,
                struct fib_alias *fa, t_key key)
{
    struct fib_alias *prev = NULL;
    struct fib_alias *cur;

    /* No leaf yet: create the node together with its first alias. */
    if (!l)
        return fib_insert_node(t, tp, new, key);

    if (fa) {
        /* The caller already knows the alias that must follow @new. */
        hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
    } else {
        /*
         * Relaxed search: remember the last alias that still sorts ahead
         * of @new.  The walk stops at the first alias with a larger
         * suffix length, or with the same suffix length and a smaller
         * table id.
         */
        hlist_for_each_entry(cur, &l->leaf, fa_list) {
            if (cur->fa_slen > new->fa_slen)
                break;
            if (cur->fa_slen == new->fa_slen &&
                cur->tb_id < new->tb_id)
                break;
            prev = cur;
        }

        if (!prev)
            hlist_add_head_rcu(&new->fa_list, &l->leaf);
        else
            hlist_add_behind_rcu(&new->fa_list, &prev->fa_list);
    }

    /* if we added to the tail node then we need to update slen:
     * @new carries a longer suffix than the leaf has recorded so far.
     */
    if (new->fa_slen > l->slen) {
        l->slen = new->fa_slen;
        leaf_push_suffix(tp, l);
    }

    return 0;
}
ip route show scope global table 100
default via 203.0.113.5 dev out2
192.0.2.0/25
        nexthop via 203.0.113.7  dev out3 weight 1
        nexthop via 203.0.113.9  dev out4 weight 1
192.0.2.47 via 203.0.113.3 dev out1
192.0.2.48 via 203.0.113.3 dev out1
192.0.2.49 via 203.0.113.3 dev out1
192.0.2.50 via 203.0.113.3 dev out1

 

 来自:https://vincent.bernat.ch/en/blog/2017-ipv4-route-lookup-linux

Here are some examples of lookups and the associated results:

 

Destination IP | Next hop
192.0.2.49 203.0.113.3 via out1
192.0.2.50 203.0.113.3 via out1
192.0.2.51 203.0.113.7 via out3 or 203.0.113.9 via out4 (ECMP)
192.0.2.200 203.0.113.5 via out2

 

 

 

 

 

 

 

cat /proc/net/fib_trie
Id 100:
  +-- 0.0.0.0/0 2 0 2
     |-- 0.0.0.0
        /0 universe UNICAST
     +-- 192.0.2.0/26 2 0 1
        |-- 192.0.2.0
           /25 universe UNICAST
        |-- 192.0.2.47
           /32 universe UNICAST
        +-- 192.0.2.48/30 2 0 1
           |-- 192.0.2.48
              /32 universe UNICAST
           |-- 192.0.2.49
              /32 universe UNICAST
           |-- 192.0.2.50
              /32 universe UNICAST
[…]

 

 

 

 

 

 

posted @ 2022-03-05 12:58  codestacklinuxer  阅读(132)  评论(0)    收藏  举报