Linux 内核版本:5.4
1. LC-trie 数据结构
Trie 树,又称字典树,是一种树形结构,可以用于保存字符串集合,并可在此集合中快速查找字符串。Trie 树中,从根节点到叶子节点的一条路径就代表一个字符串,从根节点到分支节点(非叶子节点)的一条路径就代表某个字符串的前缀。ipv4 网络地址可以看成 32 位二进制位,也即长度为 32 的 01 字符串。例如,对于 01 字符串集合:{000, 001, 0101, 10, 110}
,普通的二叉 trie 树如下图所示:
LC-trie 是经过路径压缩和层级压缩的 trie 树。
- 先对 trie 树进行路径压缩:除去树中度为 1 的节点。为了保证实现正确的匹配,需要在相应的位置保存除去的子串。
- 再对 trie 树进行层级压缩:自上而下地遍历二叉树,对于每个节点,找以其为根节点的最大的满子树,设其高度为 h,将该部分子树替换为高度为 1 的 $2^h$ 叉树。
2. Linux路由表
2.1 路由表组织
基于 LC-trie 的 Linux 路由表的组织结构如下图所示:
上图中,fib_table_hash
表示路由表哈希数组,哈希值就是路由表 ID,每个路由表都由一个fib_table
结构体表示,这个结构体尾部存放一个占位指针,用来指向路由 trie 树。
struct fib_table {
struct hlist_node tb_hlist;
u32 tb_id;
int tb_num_default;
struct rcu_head rcu;
unsigned long *tb_data;
unsigned long __data[0];
};
Trie 树中的节点都包含一个key_vector
结构体,该结构体包含以下信息:
-
key 为节点的关键字,前缀相同的节点存储在该节点下
-
pos、bits 用于子节点的索引,
key[pos, pos + bits]
为子节点的索引。共有 bits 位二进制用于索引子节点,也就是该节点下能存储个子节点。leaf/tnode 用于指向叶子节点的路由信息或者子节点的信息,通过IS_LEAF
宏来判断该节点是否是叶子节点,也就是 bits 为 0 的时候,该节点是叶子节点,否则是中间节点,该节点下存储其他子节点。
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
- slen 为后缀长度
struct key_vector {
t_key key;
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char slen;
union {
/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
struct hlist_head leaf;
/* This array is valid if (pos | bits) > 0 (TNODE) */
struct key_vector __rcu *tnode[0];
};
};
fib_info
结构对应路由表的路由信息字段。其中保存了相应路由的优先级、下一跳地址、转发数据包的网路设备等重要信息。fib_info
结构可能占用较多的空间。假设需要为同一个目标地址创建多个路由表项,这些路由表项的唯一差别为服务类型(TOS),若采用创建多个fib_info
结构的方式,可能带来很大的内存开销。
于是fib_alias
结构应运而生,fib_alias
用于连接 trie 树叶子节点和fib_info
结构。同一个目标地址的所有fib_alias
结构被组织成链表,每个fib_alias
结构中保存了服务类型、指向对应fib_info
的指针等信息。
fib_nh
结构中保存了下一跳路由的地址。
struct fib_alias {
struct hlist_node fa_list;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
u8 fa_state;
u8 fa_slen;
u32 tb_id;
s16 fa_default;
struct rcu_head rcu;
};
struct fib_info {
struct hlist_node fib_hash;
struct hlist_node fib_lhash;
struct list_head nh_list;
struct net *fib_net;
int fib_treeref;
refcount_t fib_clntref;
unsigned int fib_flags;
unsigned char fib_dead;
unsigned char fib_protocol;
unsigned char fib_scope;
unsigned char fib_type;
__be32 fib_prefsrc;
u32 fib_tb_id;
u32 fib_priority;
struct dst_metrics *fib_metrics;
#define fib_mtu fib_metrics->metrics[RTAX_MTU-1]
#define fib_window fib_metrics->metrics[RTAX_WINDOW-1]
#define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
int fib_nhs;
bool fib_nh_is_v6;
bool nh_updated;
struct nexthop *nh;
struct rcu_head rcu;
struct fib_nh fib_nh[0];
};
struct fib_nh {
struct fib_nh_common nh_common;
struct hlist_node nh_hash;
struct fib_info *nh_parent;
#ifdef CONFIG_IP_ROUTE_CLASSID
__u32 nh_tclassid;
#endif
__be32 nh_saddr;
int nh_saddr_genid;
#define fib_nh_family nh_common.nhc_family
#define fib_nh_dev nh_common.nhc_dev
#define fib_nh_oif nh_common.nhc_oif
#define fib_nh_flags nh_common.nhc_flags
#define fib_nh_lws nh_common.nhc_lwtstate
#define fib_nh_scope nh_common.nhc_scope
#define fib_nh_gw_family nh_common.nhc_gw_family
#define fib_nh_gw4 nh_common.nhc_gw.ipv4
#define fib_nh_gw6 nh_common.nhc_gw.ipv6
#define fib_nh_weight nh_common.nhc_weight
#define fib_nh_upper_bound nh_common.nhc_upper_bound
};
2.2 路由项插入
static int fib_insert_node(struct trie *t, struct key_vector *tp,
struct fib_alias *new, t_key key)
{
struct key_vector *n, *l;
l = leaf_new(key, new);
if (!l)
goto noleaf;
/* retrieve child from parent node */
n = get_child(tp, get_index(key, tp));
/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
*
* Add a new tnode here
* first tnode need some special handling
* leaves us in position for handling as case 3
*/
if (n) {
struct key_vector *tn;
tn = tnode_new(key, __fls(key ^ n->key), 1);
if (!tn)
goto notnode;
/* initialize routes out of node */
NODE_INIT_PARENT(tn, tp);
put_child(tn, get_index(key, tn) ^ 1, n);
/* start adding routes into the node */
put_child_root(tp, key, tn);
node_set_parent(n, tn);
/* parent now has a NULL spot where the leaf can go */
tp = tn;
}
/* Case 3: n is NULL, and will just insert a new leaf */
node_push_suffix(tp, new->fa_slen);
NODE_INIT_PARENT(l, tp);
put_child_root(tp, key, l);
trie_rebalance(t, tp);
return 0;
notnode:
node_free(l);
noleaf:
return -ENOMEM;
}
2.3 路由项查找
static struct key_vector *fib_find_node(struct trie *t,
struct key_vector **tp, u32 key)
{
struct key_vector *pn, *n = t->kv;
unsigned long index = 0;
do {
pn = n;
n = get_child_rcu(n, index);
if (!n)
break;
index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the bits in the cindex. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
* if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
*
* This check is safe even if bits == KEYLENGTH due to the
* fact that we can only allocate a node with 32 bits if a
* long is greater than 32 bits.
*/
if (index >= (1ul << n->bits)) {
n = NULL;
break;
}
/* keep searching until we find a perfect match leaf or NULL */
} while (IS_TNODE(n));
*tp = pn;
return n;
}