IPv4路由exception缓存

Source

当前设备的接口ens32的IP地址为192.168.1.109,默认路由的网关为192.168.1.1。

# ip address
2: ens32: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000
    link/ether 00:0c:29:a0:c2:ff brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.109/24 brd 192.168.1.255 scope global dynamic ens32
#
# ip route
default via 192.168.1.1 dev ens32 proto dhcp src 192.168.1.109 metric 100 
192.168.1.0/24 dev ens32 proto kernel scope link src 192.168.1.109 

如下,增加一条到主机192.168.1.102的路由,网关设置为192.168.1.1。但很明显,192.168.1.102就在本地链路上,可以直达,不需要经过192.168.1.1。

# ip route add 192.168.1.102/32 via 192.168.1.1 
# 
# ping 192.168.1.102
PING 192.168.1.102 (192.168.1.102) 56(84) bytes of data.
From 192.168.1.1: icmp_seq=1 Redirect Host(New nexthop: 192.168.1.102)
64 bytes from 192.168.1.102: icmp_seq=1 ttl=64 time=0.472 ms
^C
--- 192.168.1.102 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1023ms
rtt min/avg/max/mdev = 0.229/0.350/0.472/0.122 ms

网关192.168.1.1回复了重定向ICMP报文,之后的PING报文直接发送到192.168.1.102。

# ip -s route get 192.168.1.102 
192.168.1.102 via 192.168.1.102 dev ens32 src 192.168.1.109 uid 1000 
    cache <redirected> expires 279sec users 2 age 19sec 
# ip route show table cache 
# 
# ip route show cached   

重定向发送

在生成路由项时,如果检测到报文的入口和出口设备相同,设备允许发送重定向报文,并且报文为IP协议报文(ETH_P_IP),那么只要链路为共享介质类型(IN_DEV_SHARED_MEDIA),或者报文源地址和下一跳网关在同一链路上(inet_addr_onlink),就设置重定向标志(IPSKB_DOREDIRECT)。

static int __mkroute_input(struct sk_buff *skb,
               const struct fib_result *res,
               struct in_device *in_dev,
               __be32 daddr, __be32 saddr, u32 tos)
{
    struct fib_nh_common *nhc = FIB_RES_NHC(*res);
    struct net_device *dev = nhc->nhc_dev;
    struct fib_nh_exception *fnhe;

    if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
        skb->protocol == htons(ETH_P_IP)) {
        __be32 gw;

        gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
        if (IN_DEV_SHARED_MEDIA(out_dev) ||
            inet_addr_onlink(out_dev, saddr, gw))
            IPCB(skb)->flags |= IPSKB_DOREDIRECT;

在转发函数中,发送ICMP重定向报文。

int ip_forward(struct sk_buff *skb)
{
    /*
     *  We now generate an ICMP HOST REDIRECT giving the route
     *  we calculated.
     */
    if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
        !skb_sec_path(skb))
        ip_rt_send_redirect(skb);

重定向接收

对于code为ICMP_REDIR_HOST的ICMP重定向报文,首先查找(不存在则创建)其指定的新网关(192.168.1.102)的邻居表项。如果该邻居表项尚未处于有效状态(NUD_VALID),仅触发邻居解析;否则,在路由查找成功的情况下,由函数update_or_create_fnhe更新或者创建一个exception项,超时时长设置为ip_rt_gc_timeout,默认为300秒,可通过PROC文件gc_timeout(/proc/sys/net/ipv4/route/gc_timeout)修改。

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route)
{
    __be32 new_gw = icmp_hdr(skb)->un.gateway;
    struct fib_result res;
    struct neighbour *n;

    n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
    if (!n)
        n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
    if (!IS_ERR(n)) {
        if (!(n->nud_state & NUD_VALID)) {
            neigh_event_send(n, NULL);
        } else {
            if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, skb);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                        0, false, jiffies + ip_rt_gc_timeout);
            }

对于下一跳exception已存在的情况,同步其genid,在gw非零时更新网关值,并更新fnhe_expires超时值;随后由函数fill_route_from_fnhe更新其关联的入口和出口路由缓存。

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                  __be32 gw, u32 pmtu, bool lock, unsigned long expires)
{
    struct fnhe_hash_bucket *hash;
    struct fib_nh_exception *fnhe;

    if (fnhe) {
        if (fnhe->fnhe_genid != genid)
            fnhe->fnhe_genid = genid;
        if (gw)
            fnhe->fnhe_gw = gw;

        fnhe->fnhe_expires = max(1UL, expires);
        /* Update all cached dsts too */
        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt)
            fill_route_from_fnhe(rt, fnhe);
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt)
            fill_route_from_fnhe(rt, fnhe);
    } else {

否则,在下一跳exception不存在的情况下,分配新的fnhe并插入链表头部(如果链表深度超过FNHE_RECLAIM_DEPTH,则不再分配,而是复用最旧的表项),初始化相应值。并且,如果该下一跳之前存在入口或者出口路由缓存,将其设置为过期状态(DST_OBSOLETE_KILL),使缓存的使用者重新检查此exception是否适用。

        if (depth > FNHE_RECLAIM_DEPTH)
            fnhe = fnhe_oldest(hash);
        else {
            fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
            if (!fnhe)
                goto out_unlock;

            fnhe->fnhe_next = hash->chain;
            rcu_assign_pointer(hash->chain, fnhe);
        }
        fnhe->fnhe_genid = genid;
        fnhe->fnhe_daddr = daddr;
        fnhe->fnhe_gw = gw;
        fnhe->fnhe_pmtu = pmtu;
        fnhe->fnhe_mtu_locked = lock;
        fnhe->fnhe_expires = max(1UL, expires);

        /* Exception created; mark the cached routes for the nexthop
         * stale, so anyone caching it rechecks if this exception applies to them.
         */
        rt = rcu_dereference(nhc->nhc_rth_input);
        if (rt)
            rt->dst.obsolete = DST_OBSOLETE_KILL;

        for_each_possible_cpu(i) {
            struct rtable __rcu **prt;
            prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
            rt = rcu_dereference(*prt);
            if (rt)
                rt->dst.obsolete = DST_OBSOLETE_KILL;
        }
    }
    fnhe->fnhe_stamp = jiffies;

重定向生成的路由项设置RTCF_REDIRECTED标志。

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
    rt->rt_pmtu = fnhe->fnhe_pmtu;
    rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
    rt->dst.expires = fnhe->fnhe_expires;

    if (fnhe->fnhe_gw) {
        rt->rt_flags |= RTCF_REDIRECTED;
        rt->rt_uses_gateway = 1;
        rt->rt_gw_family = AF_INET;
        rt->rt_gw4 = fnhe->fnhe_gw;

cache信息

从路由缓存项中取得过期时间戳,如果其不为零,并且还没有到过期时刻,计算还有多长时间过期,否则将其设置为零。

static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
            struct rtable *rt, u32 table_id, struct flowi4 *fl4,
            struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags)
{
    r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
    if (rt->rt_flags & RTCF_NOTIFY)
        r->rtm_flags |= RTM_F_NOTIFY;
    if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
        r->rtm_flags |= RTCF_DOREDIRECT;

    expires = rt->dst.expires;
    if (expires) {
        unsigned long now = jiffies;

        if (time_before(now, expires))
            expires -= now;
        else
            expires = 0;
    }

    error = rt->dst.error;

    if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
        goto nla_put_failure;

成员变量rta_error为rt->dst.error的值,而rta_id由调用者传入,此处固定为零。rta_lastuse为距离此路由缓存最后一次使用(dst->lastuse)所经过的时间,rta_used为其使用次数(dst->__use),rta_clntref为引用计数(dst->__refcnt)。rta_expires记录距离路由超时的剩余时长,其绝对值上限为INT_MAX。

int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
               long expires, u32 error)
{   
    struct rta_cacheinfo ci = {
        .rta_error = error,
        .rta_id =  id,
    };      
    
    if (dst) {
        ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
        ci.rta_used = dst->__use;
        ci.rta_clntref = atomic_read(&dst->__refcnt);
    }
    if (expires) {
        unsigned long clock;
        
        clock = jiffies_to_clock_t(abs(expires));
        clock = min_t(unsigned long, clock, INT_MAX);
        ci.rta_expires = (expires > 0) ? clock : -clock;
    }
    return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);

iproute2显示cache信息

在iproute2中由函数print_route显示路由信息。对于IPv4和IPv6的RTA_CACHEINFO属性,都由函数print_rta_cacheinfo显示。

int print_route(struct nlmsghdr *n, void *arg)
{
    if (r->rtm_family == AF_INET) {
        if (r->rtm_flags & RTM_F_CLONED)
            print_cache_flags(fp, r->rtm_flags);

        if (tb[RTA_CACHEINFO])
            print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
    } else if (r->rtm_family == AF_INET6) {
        if (tb[RTA_CACHEINFO])
            print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
    }

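函数print_rta_cacheinfo的实现如下,其中expires和age的值除以用户态HZ(get_user_hz)后以秒为单位显示;users、used和age仅在指定统计选项(show_stats,即-s)时显示。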

static void print_rta_cacheinfo(FILE *fp, const struct rta_cacheinfo *ci)
{   
    static int hz;
    
    if (!hz) 
        hz = get_user_hz();
    
    if (ci->rta_expires != 0)
        print_int(PRINT_ANY, "expires",
               "expires %dsec ", ci->rta_expires/hz);
    if (ci->rta_error != 0)
        print_uint(PRINT_ANY, "error",
               "error %u ", ci->rta_error);
    
    if (show_stats) {
        if (ci->rta_clntref)
            print_uint(PRINT_ANY, "users",
                   "users %u ", ci->rta_clntref);
        if (ci->rta_used != 0)
            print_uint(PRINT_ANY, "used",
                   "used %u ", ci->rta_used);
        if (ci->rta_lastuse != 0) 
            print_uint(PRINT_ANY, "age",
                   "age %usec ", ci->rta_lastuse/hz);
    }
    if (ci->rta_id)
        print_0xhex(PRINT_ANY, "ipid",
                "ipid 0x%04llx ", ci->rta_id);
    if (ci->rta_ts || ci->rta_tsage) {
        print_0xhex(PRINT_ANY, "ts",
                "ts 0x%llx", ci->rta_ts);
        print_uint(PRINT_ANY, "tsage",

超时处理

下一跳exception默认使用超时时长为300秒(RT_GC_TIMEOUT)。

 118 #define RT_GC_TIMEOUT (300*HZ)

 130 static int ip_rt_gc_timeout __read_mostly   = RT_GC_TIMEOUT;

在函数find_exception中,如果检测到exception的超时时间(fnhe_expires)已过,由函数ip_del_fnhe删除exception。

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr)
{
    struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
    struct fib_nh_exception *fnhe;
    u32 hval;

    if (!hash)
        return NULL;

    hval = fnhe_hashfun(daddr);

    for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
         fnhe = rcu_dereference(fnhe->fnhe_next)) {
        if (fnhe->fnhe_daddr == daddr) {
            if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) {
                ip_del_fnhe(nhc, daddr);
                break;
            }
            return fnhe;

在函数ip_del_fnhe中,持有fnhe_lock锁遍历对应哈希桶的exception链表,找到目的地址相同的项后将其从链表中摘除,由函数fnhe_flush_routes释放其对应的路由缓存,随后通过kfree_rcu释放exception自身。

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
    struct fnhe_hash_bucket *hash;
    struct fib_nh_exception *fnhe, __rcu **fnhe_p;
    u32 hval = fnhe_hashfun(daddr);

    spin_lock_bh(&fnhe_lock);

    hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock));
    hash += hval;

    fnhe_p = &hash->chain;
    fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
    while (fnhe) {
        if (fnhe->fnhe_daddr == daddr) {
            rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
            fnhe->fnhe_daddr = 0;
            fnhe_flush_routes(fnhe);
            kfree_rcu(fnhe, rcu);
            break;
        }
        fnhe_p = &fnhe->fnhe_next;
        fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock));

内核版本 5.10