当前设备的接口ens32的ip地址为192.168.1.109,当前默认路由的网关为192.168.1.1。
# ip address
2: ens32: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000
link/ether 00:0c:29:a0:c2:ff brd ff:ff:ff:ff:ff:ff
inet 192.168.1.109/24 brd 192.168.1.255 scope global dynamic ens32
#
# ip route
default via 192.168.1.1 dev ens32 proto dhcp src 192.168.1.109 metric 100
192.168.1.0/24 dev ens32 proto kernel scope link src 192.168.1.109
如下,增加一个到主机192.168.1.102的路由,网关设置为192.168.1.1。但很明显,192.168.1.102就在本地链路上,可以直达,不需要经过192.168.1.1。
# ip route add 192.168.1.102/32 via 192.168.1.1
#
# ping 192.168.1.102
PING 192.168.1.102 (192.168.1.102) 56(84) bytes of data.
From 192.168.1.1: icmp_seq=1 Redirect Host(New nexthop: 192.168.1.102)
64 bytes from 192.168.1.102: icmp_seq=1 ttl=64 time=0.472 ms
^C
--- 192.168.1.102 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1023ms
rtt min/avg/max/mdev = 0.229/0.350/0.472/0.122 ms
网关192.168.1.1回复了重定向ICMP报文,之后的PING报文直接发送到192.168.1.102。
# ip -s route get 192.168.1.102
192.168.1.102 via 192.168.1.102 dev ens32 src 192.168.1.109 uid 1000
cache <redirected> expires 279sec users 2 age 19sec
# ip route show table cache
#
# ip route show cached
重定向发送
在生成路由项时,如果检测到报文的入口和出口设备相同,设备允许发送重定向报文,并且报文为IPv4报文,那么只要链路为共享介质类型(IN_DEV_SHARED_MEDIA),或者源地址和网关在同一链路上(inet_addr_onlink),就设置重定向标志(IPSKB_DOREDIRECT)。
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
struct fib_nh_common *nhc = FIB_RES_NHC(*res);
struct net_device *dev = nhc->nhc_dev;
struct fib_nh_exception *fnhe;
if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
skb->protocol == htons(ETH_P_IP)) {
__be32 gw;
gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
if (IN_DEV_SHARED_MEDIA(out_dev) ||
inet_addr_onlink(out_dev, saddr, gw))
IPCB(skb)->flags |= IPSKB_DOREDIRECT;
在转发函数中,发送ICMP重定向报文。
int ip_forward(struct sk_buff *skb)
{
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
重定向接收
对于code为ICMP_REDIR_HOST的ICMP重定向报文,首先查找(不存在则创建)其指定的新网关(192.168.1.102)的邻居表项。如果该邻居表项尚未处于有效状态(NUD_VALID),仅触发邻居解析;否则,在路由查找成功的情况下,由函数update_or_create_fnhe更新或者创建一个exception项,超时时长设置为ip_rt_gc_timeout,默认为300秒,可通过PROC文件/proc/sys/net/ipv4/route/gc_timeout修改。
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route)
{
__be32 new_gw = icmp_hdr(skb)->un.gateway;
struct fib_result res;
struct neighbour *n;
n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
if (!n)
n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
if (!IS_ERR(n)) {
if (!(n->nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
} else {
if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh_common *nhc;
fib_select_path(net, &res, fl4, skb);
nhc = FIB_RES_NHC(res);
update_or_create_fnhe(nhc, fl4->daddr, new_gw,
0, false, jiffies + ip_rt_gc_timeout);
}
对于下一跳exception存在的情况,更新网关值(仅当gw非零时)以及fnhe_expires超时值,并且由函数fill_route_from_fnhe更新相应的入口和出口路由缓存。
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
__be32 gw, u32 pmtu, bool lock, unsigned long expires)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
if (fnhe) {
if (fnhe->fnhe_genid != genid)
fnhe->fnhe_genid = genid;
if (gw)
fnhe->fnhe_gw = gw;
fnhe->fnhe_expires = max(1UL, expires);
/* Update all cached dsts too */
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt)
fill_route_from_fnhe(rt, fnhe);
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt)
fill_route_from_fnhe(rt, fnhe);
} else {
否则,在下一跳exception不存在的情况下,分配新的fnhe(如果哈希链深度超过FNHE_RECLAIM_DEPTH,则复用其中最旧的一项),初始化相应值。并且,如果之前存在入口或者出口路由缓存,将其设置为过期状态(DST_OBSOLETE_KILL)。
if (depth > FNHE_RECLAIM_DEPTH)
fnhe = fnhe_oldest(hash);
else {
fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
if (!fnhe)
goto out_unlock;
fnhe->fnhe_next = hash->chain;
rcu_assign_pointer(hash->chain, fnhe);
}
fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
fnhe->fnhe_mtu_locked = lock;
fnhe->fnhe_expires = max(1UL, expires);
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception applies to them.
*/
rt = rcu_dereference(nhc->nhc_rth_input);
if (rt)
rt->dst.obsolete = DST_OBSOLETE_KILL;
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
rt = rcu_dereference(*prt);
if (rt)
rt->dst.obsolete = DST_OBSOLETE_KILL;
}
}
fnhe->fnhe_stamp = jiffies;
重定向生成的路由项设置RTCF_REDIRECTED标志。
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
rt->rt_pmtu = fnhe->fnhe_pmtu;
rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
rt->dst.expires = fnhe->fnhe_expires;
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
rt->rt_uses_gateway = 1;
rt->rt_gw_family = AF_INET;
rt->rt_gw4 = fnhe->fnhe_gw;
cache信息
从路由缓存项中取得过期时间戳,如果其不为零,并且还没有到过期时刻,计算还有多长时间过期,否则将其设置为零。
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags)
{
r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
r->rtm_flags |= RTCF_DOREDIRECT;
expires = rt->dst.expires;
if (expires) {
unsigned long now = jiffies;
if (time_before(now, expires))
expires -= now;
else
expires = 0;
}
error = rt->dst.error;
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
成员变量rta_error为rt->dst.error的值,而rta_id固定为零。rta_lastuse为此路由缓存距最后一次使用(dst->lastuse)所经过的时间,rta_used为其使用次数(dst->__use),rta_clntref为引用计数(dst->__refcnt)。rta_expires记录距路由超时还剩余的时长,单位为clock_t。
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
long expires, u32 error)
{
struct rta_cacheinfo ci = {
.rta_error = error,
.rta_id = id,
};
if (dst) {
ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
ci.rta_used = dst->__use;
ci.rta_clntref = atomic_read(&dst->__refcnt);
}
if (expires) {
unsigned long clock;
clock = jiffies_to_clock_t(abs(expires));
clock = min_t(unsigned long, clock, INT_MAX);
ci.rta_expires = (expires > 0) ? clock : -clock;
}
return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
iproute2显示cache信息
在iproute2中由函数print_route显示路由信息。对于IPv4和IPv6的RTA_CACHEINFO属性,都由函数print_rta_cacheinfo显示。
int print_route(struct nlmsghdr *n, void *arg)
{
if (r->rtm_family == AF_INET) {
if (r->rtm_flags & RTM_F_CLONED)
print_cache_flags(fp, r->rtm_flags);
if (tb[RTA_CACHEINFO])
print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
} else if (r->rtm_family == AF_INET6) {
if (tb[RTA_CACHEINFO])
print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
}
如下,显示逻辑比较简单:函数将rta_expires和rta_lastuse除以用户态HZ换算为秒后显示;users、used和age仅在设置了show_stats(即指定-s选项)时显示。
static void print_rta_cacheinfo(FILE *fp, const struct rta_cacheinfo *ci)
{
static int hz;
if (!hz)
hz = get_user_hz();
if (ci->rta_expires != 0)
print_int(PRINT_ANY, "expires",
"expires %dsec ", ci->rta_expires/hz);
if (ci->rta_error != 0)
print_uint(PRINT_ANY, "error",
"error %u ", ci->rta_error);
if (show_stats) {
if (ci->rta_clntref)
print_uint(PRINT_ANY, "users",
"users %u ", ci->rta_clntref);
if (ci->rta_used != 0)
print_uint(PRINT_ANY, "used",
"used %u ", ci->rta_used);
if (ci->rta_lastuse != 0)
print_uint(PRINT_ANY, "age",
"age %usec ", ci->rta_lastuse/hz);
}
if (ci->rta_id)
print_0xhex(PRINT_ANY, "ipid",
"ipid 0x%04llx ", ci->rta_id);
if (ci->rta_ts || ci->rta_tsage) {
print_0xhex(PRINT_ANY, "ts",
"ts 0x%llx", ci->rta_ts);
print_uint(PRINT_ANY, "tsage",
超时处理
下一跳exception默认使用超时时长为300秒(RT_GC_TIMEOUT)。
118 #define RT_GC_TIMEOUT (300*HZ)
130 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
在函数find_exception中,如果检测到exception的超时时间(fnhe_expires)已过,由函数ip_del_fnhe删除exception。
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr)
{
struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
struct fib_nh_exception *fnhe;
u32 hval;
if (!hash)
return NULL;
hval = fnhe_hashfun(daddr);
for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
fnhe = rcu_dereference(fnhe->fnhe_next)) {
if (fnhe->fnhe_daddr == daddr) {
if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) {
ip_del_fnhe(nhc, daddr);
break;
}
return fnhe;
遍历exception的哈希链表,找到目的地址相同的项,将其从链表中摘除,由函数fnhe_flush_routes释放其对应的路由缓存,随后通过kfree_rcu释放exception自身。
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe, __rcu **fnhe_p;
u32 hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock));
hash += hval;
fnhe_p = &hash->chain;
fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
while (fnhe) {
if (fnhe->fnhe_daddr == daddr) {
rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
fnhe->fnhe_daddr = 0;
fnhe_flush_routes(fnhe);
kfree_rcu(fnhe, rcu);
break;
}
fnhe_p = &fnhe->fnhe_next;
fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock));
内核版本 5.10