Commit ffe95ecf authored by David S. Miller's avatar David S. Miller

Merge branch 'net-remove-dst-garbage-collector-logic'

Wei Wang says:

====================
remove dst garbage collector logic

The current mechanism of dst release is a bit complicated. It is because
the users of dst get divided into 2 situations:
  1. Most users take the reference count when using a dst and release the
     reference count when done.
  2. Exceptional users like IPv4/IPv6/decnet/xfrm routing code do not take
     reference count when referencing to a dst due to some histotic reasons.

Due to those exceptional use cases in 2, reference count being 0 is not an
adequate evidence to indicate that no user is using this dst. So users in 1
can't free the dst simply based on reference count being 0 because users in
2 might still hold reference to it.
Instead, a dst garbage list is needed to hold the dst entries that already
get removed by the users in 2 but are still held by users in 1. And a periodic
garbage collector task is run to check all the dst entries in the list to see
if the users in 1 have released the reference to those dst entries.
If so, the dst is now ready to be freed.

This logic introduces unnecessary complications in the dst code which makes it
hard to understand and to debug.

In order to get rid of the whole dst garbage collector (gc) and make the dst
code more unified and simplified, we can make the users in 2 also take reference
count on the dst and release it properly when done.
This way, dst can be safely freed once the refcount drops to 0 and no gc
thread is needed anymore.

This patch series' target is to completely get rid of dst gc logic and free
dst based on reference count only.
Patch 1-3 are preparation patches to do some cleanup/improvement on the existing
code to make later work easier.
Patch 4-21 are real implementations.
In these patches, a temporary flag DST_NOGC is used to help transition
those exceptional users one by one. Once every component is transitioned,
this temporary flag is removed.
By the end of this patch series, all dst are refcounted when being used
and released when done. And dst will be freed when its refcount drops to 0.
No dst gc task is running anymore.

Note: This patch series depends on the decnet fix that was sent right before:
      "decnet: always not take dst->__refcnt when inserting dst into hash table"

v2:
  add curly braces in udp_v4/6_early_demux() in patch 02
  add EXPORT_SYMBOL() for dst_dev_put() in patch 05
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 273889e3 44ebe791
......@@ -563,7 +563,7 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
static int vrf_rt6_create(struct net_device *dev)
{
int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE;
int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM;
struct net_vrf *vrf = netdev_priv(dev);
struct net *net = dev_net(dev);
struct fib6_table *rt6i_table;
......@@ -583,8 +583,6 @@ static int vrf_rt6_create(struct net_device *dev)
if (!rt6)
goto out;
dst_hold(&rt6->dst);
rt6->rt6i_table = rt6i_table;
rt6->dst.output = vrf_output6;
......@@ -597,8 +595,6 @@ static int vrf_rt6_create(struct net_device *dev)
goto out;
}
dst_hold(&rt6_local->dst);
rt6_local->rt6i_idev = in6_dev_get(dev);
rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL;
rt6_local->rt6i_table = rt6i_table;
......
......@@ -51,13 +51,11 @@ struct dst_entry {
#define DST_HOST 0x0001
#define DST_NOXFRM 0x0002
#define DST_NOPOLICY 0x0004
#define DST_NOHASH 0x0008
#define DST_NOCACHE 0x0010
#define DST_NOCOUNT 0x0020
#define DST_FAKE_RTABLE 0x0040
#define DST_XFRM_TUNNEL 0x0080
#define DST_XFRM_QUEUE 0x0100
#define DST_METADATA 0x0200
#define DST_NOCOUNT 0x0008
#define DST_FAKE_RTABLE 0x0010
#define DST_XFRM_TUNNEL 0x0020
#define DST_XFRM_QUEUE 0x0040
#define DST_METADATA 0x0080
short error;
......@@ -253,7 +251,7 @@ static inline void dst_hold(struct dst_entry *dst)
* __pad_to_align_refcnt declaration in struct dst_entry
*/
BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
atomic_inc(&dst->__refcnt);
WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
}
static inline void dst_use(struct dst_entry *dst, unsigned long time)
......@@ -278,6 +276,8 @@ static inline struct dst_entry *dst_clone(struct dst_entry *dst)
void dst_release(struct dst_entry *dst);
void dst_release_immediate(struct dst_entry *dst);
static inline void refdst_drop(unsigned long refdst)
{
if (!(refdst & SKB_DST_NOREF))
......@@ -334,10 +334,7 @@ static inline void skb_dst_force(struct sk_buff *skb)
*/
static inline bool dst_hold_safe(struct dst_entry *dst)
{
if (dst->flags & DST_NOCACHE)
return atomic_inc_not_zero(&dst->__refcnt);
dst_hold(dst);
return true;
return atomic_inc_not_zero(&dst->__refcnt);
}
/**
......@@ -423,26 +420,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
struct net_device *dev, int initial_ref, int initial_obsolete,
unsigned short flags);
void __dst_free(struct dst_entry *dst);
struct dst_entry *dst_destroy(struct dst_entry *dst);
static inline void dst_free(struct dst_entry *dst)
{
if (dst->obsolete > 0)
return;
if (!atomic_read(&dst->__refcnt)) {
dst = dst_destroy(dst);
if (!dst)
return;
}
__dst_free(dst);
}
static inline void dst_rcu_free(struct rcu_head *head)
{
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst_free(dst);
}
void dst_dev_put(struct dst_entry *dst);
static inline void dst_confirm(struct dst_entry *dst)
{
......@@ -505,8 +484,6 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
return dst;
}
void dst_subsys_init(void);
/* Flags for xfrm_lookup flags argument. */
enum {
XFRM_LOOKUP_ICMP = 1 << 0,
......
......@@ -170,7 +170,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_PCPU ||
(unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from))
(unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
rt = (struct rt6_info *)(rt->dst.from);
return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
......
......@@ -116,7 +116,6 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif, int flags);
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
int icmp6_dst_gc(void);
void fib6_force_start_gc(struct net *net);
......
......@@ -190,7 +190,9 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
rcu_read_lock();
err = ip_route_input_noref(skb, dst, src, tos, devin);
if (!err)
skb_dst_force(skb);
skb_dst_force_safe(skb);
if (!skb_dst(skb))
err = -EINVAL;
rcu_read_unlock();
return err;
......
......@@ -8681,7 +8681,6 @@ static int __init net_dev_init(void)
rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
dst_subsys_init();
rc = 0;
out:
return rc;
......
......@@ -42,108 +42,6 @@
* to dirty as few cache lines as possible in __dst_free().
* As this is not a very strong hint, we dont force an alignment on SMP.
*/
static struct {
spinlock_t lock;
struct dst_entry *list;
unsigned long timer_inc;
unsigned long timer_expires;
} dst_garbage = {
.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
.timer_inc = DST_GC_MAX,
};
static void dst_gc_task(struct work_struct *work);
static void ___dst_free(struct dst_entry *dst);
static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
static DEFINE_MUTEX(dst_gc_mutex);
/*
* long lived entries are maintained in this list, guarded by dst_gc_mutex
*/
static struct dst_entry *dst_busy_list;
static void dst_gc_task(struct work_struct *work)
{
int delayed = 0;
int work_performed = 0;
unsigned long expires = ~0L;
struct dst_entry *dst, *next, head;
struct dst_entry *last = &head;
mutex_lock(&dst_gc_mutex);
next = dst_busy_list;
loop:
while ((dst = next) != NULL) {
next = dst->next;
prefetch(&next->next);
cond_resched();
if (likely(atomic_read(&dst->__refcnt))) {
last->next = dst;
last = dst;
delayed++;
continue;
}
work_performed++;
dst = dst_destroy(dst);
if (dst) {
/* NOHASH and still referenced. Unless it is already
* on gc list, invalidate it and add to gc list.
*
* Note: this is temporary. Actually, NOHASH dst's
* must be obsoleted when parent is obsoleted.
* But we do not have state "obsoleted, but
* referenced by parent", so it is right.
*/
if (dst->obsolete > 0)
continue;
___dst_free(dst);
dst->next = next;
next = dst;
}
}
spin_lock_bh(&dst_garbage.lock);
next = dst_garbage.list;
if (next) {
dst_garbage.list = NULL;
spin_unlock_bh(&dst_garbage.lock);
goto loop;
}
last->next = NULL;
dst_busy_list = head.next;
if (!dst_busy_list)
dst_garbage.timer_inc = DST_GC_MAX;
else {
/*
* if we freed less than 1/10 of delayed entries,
* we can sleep longer.
*/
if (work_performed <= delayed/10) {
dst_garbage.timer_expires += dst_garbage.timer_inc;
if (dst_garbage.timer_expires > DST_GC_MAX)
dst_garbage.timer_expires = DST_GC_MAX;
dst_garbage.timer_inc += DST_GC_INC;
} else {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
}
expires = dst_garbage.timer_expires;
/*
* if the next desired timer is more than 4 seconds in the
* future then round the timer to whole seconds
*/
if (expires > 4*HZ)
expires = round_jiffies_relative(expires);
schedule_delayed_work(&dst_gc_work, expires);
}
spin_unlock_bh(&dst_garbage.lock);
mutex_unlock(&dst_gc_mutex);
}
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
......@@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
}
EXPORT_SYMBOL(dst_alloc);
static void ___dst_free(struct dst_entry *dst)
{
/* The first case (dev==NULL) is required, when
protocol module is unloaded.
*/
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
dst->input = dst_discard;
dst->output = dst_discard_out;
}
dst->obsolete = DST_OBSOLETE_DEAD;
}
void __dst_free(struct dst_entry *dst)
{
spin_lock_bh(&dst_garbage.lock);
___dst_free(dst);
dst->next = dst_garbage.list;
dst_garbage.list = dst;
if (dst_garbage.timer_inc > DST_GC_INC) {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
mod_delayed_work(system_wq, &dst_gc_work,
dst_garbage.timer_expires);
}
spin_unlock_bh(&dst_garbage.lock);
}
EXPORT_SYMBOL(__dst_free);
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
smp_rmb();
again:
child = dst->child;
if (!(dst->flags & DST_NOCOUNT))
......@@ -269,20 +138,8 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst) {
int nohash = dst->flags & DST_NOHASH;
if (atomic_dec_and_test(&dst->__refcnt)) {
/* We were real parent of this dst, so kill child. */
if (nohash)
goto again;
} else {
/* Child is still referenced, return it for freeing. */
if (nohash)
return dst;
/* Child is still in his hash table */
}
}
if (dst)
dst_release_immediate(dst);
return NULL;
}
EXPORT_SYMBOL(dst_destroy);
......@@ -292,26 +149,62 @@ static void dst_destroy_rcu(struct rcu_head *head)
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst = dst_destroy(dst);
if (dst)
__dst_free(dst);
}
/* Operations to mark dst as DEAD and clean up the net device referenced
* by dst:
* 1. put the dst under loopback interface and discard all tx/rx packets
* on this route.
* 2. release the net_device
* This function should be called when removing routes from the fib tree
* in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
* make the next dst_ops->check() fail.
*/
void dst_dev_put(struct dst_entry *dst)
{
struct net_device *dev = dst->dev;
dst->obsolete = DST_OBSOLETE_DEAD;
if (dst->ops->ifdown)
dst->ops->ifdown(dst, dev, true);
dst->input = dst_discard;
dst->output = dst_discard_out;
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
dev_put(dev);
}
EXPORT_SYMBOL(dst_dev_put);
void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
if (!newrefcnt && unlikely(nocache))
if (!newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
void dst_release_immediate(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
if (!newrefcnt)
dst_destroy(dst);
}
}
EXPORT_SYMBOL(dst_release_immediate);
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
......@@ -377,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
dst = &md_dst->dst;
dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
DST_METADATA | DST_NOCOUNT);
dst->input = dst_md_discard;
dst->output = dst_md_discard_out;
......@@ -423,86 +316,3 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
* now. _It_ _is_ _explicit_ _deliberate_
* _race_ _condition_.
*
* Commented and originally written by Alexey.
*/
static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int unregister)
{
if (dst->ops->ifdown)
dst->ops->ifdown(dst, dev, unregister);
if (dev != dst->dev)
return;
if (!unregister) {
dst->input = dst_discard;
dst->output = dst_discard_out;
} else {
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
dev_put(dev);
}
}
static int dst_dev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct dst_entry *dst, *last = NULL;
switch (event) {
case NETDEV_UNREGISTER_FINAL:
case NETDEV_DOWN:
mutex_lock(&dst_gc_mutex);
for (dst = dst_busy_list; dst; dst = dst->next) {
last = dst;
dst_ifdown(dst, dev, event != NETDEV_DOWN);
}
spin_lock_bh(&dst_garbage.lock);
dst = dst_garbage.list;
dst_garbage.list = NULL;
/* The code in dst_ifdown places a hold on the loopback device.
* If the gc entry processing is set to expire after a lengthy
* interval, this hold can cause netdev_wait_allrefs() to hang
* out and wait for a long time -- until the the loopback
* interface is released. If we're really unlucky, it'll emit
* pr_emerg messages to console too. Reset the interval here,
* so dst cleanups occur in a more timely fashion.
*/
if (dst_garbage.timer_inc > DST_GC_INC) {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
mod_delayed_work(system_wq, &dst_gc_work,
dst_garbage.timer_expires);
}
spin_unlock_bh(&dst_garbage.lock);
if (last)
last->next = dst;
else
dst_busy_list = dst;
for (; dst; dst = dst->next)
dst_ifdown(dst, dev, event != NETDEV_DOWN);
mutex_unlock(&dst_gc_mutex);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block dst_dev_notifier = {
.notifier_call = dst_dev_event,
.priority = -10, /* must be called after other network notifiers */
};
void __init dst_subsys_init(void)
{
register_netdevice_notifier(&dst_dev_notifier);
}
......@@ -183,11 +183,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
return dn_rt_hash_mask & (unsigned int)tmp;
}
static inline void dnrt_free(struct dn_route *rt)
{
call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static void dn_dst_check_expire(unsigned long dummy)
{
int i;
......@@ -202,14 +197,15 @@ static void dn_dst_check_expire(unsigned long dummy)
spin_lock(&dn_rt_hash_table[i].lock);
while ((rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
if (atomic_read(&rt->dst.__refcnt) ||
(now - rt->dst.lastuse) < expire) {
if (atomic_read(&rt->dst.__refcnt) > 1 ||
(now - rt->dst.lastuse) < expire) {
rtp = &rt->dst.dn_next;
continue;
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
dnrt_free(rt);
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
}
spin_unlock(&dn_rt_hash_table[i].lock);
......@@ -235,14 +231,15 @@ static int dn_dst_gc(struct dst_ops *ops)
while ((rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
if (atomic_read(&rt->dst.__refcnt) ||
(now - rt->dst.lastuse) < expire) {
if (atomic_read(&rt->dst.__refcnt) > 1 ||
(now - rt->dst.lastuse) < expire) {
rtp = &rt->dst.dn_next;
continue;
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
dnrt_free(rt);
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
break;
}
spin_unlock_bh(&dn_rt_hash_table[i].lock);
......@@ -344,7 +341,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
dst_use(&rth->dst, now);
spin_unlock_bh(&dn_rt_hash_table[hash].lock);
dst_free(&rt->dst);
dst_release_immediate(&rt->dst);
*rp = rth;
return 0;
}
......@@ -374,7 +371,8 @@ static void dn_run_flush(unsigned long dummy)
for(; rt; rt = next) {
next = rcu_dereference_raw(rt->dst.dn_next);
RCU_INIT_POINTER(rt->dst.dn_next, NULL);
dnrt_free(rt);
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
}
nothing_to_declare:
......@@ -1181,7 +1179,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
......@@ -1215,6 +1213,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o
goto e_neighbour;
hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
/* dn_insert_route() increments dst->__refcnt */
dn_insert_route(rt, hash, (struct dn_route **)pprt);
done:
......@@ -1237,7 +1236,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *o
err = -ENOBUFS;
goto done;
e_neighbour:
dst_free(&rt->dst);
dst_release_immediate(&rt->dst);
goto e_nobufs;
}
......@@ -1445,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST);
rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
......@@ -1491,6 +1490,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
goto e_neighbour;
hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
/* dn_insert_route() increments dst->__refcnt */
dn_insert_route(rt, hash, &rt);
skb_dst_set(skb, &rt->dst);
......@@ -1514,7 +1514,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
goto done;
e_neighbour:
dst_free(&rt->dst);
dst_release_immediate(&rt->dst);
goto done;
}
......
......@@ -152,7 +152,8 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
* free_fib_info_rcu()
*/
dst_free(&rt->dst);
dst_dev_put(&rt->dst);
dst_release_immediate(&rt->dst);
}
static void free_nh_exceptions(struct fib_nh *nh)
......@@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
struct rtable *rt;
rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
if (rt)
dst_free(&rt->dst);
if (rt) {
dst_dev_put(&rt->dst);
dst_release_immediate(&rt->dst);
}
}
free_percpu(rtp);
}
......
......@@ -589,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
build_sk_flow_key(fl4, sk);
}
static inline void rt_free(struct rtable *rt)
{
call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
......@@ -603,12 +598,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
rt_free(rt);
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
}
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
rt_free(rt);
dst_dev_put(&rt->dst);
dst_release(&rt->dst);
}
}
......@@ -1302,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
__be32 daddr)
__be32 daddr, const bool do_cache)
{
bool ret = false;
......@@ -1331,10 +1328,13 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
if (!(rt->dst.flags & DST_NOCACHE)) {
if (do_cache) {
dst_hold(&rt->dst);
rcu_assign_pointer(*porig, rt);
if (orig)
rt_free(orig);
if (orig) {
dst_dev_put(&orig->dst);
dst_release(&orig->dst);
}
ret = true;
}
......@@ -1357,12 +1357,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
}
orig = *p;
/* hold dst before doing cmpxchg() to avoid race condition
* on this dst
*/
dst_hold(&rt->dst);
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
if (orig)
rt_free(orig);
} else
if (orig) {
dst_dev_put(&orig->dst);
dst_release(&orig->dst);
}
} else {
dst_release(&rt->dst);
ret = false;
}
return ret;
}
......@@ -1433,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt)
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
struct fib_nh_exception *fnhe,
struct fib_info *fi, u16 type, u32 itag)
struct fib_info *fi, u16 type, u32 itag,
const bool do_cache)
{
bool cached = false;
......@@ -1454,8 +1463,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#endif
rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr);
else if (!(rt->dst.flags & DST_NOCACHE))
cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
else if (do_cache)
cached = rt_cache_route(nh, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
......@@ -1463,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
rt->dst.flags |= DST_NOCACHE;
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
rt_add_uncached_list(rt);
......@@ -1486,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
(will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
(will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
......@@ -1730,7 +1738,8 @@ static int __mkroute_input(struct sk_buff *skb,
rth->dst.input = ip_forward;