Diffstat (limited to 'net/core/dev.c')
-rw-r--r--	net/core/dev.c	179
1 file changed, 146 insertions(+), 33 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 4906b44af850..751e5264fd49 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -98,6 +98,7 @@
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
+#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
@@ -1130,7 +1131,7 @@ EXPORT_SYMBOL(__dev_get_by_flags);
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
@@ -4840,6 +4841,21 @@ int netif_rx_ni(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx_ni);
+int netif_rx_any_context(struct sk_buff *skb)
+{
+ /*
+ * If invoked from contexts which do not invoke bottom half
+ * processing either at return from interrupt or when softirqs are
+ * reenabled, use netif_rx_ni() which invokes bottom half processing
+ * directly.
+ */
+ if (in_interrupt())
+ return netif_rx(skb);
+ else
+ return netif_rx_ni(skb);
+}
+EXPORT_SYMBOL(netif_rx_any_context);
+
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
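
A minimal caller sketch (not part of this patch): with the new helper, a driver whose receive completion can run in either hardirq or task context no longer needs to pick between netif_rx() and netif_rx_ni() itself. The example_priv structure and function names below are hypothetical.

	/* hypothetical RX completion that may run in hardirq or task context */
	static void example_rx(struct example_priv *priv, struct sk_buff *skb)
	{
		skb->protocol = eth_type_trans(skb, priv->netdev);

		/* picks netif_rx() vs netif_rx_ni() based on in_interrupt() */
		netif_rx_any_context(skb);
	}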
@@ -4914,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
+ struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4958,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
- skb_do_redirect(skb);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
return NULL;
case TC_ACT_CONSUMED:
return NULL;
@@ -5147,7 +5167,12 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ bool another = false;
+
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+ &another);
+ if (another)
+ goto another_round;
if (!skb)
goto out;
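
The -EAGAIN handling above pairs with the BPF redirect-to-peer fast path: skb_do_redirect() retargets skb->dev and asks the caller to loop rather than requeue the packet. A sketch of a TC program exercising it, assuming the bpf_redirect_peer() helper from the same series and a hypothetical PEER_IFINDEX:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#define PEER_IFINDEX 4	/* hypothetical target device index */

	SEC("classifier")
	int tc_redirect(struct __sk_buff *skb)
	{
		/* on success the kernel re-runs __netif_receive_skb_core()
		 * for the peer device via the new another_round path
		 */
		return bpf_redirect_peer(PEER_IFINDEX, 0);
	}

	char LICENSE[] SEC("license") = "GPL";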
@@ -5192,7 +5217,7 @@ skip_classify:
}
}
- if (unlikely(skb_vlan_tag_present(skb))) {
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
@@ -5441,15 +5466,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
if (new) {
u32 i;
+ mutex_lock(&new->aux->used_maps_mutex);
+
/* generic XDP does not work with DEVMAPs that can
* have a bpf_prog installed on an entry
*/
for (i = 0; i < new->aux->used_map_cnt; i++) {
- if (dev_map_can_have_prog(new->aux->used_maps[i]))
- return -EINVAL;
- if (cpu_map_prog_allowed(new->aux->used_maps[i]))
+ if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
+ cpu_map_prog_allowed(new->aux->used_maps[i])) {
+ mutex_unlock(&new->aux->used_maps_mutex);
return -EINVAL;
+ }
}
+
+ mutex_unlock(&new->aux->used_maps_mutex);
}
switch (xdp->command) {
@@ -5621,17 +5651,60 @@ static void flush_backlog(struct work_struct *work)
local_bh_enable();
}
+static bool flush_required(int cpu)
+{
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
+
+ local_irq_disable();
+ rps_lock(sd);
+
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
+ */
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ rps_unlock(sd);
+ local_irq_enable();
+
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end-up
+ * containing a lot of packets.
+ */
+ return true;
+}
+
static void flush_all_backlogs(void)
{
+ static cpumask_t flush_cpus;
unsigned int cpu;
+ /* since we are under rtnl lock protection we can use static data
+ * for the cpumask and avoid allocating on stack the possibly
+ * large mask
+ */
+ ASSERT_RTNL();
+
get_online_cpus();
- for_each_online_cpu(cpu)
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
+ cpumask_clear(&flush_cpus);
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+ cpumask_set_cpu(cpu, &flush_cpus);
+ }
+ }
- for_each_online_cpu(cpu)
+ /* we can have in flight packet[s] on the cpus we are not flushing,
+ * synchronize_net() in rollback_registered_many() will take care of
+ * them
+ */
+ for_each_cpu(cpu, &flush_cpus)
flush_work(per_cpu_ptr(&flush_works, cpu));
put_online_cpus();
@@ -6293,7 +6366,7 @@ EXPORT_SYMBOL(__napi_schedule);
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable
+ * it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
@@ -6533,8 +6606,7 @@ EXPORT_SYMBOL(napi_busy_loop);
static void napi_hash_add(struct napi_struct *napi)
{
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
spin_lock(&napi_hash_lock);
@@ -6555,20 +6627,14 @@ static void napi_hash_add(struct napi_struct *napi)
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-bool napi_hash_del(struct napi_struct *napi)
+static void napi_hash_del(struct napi_struct *napi)
{
- bool rcu_sync_needed = false;
-
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
- rcu_sync_needed = true;
- hlist_del_rcu(&napi->napi_hash_node);
- }
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
spin_unlock(&napi_hash_lock);
- return rcu_sync_needed;
}
-EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
@@ -6600,7 +6666,11 @@ static void init_gro_hash(struct napi_struct *napi)
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
INIT_LIST_HEAD(&napi->poll_list);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
@@ -6653,18 +6723,19 @@ static void flush_gro_hash(struct napi_struct *napi)
}
/* Must be called in process context */
-void netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del(struct napi_struct *napi)
{
- might_sleep();
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ napi_hash_del(napi);
+ list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
flush_gro_hash(napi);
napi->gro_bitmask = 0;
}
-EXPORT_SYMBOL(netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
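
Drivers keep calling netif_napi_del(); the header-side wrapper (outside this file) pairs the unlisting with the RCU grace period that the old napi_hash_del() callers handled by hand, roughly:

	/* sketch of the netdevice.h counterpart, not part of this hunk */
	static inline void netif_napi_del(struct napi_struct *napi)
	{
		__netif_napi_del(napi);
		synchronize_net();
	}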
@@ -9533,7 +9604,7 @@ int __netdev_update_features(struct net_device *dev)
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
- /* some features can't be enabled if they're off an an upper device */
+ /* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
@@ -10037,6 +10108,8 @@ int netdev_refcnt_read(const struct net_device *dev)
}
EXPORT_SYMBOL(netdev_refcnt_read);
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
/**
* netdev_wait_allrefs - wait until all references are gone.
* @dev: target net_device
@@ -10052,7 +10125,7 @@ EXPORT_SYMBOL(netdev_refcnt_read);
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
- int refcnt;
+ int wait = 0, refcnt;
linkwatch_forget_dev(dev);
@@ -10086,7 +10159,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
rebroadcast_time = jiffies;
}
- msleep(250);
+ if (!wait) {
+ rcu_barrier();
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
refcnt = netdev_refcnt_read(dev);
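
With these bounds the sleep between refcount polls grows 1, 2, 4, ... 128, 250, 250 ms, after an initial rcu_barrier() in place of the first sleep, so devices whose references drop promptly no longer pay a fixed 250 ms per iteration.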
@@ -10249,6 +10328,40 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
}
EXPORT_SYMBOL(dev_get_stats);
+/**
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
+ * @s: place to store stats
+ * @netstats: per-cpu network stats to read from
+ *
+ * Read per-cpu network statistics and populate the related fields in @s.
+ */
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_sw_netstats __percpu *netstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct pcpu_sw_netstats *stats;
+ struct pcpu_sw_netstats tmp;
+ unsigned int start;
+
+ stats = per_cpu_ptr(netstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ tmp.rx_packets = stats->rx_packets;
+ tmp.rx_bytes = stats->rx_bytes;
+ tmp.tx_packets = stats->tx_packets;
+ tmp.tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += tmp.rx_packets;
+ s->rx_bytes += tmp.rx_bytes;
+ s->tx_packets += tmp.tx_packets;
+ s->tx_bytes += tmp.tx_bytes;
+ }
+}
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
+
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
struct netdev_queue *queue = dev_ingress_queue(dev);
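
A typical consumer of the new helper is a driver's .ndo_get_stats64 implementation; a minimal sketch, assuming a hypothetical driver that keeps its pcpu_sw_netstats pointer in priv->tstats:

	/* hypothetical driver glue showing the intended call pattern */
	static void example_get_stats64(struct net_device *dev,
					struct rtnl_link_stats64 *stats)
	{
		struct example_priv *priv = netdev_priv(dev);

		netdev_stats_to_stats64(stats, &dev->stats);
		dev_fetch_sw_netstats(stats, priv->tstats);
	}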