author     Justin Pettit <jpettit@nicira.com>  2014-04-28 14:25:06 -0700
committer  Justin Pettit <jpettit@nicira.com>  2014-07-25 12:05:20 -0700
commit     816f3bca9f5d23a0cf3b2ec922382f72b7b1b0d6
tree       d0c40aa9be6af9ab84fb2d0b93d8152787806861 /datapath
parent     df0e5f55763289e37f90d1f2464423f07478f372
Initial check-in of kernel-based elephant flow detection. (branch: elephant)
Areas to work on:
- Doesn't populate the "elephant-flows" field.
- Doesn't properly handle tunnels.
- Doesn't have a clean way to query the elephant table.
- Double-check locking.
- Should use names instead of numbers for the detection mechanism.
- When changing the detection mechanism, the old table should be cleared.
- Breaks unit tests.
Diffstat (limited to 'datapath')
-rw-r--r--  datapath/Modules.mk                             2
-rw-r--r--  datapath/actions.c                             76
-rw-r--r--  datapath/datapath.c                            32
-rw-r--r--  datapath/datapath.h                             4
-rw-r--r--  datapath/elephant.c                           517
-rw-r--r--  datapath/elephant.h                            54
-rw-r--r--  datapath/flow_netlink.c                       146
-rw-r--r--  datapath/linux/.gitignore                       1
-rw-r--r--  datapath/linux/compat/include/linux/kernel.h    6
9 files changed, 835 insertions, 3 deletions
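
The action this branch adds is a nested OVS_ACTION_ATTR_ELEPHANT attribute carrying the detection mechanism, its two arguments, an optional DSCP value used to remark detected elephants, and a nested list of actions to execute for them (see validate_and_copy_elephant() in flow_netlink.c below). As a rough sketch of how userspace could encode it, assuming the OVS_ELEPHANT_ATTR_* numbering this branch adds to openvswitch.h and the existing nl_msg_*() and ofpbuf helpers from lib/; the helper name and parameters below are illustrative, not part of the patch:

#include <stddef.h>
#include <stdint.h>

#include <linux/openvswitch.h>

#include "netlink.h"
#include "ofpbuf.h"

/* Illustrative helper, not from the patch: append one elephant action to a
 * buffer of serialized datapath actions.  'inner' holds already-serialized
 * actions to run on packets of flows classified as elephants. */
static void
put_elephant_action(struct ofpbuf *odp_actions, uint32_t mech, uint32_t arg1,
                    uint32_t arg2, uint8_t dscp,
                    const void *inner, size_t inner_len)
{
    size_t start = nl_msg_start_nested(odp_actions, OVS_ACTION_ATTR_ELEPHANT);
    size_t acts;

    nl_msg_put_u32(odp_actions, OVS_ELEPHANT_ATTR_DETECT_MECH, mech);
    nl_msg_put_u32(odp_actions, OVS_ELEPHANT_ATTR_DETECT_ARG1, arg1);
    nl_msg_put_u32(odp_actions, OVS_ELEPHANT_ATTR_DETECT_ARG2, arg2);
    nl_msg_put_u8(odp_actions, OVS_ELEPHANT_ATTR_DETECT_DSCP, dscp);

    /* validate_and_copy_elephant() requires all five attributes; the nested
     * action list may be empty. */
    acts = nl_msg_start_nested(odp_actions, OVS_ELEPHANT_ATTR_ACTIONS);
    ofpbuf_put(odp_actions, inner, inner_len);
    nl_msg_end_nested(odp_actions, acts);

    nl_msg_end_nested(odp_actions, start);
}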
diff --git a/datapath/Modules.mk b/datapath/Modules.mk
index 90e158cd2..a4b4d4644 100644
--- a/datapath/Modules.mk
+++ b/datapath/Modules.mk
@@ -10,6 +10,7 @@ openvswitch_sources = \
actions.c \
datapath.c \
dp_notify.c \
+ elephant.c \
flow.c \
flow_netlink.c \
flow_table.c \
@@ -24,6 +25,7 @@ openvswitch_sources = \
openvswitch_headers = \
compat.h \
datapath.h \
+ elephant.h \
flow.h \
flow_netlink.h \
flow_table.h \
diff --git a/datapath/actions.c b/datapath/actions.c
index 39a21f4ab..2d18c6616 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -37,6 +37,7 @@
#include "datapath.h"
#include "gso.h"
#include "mpls.h"
+#include "elephant.h"
#include "vlan.h"
#include "vport.h"
@@ -583,6 +584,75 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
return do_execute_actions(dp, sample_skb, a, rem);
}
+static int elephant(struct datapath *dp, struct sk_buff *skb,
+ const struct nlattr *attr)
+{
+ struct sw_flow_key *key = OVS_CB(skb)->pkt_key;
+ uint32_t mech=0, arg1=0, arg2=0;
+ uint8_t dscp = U8_MAX;
+ const struct nlattr *acts_list = NULL;
+ const struct nlattr *a;
+ int rem;
+
+ /* We only process IP packets. */
+ if (key->eth.type != htons(ETH_P_IP) &&
+ key->eth.type != htons(ETH_P_IPV6))
+ return 0;
+
+ for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+ a = nla_next(a, &rem)) {
+ switch (nla_type(a)) {
+ case OVS_ELEPHANT_ATTR_DETECT_MECH:
+ mech = nla_get_u32(a);
+ break;
+
+ case OVS_ELEPHANT_ATTR_DETECT_ARG1:
+ arg1 = nla_get_u32(a);
+ break;
+
+ case OVS_ELEPHANT_ATTR_DETECT_ARG2:
+ arg2 = nla_get_u32(a);
+ break;
+
+ case OVS_ELEPHANT_ATTR_DETECT_DSCP:
+ dscp = nla_get_u8(a);
+ break;
+
+ case OVS_ELEPHANT_ATTR_ACTIONS:
+ acts_list = a;
+ break;
+ }
+ }
+
+ if (!is_elephant(skb, mech, arg1, arg2))
+ return 0;
+
+ if (dscp != U8_MAX) {
+ struct iphdr *nh = ip_hdr(skb);
+ int err;
+
+ err = make_writable(skb, skb_network_offset(skb) +
+ sizeof(struct iphdr));
+ if (unlikely(err))
+ return err;
+
+ ipv4_change_dsfield(nh, 0x03, dscp<<2);
+ }
+
+ /* xxx We need to make sure that only "set" or userspace actions are
+ * xxx provided in the verification code. */
+
+ /* The only action with a side-effect that is allowed is the "set"
+ * action. Since the do_execute_actions() never consumes 'skb', a
+ * skb_get(skb) call prevents consumption by do_execute_actions().
+ * Thus, it is safe to simply return the error code and let the
+ * caller (also do_execute_actions()) free skb on error. */
+ skb_get(skb);
+
+ return do_execute_actions(dp, skb, nla_data(acts_list),
+ nla_len(acts_list));
+}
+
static void execute_hash(struct sk_buff *skb, const struct nlattr *attr)
{
struct sw_flow_key *key = OVS_CB(skb)->pkt_key;
@@ -751,6 +821,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
case OVS_ACTION_ATTR_SAMPLE:
err = sample(dp, skb, a);
break;
+
+ case OVS_ACTION_ATTR_ELEPHANT:
+ err = elephant(dp, skb, a);
+ if (unlikely(err)) /* skb already freed. */
+ return err;
+ break;
}
if (unlikely(err)) {
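
A note on the DSCP handling in elephant() above: U8_MAX serves as the "no remark requested" sentinel, and when a remark is requested, ipv4_change_dsfield(nh, 0x03, dscp << 2) keeps the two ECN bits (mask 0x03) and writes the 6-bit codepoint into the upper bits of the IPv4 ToS field, fixing up the checksum as it goes. The arithmetic in isolation, as a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Same computation as ipv4_change_dsfield(nh, 0x03, dscp << 2), minus the
 * checksum update: keep the ECN bits, replace the DSCP bits. */
static uint8_t remark_dscp(uint8_t old_tos, uint8_t dscp)
{
    return (uint8_t)((old_tos & 0x03) | (dscp << 2));
}

int main(void)
{
    /* Remark to EF (DSCP 46) while preserving ECT(1) in the low bits. */
    printf("0x%02x\n", remark_dscp(0x01, 46));   /* prints 0xb9 */
    return 0;
}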
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 94539ebff..cbe36eff8 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -55,6 +55,7 @@
#include <net/netns/generic.h>
#include "datapath.h"
+#include "elephant.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
@@ -198,6 +199,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
struct datapath *dp = container_of(rcu, struct datapath, rcu);
ovs_flow_tbl_destroy(&dp->table);
+ ovs_elephant_tbl_destroy(dp->elephant_table);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
@@ -1460,10 +1462,20 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_free_dp;
+ /* Allocate elephant table. */
+ rcu_assign_pointer(dp->elephant_table,
+ ovs_elephant_tbl_alloc(ELEPHANT_TBL_MIN_BUCKETS));
+ if (!dp->elephant_table)
+ goto err_destroy_table;
+
+ err = ovs_elephant_dp_init(dp);
+ if (err)
+ goto err_destroy_elephant_table;
+
dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
if (!dp->stats_percpu) {
err = -ENOMEM;
- goto err_destroy_table;
+ goto err_elephant_dp_exit;
}
for_each_possible_cpu(i) {
@@ -1530,6 +1542,10 @@ err_destroy_ports_array:
kfree(dp->ports);
err_destroy_percpu:
free_percpu(dp->stats_percpu);
+err_elephant_dp_exit:
+ ovs_elephant_dp_exit(dp);
+err_destroy_elephant_table:
+ ovs_elephant_tbl_destroy(dp->elephant_table);
err_destroy_table:
ovs_flow_tbl_destroy(&dp->table);
err_free_dp:
@@ -1562,6 +1578,8 @@ static void __dp_destroy(struct datapath *dp)
*/
ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
+ ovs_elephant_dp_exit(dp);
+
/* RCU destroy the flow table */
call_rcu(&dp->rcu, destroy_dp_rcu);
}
@@ -1673,6 +1691,9 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->nlh->nlmsg_seq, NLM_F_MULTI,
OVS_DP_CMD_NEW) < 0)
break;
+
+ ovs_elephant_print_flows(dp);
+
i++;
}
rcu_read_unlock();
@@ -2189,10 +2210,14 @@ static int __init dp_init(void)
if (err)
goto error_flow_exit;
- err = register_pernet_device(&ovs_net_ops);
+ err = ovs_elephant_init();
if (err)
goto error_vport_exit;
+ err = register_pernet_device(&ovs_net_ops);
+ if (err)
+ goto error_elephant_exit;
+
err = register_netdevice_notifier(&ovs_dp_device_notifier);
if (err)
goto error_netns_exit;
@@ -2207,6 +2232,8 @@ error_unreg_notifier:
unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_netns_exit:
unregister_pernet_device(&ovs_net_ops);
+error_elephant_exit:
+ ovs_elephant_exit();
error_vport_exit:
ovs_vport_exit();
error_flow_exit:
@@ -2221,6 +2248,7 @@ static void dp_cleanup(void)
unregister_netdevice_notifier(&ovs_dp_device_notifier);
unregister_pernet_device(&ovs_net_ops);
rcu_barrier();
+ ovs_elephant_exit();
ovs_vport_exit();
ovs_flow_exit();
}
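
The datapath.c changes extend the existing error-unwind convention in ovs_dp_cmd_new() and dp_init(): every resource that initializes successfully gets a label, and a later failure jumps to the label that tears down everything acquired so far, in reverse order, which is why the stats_percpu failure path now targets err_elephant_dp_exit instead of err_destroy_table. A stripped-down, self-contained illustration of the pattern (all names here are illustrative):

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
    void *flow_table, *elephant_table, *stats;

    flow_table = malloc(64);
    if (!flow_table)
        goto err;

    elephant_table = malloc(64);            /* step added by this patch */
    if (!elephant_table)
        goto err_destroy_flow_table;

    stats = malloc(64);
    if (!stats)
        goto err_destroy_elephant_table;    /* unwind now includes it */

    puts("initialized");
    free(stats);
    free(elephant_table);
    free(flow_table);
    return 0;

err_destroy_elephant_table:
    free(elephant_table);
err_destroy_flow_table:
    free(flow_table);
err:
    return -1;
}

int main(void)
{
    return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}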
diff --git a/datapath/datapath.h b/datapath/datapath.h
index d6dee50ad..82377e8d1 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -27,6 +27,7 @@
#include <linux/u64_stats_sync.h>
#include "compat.h"
+#include "elephant.h"
#include "flow.h"
#include "flow_table.h"
#include "vlan.h"
@@ -91,6 +92,9 @@ struct datapath {
struct net *net;
#endif
+ /* Elephant flow table. */
+ struct elephant_table __rcu *elephant_table;
+
u32 user_features;
};
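
The elephant_table pointer is annotated __rcu, so readers are expected to hold rcu_read_lock() and access it through rcu_dereference() (or ovsl_dereference() while holding ovs_lock); the direct dp->elephant_table loads in elephant.c skip that, which is part of the "double-check locking" item in the commit message. A minimal reader-side sketch, not code from the patch:

#include <linux/rcupdate.h>

#include "datapath.h"
#include "elephant.h"

/* Illustrative only: read one field of the table from RCU reader context. */
static unsigned int elephant_table_entries(struct datapath *dp)
{
    struct elephant_table *table;
    unsigned int count = 0;

    rcu_read_lock();
    table = rcu_dereference(dp->elephant_table);
    if (table)
        count = table->count;   /* only dereference 'table' inside the
                                 * read-side critical section */
    rcu_read_unlock();

    return count;
}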
diff --git a/datapath/elephant.c b/datapath/elephant.c
new file mode 100644
index 000000000..4a1724d5e
--- /dev/null
+++ b/datapath/elephant.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "datapath.h"
+#include "elephant.h"
+#include "flow.h"
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/workqueue.h>
+
+struct elephant_flow {
+ struct rcu_head rcu;
+ struct hlist_node hash_node[2];
+ u32 hash;
+
+ struct sw_flow_key key;
+
+ spinlock_t lock; /* Lock for values below. */
+ unsigned long created; /* Time created (in jiffies). */
+ unsigned long used; /* Last used time (in jiffies). */
+ u64 packet_count; /* Number of packets matched. */
+ u64 byte_count; /* Number of bytes matched. */
+ u64 tso_count; /* Number of TSO-sized packets. */
+};
+
+#define ELEPHANT_CHECK_INTERVAL (1 * HZ)
+#define ELEPHANT_FLOW_LIFE (5 * HZ)
+static void elephant_check_table(struct work_struct *work);
+
+static struct kmem_cache *elephant_table;
+
+static void ovs_elephant_tbl_insert(struct elephant_table *table,
+ struct elephant_flow *flow, struct sw_flow_key *key, int key_len);
+static void ovs_elephant_tbl_remove(struct elephant_table *table,
+ struct elephant_flow *flow);
+
+static struct elephant_flow *ovs_elephant_tbl_lookup(struct elephant_table *table,
+ struct sw_flow_key *key, int key_len);
+
+
+void ovs_elephant_free(struct elephant_flow *flow);
+
+static inline int ovs_elephant_tbl_need_to_expand(struct elephant_table *table)
+{
+ return (table->count > table->n_buckets);
+}
+
+static struct hlist_head *find_bucket(struct elephant_table *table, u32 hash)
+{
+ hash = jhash_1word(hash, table->hash_seed);
+ return flex_array_get(table->buckets,
+ (hash & (table->n_buckets - 1)));
+}
+
+static struct flex_array *alloc_buckets(unsigned int n_buckets)
+{
+ struct flex_array *buckets;
+ int i, err;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head *),
+ n_buckets, GFP_ATOMIC);
+ if (!buckets)
+ return NULL;
+
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_ATOMIC);
+ if (err) {
+ flex_array_free(buckets);
+ return NULL;
+ }
+
+ for (i = 0; i < n_buckets; i++)
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
+
+ return buckets;
+}
+
+static void free_buckets(struct flex_array *buckets)
+{
+ flex_array_free(buckets);
+}
+
+struct elephant_table *ovs_elephant_tbl_alloc(int new_size)
+{
+ struct elephant_table *table = kmalloc(sizeof(*table), GFP_ATOMIC);
+
+ if (!table)
+ return NULL;
+
+ table->buckets = alloc_buckets(new_size);
+
+ if (!table->buckets) {
+ kfree(table);
+ return NULL;
+ }
+ table->n_buckets = new_size;
+ table->count = 0;
+ table->node_ver = 0;
+ get_random_bytes(&table->hash_seed, sizeof(u32));
+
+ return table;
+}
+
+void ovs_elephant_tbl_destroy(struct elephant_table *table)
+{
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->n_buckets; i++) {
+ struct elephant_flow *flow;
+ struct hlist_head *head = flex_array_get(table->buckets, i);
+ struct hlist_node *n;
+ int ver = table->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+ hlist_del_rcu(&flow->hash_node[ver]);
+ ovs_elephant_free(flow);
+ }
+ }
+
+ free_buckets(table->buckets);
+ kfree(table);
+}
+
+static void elephant_tbl_destroy_rcu_cb(struct rcu_head *rcu)
+{
+ struct elephant_table *table = container_of(rcu, struct elephant_table, rcu);
+
+ ovs_elephant_tbl_destroy(table);
+}
+
+void ovs_elephant_tbl_deferred_destroy(struct elephant_table *table)
+{
+ if (!table)
+ return;
+
+ call_rcu(&table->rcu, elephant_tbl_destroy_rcu_cb);
+}
+
+struct elephant_flow *ovs_elephant_tbl_next(struct elephant_table *table, u32 *bucket, u32 *last)
+{
+ struct elephant_flow *flow;
+ struct hlist_head *head;
+ int ver;
+ int i;
+
+ ver = table->node_ver;
+ while (*bucket < table->n_buckets) {
+ i = 0;
+ head = flex_array_get(table->buckets, *bucket);
+ hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
+ if (i < *last) {
+ i++;
+ continue;
+ }
+ *last = i + 1;
+ return flow;
+ }
+ (*bucket)++;
+ *last = 0;
+ }
+
+ return NULL;
+}
+
+static void __elephant_tbl_insert(struct elephant_table *table, struct elephant_flow *flow)
+{
+ struct hlist_head *head;
+ head = find_bucket(table, flow->hash);
+ hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
+ table->count++;
+}
+
+static void elephant_table_copy_flows(struct elephant_table *old, struct elephant_table *new)
+{
+ int old_ver;
+ int i;
+
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
+
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
+ struct elephant_flow *flow;
+ struct hlist_head *head;
+
+ head = flex_array_get(old->buckets, i);
+
+ hlist_for_each_entry(flow, head, hash_node[old_ver])
+ __elephant_tbl_insert(new, flow);
+ }
+}
+
+static struct elephant_table *__elephant_tbl_rehash(struct elephant_table *table, int n_buckets)
+{
+ struct elephant_table *new_table;
+
+ new_table = ovs_elephant_tbl_alloc(n_buckets);
+ if (!new_table)
+ return ERR_PTR(-ENOMEM);
+
+ elephant_table_copy_flows(table, new_table);
+
+ return new_table;
+}
+
+struct elephant_table *ovs_elephant_tbl_rehash(struct elephant_table *table)
+{
+ return __elephant_tbl_rehash(table, table->n_buckets);
+}
+
+struct elephant_table *ovs_elephant_tbl_expand(struct elephant_table *table)
+{
+ return __elephant_tbl_rehash(table, table->n_buckets * 2);
+}
+
+void ovs_elephant_free(struct elephant_flow *flow)
+{
+ if (unlikely(!flow))
+ return;
+
+ kmem_cache_free(elephant_table, flow);
+}
+
+/* RCU callback used by ovs_elephant_flow_deferred_free. */
+static void rcu_free_elephant_flow_callback(struct rcu_head *rcu)
+{
+ struct elephant_flow *flow = container_of(rcu, struct elephant_flow, rcu);
+
+ ovs_elephant_free(flow);
+}
+
+/* Schedules 'flow' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_elephant_flow_deferred_free(struct elephant_flow *flow)
+{
+/* xxx Still need this? */
+ call_rcu(&flow->rcu, rcu_free_elephant_flow_callback);
+}
+
+static u32 ovs_elephant_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
+{
+ return jhash2((u32 *)((u8 *)key + key_start),
+ DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
+}
+
+static int flow_key_start(struct sw_flow_key *key)
+{
+ if (key->tun_key.ipv4_dst)
+ return 0;
+ else
+ return offsetof(struct sw_flow_key, phy);
+}
+
+static struct elephant_flow *ovs_elephant_tbl_lookup(struct elephant_table *table,
+ struct sw_flow_key *key, int key_len)
+{
+ struct elephant_flow *flow;
+ struct hlist_head *head;
+ u8 *_key;
+ int key_start;
+ u32 hash;
+
+ key_start = flow_key_start(key);
+ hash = ovs_elephant_flow_hash(key, key_start, key_len);
+
+ _key = (u8 *) key + key_start;
+ head = find_bucket(table, hash);
+ hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
+ if (flow->hash == hash &&
+ !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) {
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+static void ovs_elephant_tbl_insert(struct elephant_table *table,
+ struct elephant_flow *flow, struct sw_flow_key *key, int key_len)
+{
+ flow->hash = ovs_elephant_flow_hash(key, flow_key_start(key), key_len);
+ memcpy(&flow->key, key, sizeof(flow->key));
+ __elephant_tbl_insert(table, flow);
+}
+
+static void ovs_elephant_tbl_remove(struct elephant_table *table,
+ struct elephant_flow *flow)
+{
+ hlist_del_rcu(&flow->hash_node[table->node_ver]);
+ table->count--;
+ BUG_ON(table->count < 0);
+}
+
+static void elephant_check_table(struct work_struct *ws)
+{
+ struct elephant_table *table;
+ int i;
+
+ table = container_of(ws, struct elephant_table, work.work);
+
+ for (i = 0; i < table->n_buckets; i++) {
+ struct elephant_flow *flow;
+ struct hlist_head *head = flex_array_get(table->buckets, i);
+ struct hlist_node *n;
+ int ver = table->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+ if (time_after(jiffies, flow->used + ELEPHANT_FLOW_LIFE)) {
+ ovs_elephant_tbl_remove(table, flow);
+ ovs_elephant_flow_deferred_free(flow);
+ }
+ }
+ }
+
+ schedule_delayed_work(&table->work, ELEPHANT_CHECK_INTERVAL);
+}
+
+int ovs_elephant_dp_init(struct datapath *dp)
+{
+ INIT_DELAYED_WORK(&dp->elephant_table->work, elephant_check_table);
+ schedule_delayed_work(&dp->elephant_table->work, ELEPHANT_CHECK_INTERVAL);
+
+ return 0;
+}
+
+void ovs_elephant_dp_exit(struct datapath *dp)
+{
+ cancel_delayed_work_sync(&dp->elephant_table->work);
+}
+
+static struct elephant_flow *ovs_elephant_flow_alloc(void)
+{
+ struct elephant_flow *flow;
+
+ flow = kmem_cache_alloc(elephant_table, GFP_ATOMIC);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&flow->lock);
+
+ return flow;
+}
+
+static void clear_stats(struct elephant_flow *flow)
+{
+ flow->created = jiffies;
+ flow->used = 0;
+ flow->packet_count = 0;
+ flow->byte_count = 0;
+ flow->tso_count = 0;
+}
+
+static void print_flow(struct elephant_flow *flow)
+{
+ /* xxx Only supports non-tunneled IPv4! */
+ printk("in_port(%d),ipv4(src=%#x,dst=%#x,proto=%d),tp(src=%d,dst=%d),"
+ " packets:%lld, bytes:%lld, tso:%lld, created:%d, used:%d\n",
+ flow->key.phy.in_port, ntohl(flow->key.ipv4.addr.src),
+ ntohl(flow->key.ipv4.addr.dst),
+ flow->key.ip.proto, ntohs(flow->key.tp.src),
+ ntohs(flow->key.tp.dst),
+ flow->packet_count, flow->byte_count, flow->tso_count,
+ jiffies_to_msecs(jiffies - flow->created),
+ jiffies_to_msecs(jiffies - flow->used));
+}
+
+void ovs_elephant_print_flows(struct datapath *dp)
+{
+ struct elephant_table *table = dp->elephant_table;
+ int i;
+
+ printk("--- Elephant Flows ---\n");
+ for (i = 0; i < table->n_buckets; i++) {
+ struct elephant_flow *flow;
+ struct hlist_head *head = flex_array_get(table->buckets, i);
+ int ver = table->node_ver;
+
+ hlist_for_each_entry(flow, head, hash_node[ver]) {
+ print_flow(flow);
+ }
+ }
+}
+
+void ovs_elephant_used(struct elephant_flow *flow, const struct sk_buff *skb,
+ bool is_tso)
+{
+/* xxx Is the spin lock safe? */
+ spin_lock(&flow->lock);
+ flow->used = jiffies;
+ flow->packet_count++;
+ flow->byte_count += skb->len;
+ if (is_tso)
+ flow->tso_count++;
+ spin_unlock(&flow->lock);
+}
+
+static bool byte_check(const struct elephant_flow *flow,
+ uint32_t byte_count, uint32_t num_secs)
+
+{
+ if ((flow->byte_count >= byte_count) &&
+ time_after(jiffies, flow->created + HZ * num_secs)) {
+ return true;
+ } else
+ return false;
+}
+
+static bool tso_check(const struct elephant_flow *flow,
+ uint32_t tso_size, uint32_t tso_count)
+
+{
+ if (flow->tso_count >= tso_count) {
+ return true;
+ } else
+ return false;
+}
+
+bool is_elephant(const struct sk_buff *skb, uint32_t mech,
+ uint32_t arg1, uint32_t arg2)
+{
+ struct elephant_table *table;
+ struct sw_flow_key *key = OVS_CB(skb)->pkt_key;
+ const struct vport *p = OVS_CB(skb)->input_vport;
+ struct datapath *dp = p->dp;
+ struct sw_flow_key elephant_key;
+ struct elephant_flow *flow;
+
+ if (mech == 0) {
+ /* Detection disabled */
+ return false;
+ }
+
+ /* Make a copy, since we need to zero-out the TCP flags */
+ elephant_key = *key;
+ elephant_key.tp.flags = 0;
+
+/* xxx How should I do the locking here? */
+ table = dp->elephant_table;
+ flow = ovs_elephant_tbl_lookup(table, &elephant_key, sizeof(elephant_key));
+ if (!flow) {
+ /* Expand table, if necessary, to make room. */
+ if (ovs_elephant_tbl_need_to_expand(table)) {
+ struct elephant_table *new_table;
+
+ new_table = ovs_elephant_tbl_expand(table);
+ if (!IS_ERR(new_table)) {
+ rcu_assign_pointer(dp->elephant_table, new_table);
+ ovs_elephant_tbl_deferred_destroy(table);
+ table = dp->elephant_table;
+ }
+ }
+
+ /* Allocate flow. */
+ flow = ovs_elephant_flow_alloc();
+ if (IS_ERR(flow)) {
+ /* xxx Not the greatest error handling. */
+ return false;
+ }
+ clear_stats(flow);
+
+ /* Put flow in bucket. */
+ ovs_elephant_tbl_insert(table, flow, &elephant_key,
+ sizeof(elephant_key));
+ }
+
+ if ((mech == 2) && (skb->len >= arg1))
+ ovs_elephant_used(flow, skb, true);
+ else
+ ovs_elephant_used(flow, skb, false);
+
+ if (mech == 1) {
+ /* Byte counters */
+ return byte_check(flow, arg1, arg2);
+ } else if (mech == 2) {
+ /* TSO buffers */
+ return tso_check(flow, arg1, arg2);
+ }
+
+ return false;
+}
+
+/* Initializes the elephant module. */
+int ovs_elephant_init(void)
+{
+ elephant_table = kmem_cache_create("sw_elephant", sizeof(struct sw_flow),
+ 0, 0, NULL);
+ if (elephant_table == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Uninitializes the elephant module. */
+void ovs_elephant_exit(void)
+{
+ kmem_cache_destroy(elephant_table);
+}
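
To restate the knobs that is_elephant() implements: mech 0 disables detection; mech 1 declares a flow an elephant once it has carried at least arg1 bytes and is more than arg2 seconds old (byte_check()); mech 2 counts every packet of at least arg1 bytes as TSO-sized and triggers once arg2 such packets have been seen (tso_check()). A plain-C restatement with the kernel types and jiffies arithmetic stripped out, purely to make the argument roles explicit:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flow_stats {
    uint64_t byte_count;   /* bytes seen so far */
    uint64_t tso_count;    /* packets of at least arg1 bytes */
    double   age_secs;     /* seconds since the flow was created */
};

/* mech 1: arg1 = byte threshold, arg2 = minimum flow age in seconds. */
static bool byte_check(const struct flow_stats *fs,
                       uint32_t byte_count, uint32_t num_secs)
{
    return fs->byte_count >= byte_count && fs->age_secs > num_secs;
}

/* mech 2: arg2 = how many TSO-sized packets make an elephant
 * (arg1 only decides which packets count as TSO-sized). */
static bool tso_check(const struct flow_stats *fs, uint32_t tso_count)
{
    return fs->tso_count >= tso_count;
}

int main(void)
{
    struct flow_stats fs = { .byte_count = 10u << 20, .tso_count = 12,
                             .age_secs = 3.0 };

    printf("mech 1, 1 MB over 2 s: %d\n", byte_check(&fs, 1u << 20, 2));
    printf("mech 2, 10 TSO-sized packets: %d\n", tso_check(&fs, 10));
    return 0;
}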
diff --git a/datapath/elephant.h b/datapath/elephant.h
new file mode 100644
index 000000000..471ff773b
--- /dev/null
+++ b/datapath/elephant.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef ELEPHANT_H
+#define ELEPHANT_H 1
+
+#include <linux/flex_array.h>
+#include <linux/skbuff.h>
+
+#include "flow.h"
+
+#define ELEPHANT_TBL_MIN_BUCKETS 1024
+
+struct datapath;
+
+struct elephant_table {
+ /* xxx Need all these? */
+ struct flex_array *buckets;
+ unsigned int count, n_buckets;
+ struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ struct delayed_work work;
+};
+
+int ovs_elephant_dp_init(struct datapath *);
+void ovs_elephant_dp_exit(struct datapath *);
+
+struct elephant_table *ovs_elephant_tbl_alloc(int new_size);
+void ovs_elephant_tbl_destroy(struct elephant_table *);
+
+void ovs_elephant_print_flows(struct datapath *dp);
+bool is_elephant(const struct sk_buff *, uint32_t mech, uint32_t arg1,
+ uint32_t arg2);
+
+int ovs_elephant_init(void);
+void ovs_elephant_exit(void);
+
+#endif /* elephant.h */
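
One property worth noting about the sizes declared here: ELEPHANT_TBL_MIN_BUCKETS is a power of two and ovs_elephant_tbl_expand() only ever doubles n_buckets, which is what lets find_bucket() in elephant.c reduce the jhash result with hash & (n_buckets - 1) rather than a modulo. A trivial standalone check of that equivalence:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t n_buckets = 1024;    /* ELEPHANT_TBL_MIN_BUCKETS */
    uint32_t hash;

    for (hash = 0; hash < 100000; hash++)
        assert((hash & (n_buckets - 1)) == hash % n_buckets);
    return 0;
}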
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c
index e1eadbbb2..4593103d8 100644
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -1433,6 +1433,81 @@ static int validate_and_copy_sample(const struct nlattr *attr,
return 0;
}
+static int validate_and_copy_elephant(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci)
+{
+ const struct nlattr *attrs[OVS_ELEPHANT_ATTR_MAX + 1];
+ const struct nlattr *mech, *arg1, *arg2, *dscp, *actions;
+ const struct nlattr *a;
+ int rem, start, err, st_acts;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ if (!type || type > OVS_ELEPHANT_ATTR_MAX || attrs[type])
+ return -EINVAL;
+ attrs[type] = a;
+ }
+ if (rem)
+ return -EINVAL;
+
+ mech = attrs[OVS_ELEPHANT_ATTR_DETECT_MECH];
+ if (!mech || nla_len(mech) != sizeof(u32))
+ return -EINVAL;
+
+ arg1 = attrs[OVS_ELEPHANT_ATTR_DETECT_ARG1];
+ if (!arg1 || nla_len(arg1) != sizeof(u32))
+ return -EINVAL;
+
+ arg2 = attrs[OVS_ELEPHANT_ATTR_DETECT_ARG2];
+ if (!arg2 || nla_len(arg2) != sizeof(u32))
+ return -EINVAL;
+
+ dscp = attrs[OVS_ELEPHANT_ATTR_DETECT_DSCP];
+ if (!dscp || nla_len(dscp) != sizeof(u8))
+ return -EINVAL;
+
+ actions = attrs[OVS_ELEPHANT_ATTR_ACTIONS];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
+
+ /* validation done, copy elephant action. */
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_ELEPHANT);
+ if (start < 0)
+ return start;
+ err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_MECH,
+ nla_data(mech), sizeof(u32));
+ if (err)
+ return err;
+ err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_ARG1,
+ nla_data(arg1), sizeof(u32));
+ if (err)
+ return err;
+ err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_ARG2,
+ nla_data(arg2), sizeof(u32));
+ if (err)
+ return err;
+ err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_DSCP,
+ nla_data(dscp), sizeof(u8));
+ if (err)
+ return err;
+ st_acts = add_nested_action_start(sfa, OVS_ELEPHANT_ATTR_ACTIONS);
+ if (st_acts < 0)
+ return st_acts;
+
+ err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa,
+ eth_type, vlan_tci);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, st_acts);
+ add_nested_action_end(*sfa, start);
+
+ return 0;
+}
+
static int validate_tp_port(const struct sw_flow_key *flow_key,
__be16 eth_type)
{
@@ -1670,6 +1745,7 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr,
const struct nlattr *a;
int rem, err;
+ /* xxx What do we need to do for elephants? */
if (depth >= SAMPLE_ACTION_DEPTH)
return -EOVERFLOW;
@@ -1685,7 +1761,8 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr,
[OVS_ACTION_ATTR_POP_VLAN] = 0,
[OVS_ACTION_ATTR_SET] = (u32)-1,
[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
- [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash)
+ [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
+ [OVS_ACTION_ATTR_ELEPHANT] = (u32)-1
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -1791,6 +1868,14 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr,
skip_copy = true;
break;
+ case OVS_ACTION_ATTR_ELEPHANT:
+ err = validate_and_copy_elephant(a, key, depth, sfa,
+ eth_type, vlan_tci);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
default:
return -EINVAL;
}
@@ -1851,6 +1936,58 @@ static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
return err;
}
+static int elephant_action_to_attr(const struct nlattr *attr,
+ struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ struct nlattr *start;
+ int err = 0, rem;
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_ELEPHANT);
+ if (!start)
+ return -EMSGSIZE;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ struct nlattr *st_elephant;
+
+ switch (type) {
+ case OVS_ELEPHANT_ATTR_DETECT_MECH:
+ if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_MECH,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_ELEPHANT_ATTR_DETECT_ARG1:
+ if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_ARG1,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_ELEPHANT_ATTR_DETECT_ARG2:
+ if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_ARG2,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_ELEPHANT_ATTR_DETECT_DSCP:
+ if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_DSCP,
+ sizeof(u8), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_ELEPHANT_ATTR_ACTIONS:
+ st_elephant = nla_nest_start(skb, OVS_ELEPHANT_ATTR_ACTIONS);
+ if (!st_elephant)
+ return -EMSGSIZE;
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ return err;
+ nla_nest_end(skb, st_elephant);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return err;
+}
+
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -1904,6 +2041,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
if (err)
return err;
break;
+
+ case OVS_ACTION_ATTR_ELEPHANT:
+ err = elephant_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
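
Both validate_and_copy_elephant() and elephant_action_to_attr() above walk plain netlink TLVs: each attribute is a struct nlattr header (a 16-bit length that includes the header, then a 16-bit type) followed by its payload, padded to a 4-byte boundary, and OVS_ELEPHANT_ATTR_ACTIONS is just more TLVs nested inside one attribute. A small standalone program that builds and walks such a buffer with the definitions from <linux/netlink.h>; the attribute type numbers are placeholders, not the real OVS_ELEPHANT_ATTR_* values:

#include <linux/netlink.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static unsigned char buf[256];
static size_t used;

/* Append one attribute: header, payload, then pad to a 4-byte boundary. */
static void put_attr(uint16_t type, const void *data, uint16_t len)
{
    struct nlattr nla = { .nla_len = NLA_HDRLEN + len, .nla_type = type };

    memcpy(buf + used, &nla, sizeof nla);
    memcpy(buf + used + NLA_HDRLEN, data, len);
    used += NLA_ALIGN(nla.nla_len);     /* nla_len itself excludes padding */
}

int main(void)
{
    uint32_t mech = 1, arg1 = 1u << 20, arg2 = 2;
    uint8_t dscp = 46;
    size_t off = 0;

    put_attr(1 /* ..._DETECT_MECH */, &mech, sizeof mech);
    put_attr(2 /* ..._DETECT_ARG1 */, &arg1, sizeof arg1);
    put_attr(3 /* ..._DETECT_ARG2 */, &arg2, sizeof arg2);
    put_attr(4 /* ..._DETECT_DSCP */, &dscp, sizeof dscp);

    /* Walk the buffer the same way nla_for_each_nested() does. */
    while (off + NLA_HDRLEN <= used) {
        struct nlattr nla;

        memcpy(&nla, buf + off, sizeof nla);
        printf("type %u, payload %u bytes\n", (unsigned)nla.nla_type,
               (unsigned)(nla.nla_len - NLA_HDRLEN));
        off += NLA_ALIGN(nla.nla_len);
    }
    return 0;
}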
diff --git a/datapath/linux/.gitignore b/datapath/linux/.gitignore
index be233fcc3..6c14295ed 100644
--- a/datapath/linux/.gitignore
+++ b/datapath/linux/.gitignore
@@ -11,6 +11,7 @@
/datapath.c
/dp_dev.c
/dp_notify.c
+/elephant.c
/exthdrs_core.c
/flex_array.c
/flow.c
diff --git a/datapath/linux/compat/include/linux/kernel.h b/datapath/linux/compat/include/linux/kernel.h
index 5dfe08e91..bbb04f18f 100644
--- a/datapath/linux/compat/include/linux/kernel.h
+++ b/datapath/linux/compat/include/linux/kernel.h
@@ -46,6 +46,12 @@
#endif
+#ifndef U8_MAX
+#define U8_MAX ((u8)(~0U))
+#define S8_MAX ((s8)(U8_MAX>>1))
+#define S8_MIN ((s8)(-S8_MAX - 1))
+#endif
+
#ifndef USHRT_MAX
#define USHRT_MAX ((u16)(~0U))
#define SHRT_MAX ((s16)(USHRT_MAX>>1))
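
The U8_MAX block follows the same compat pattern as the existing USHRT_MAX block just below it: define the constant only when the kernel headers in use do not already provide it. elephant() in actions.c relies on U8_MAX as its "no DSCP remark requested" sentinel, since every valid DSCP codepoint fits in six bits. The same guard in plain C, for illustration:

#include <stdint.h>
#include <stdio.h>

#ifndef U8_MAX
#define U8_MAX ((uint8_t)(~0U))    /* 255 */
#endif

int main(void)
{
    uint8_t dscp = U8_MAX;                       /* "leave DSCP alone" */

    printf("U8_MAX = %u\n", (unsigned)U8_MAX);   /* prints 255 */
    printf("%s\n", dscp == U8_MAX ? "skip remark" : "remark");
    return 0;
}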