| author | Justin Pettit <jpettit@nicira.com> | 2014-04-28 14:25:06 -0700 |
|---|---|---|
| committer | Justin Pettit <jpettit@nicira.com> | 2014-07-25 12:05:20 -0700 |
| commit | 816f3bca9f5d23a0cf3b2ec922382f72b7b1b0d6 | |
| tree | d0c40aa9be6af9ab84fb2d0b93d8152787806861 /datapath | |
| parent | df0e5f55763289e37f90d1f2464423f07478f372 | |
| download | openvswitch-elephant.tar.gz | |
Initial check-in of kernel-based elephant flow detection.
Areas to work on:
- Doesn't populate "elephant-flows" field.
- Doesn't properly handle tunnels.
- Doesn't have a clean way to query the elephant table.
- Double-check locking.
- Should use names instead of numbers for the detection mechanism (see the sketch below).
- When changing the detection mechanism, should clear the old table.
- Breaks unit tests.
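
For context on the mechanism numbers above: in this patch, mechanism 1 classifies a flow as an elephant by byte count and flow age, and mechanism 2 by the number of TSO-sized packets seen. The following is a minimal, illustrative paraphrase of the `is_elephant()` logic in datapath/elephant.c — it is a sketch only (the real code keeps per-flow state in an RCU hash table that is aged out by delayed work), and the type and function names here are placeholders, not identifiers from the patch.

```c
#include <linux/jiffies.h>
#include <linux/types.h>

/* Hypothetical per-flow counters, standing in for struct elephant_flow. */
struct elephant_stats {
	unsigned long created;	/* jiffies when the flow was first seen */
	u64 byte_count;		/* bytes matched so far */
	u64 tso_count;		/* packets of at least 'arg1' bytes (mechanism 2 only) */
};

/* Paraphrase of is_elephant(); arg1/arg2 correspond to the
 * OVS_ELEPHANT_ATTR_DETECT_ARG1/ARG2 values carried by the action. */
static bool stats_are_elephant(const struct elephant_stats *s, u32 mech,
			       u32 arg1, u32 arg2)
{
	switch (mech) {
	case 0:		/* detection disabled */
		return false;
	case 1:		/* byte counters: arg1 = byte threshold, arg2 = minimum age in seconds */
		return s->byte_count >= arg1 &&
		       time_after(jiffies, s->created + arg2 * HZ);
	case 2:		/* TSO buffers: arg1 = packet size threshold, arg2 = packet count threshold */
		return s->tso_count >= arg2;
	default:
		return false;
	}
}
```

When a flow trips the check, the `elephant()` action in datapath/actions.c optionally rewrites the IPv4 DSCP field (OVS_ELEPHANT_ATTR_DETECT_DSCP) and then executes the nested OVS_ELEPHANT_ATTR_ACTIONS list.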
Diffstat (limited to 'datapath')
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | datapath/Modules.mk | 2 |
| -rw-r--r-- | datapath/actions.c | 76 |
| -rw-r--r-- | datapath/datapath.c | 32 |
| -rw-r--r-- | datapath/datapath.h | 4 |
| -rw-r--r-- | datapath/elephant.c | 517 |
| -rw-r--r-- | datapath/elephant.h | 54 |
| -rw-r--r-- | datapath/flow_netlink.c | 146 |
| -rw-r--r-- | datapath/linux/.gitignore | 1 |
| -rw-r--r-- | datapath/linux/compat/include/linux/kernel.h | 6 |
9 files changed, 835 insertions, 3 deletions
diff --git a/datapath/Modules.mk b/datapath/Modules.mk index 90e158cd2..a4b4d4644 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -10,6 +10,7 @@ openvswitch_sources = \ actions.c \ datapath.c \ dp_notify.c \ + elephant.c \ flow.c \ flow_netlink.c \ flow_table.c \ @@ -24,6 +25,7 @@ openvswitch_sources = \ openvswitch_headers = \ compat.h \ datapath.h \ + elephant.h \ flow.h \ flow_netlink.h \ flow_table.h \ diff --git a/datapath/actions.c b/datapath/actions.c index 39a21f4ab..2d18c6616 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -37,6 +37,7 @@ #include "datapath.h" #include "gso.h" #include "mpls.h" +#include "elephant.h" #include "vlan.h" #include "vport.h" @@ -583,6 +584,75 @@ static int sample(struct datapath *dp, struct sk_buff *skb, return do_execute_actions(dp, sample_skb, a, rem); } +static int elephant(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + struct sw_flow_key *key = OVS_CB(skb)->pkt_key; + uint32_t mech=0, arg1=0, arg2=0; + uint8_t dscp = U8_MAX; + const struct nlattr *acts_list = NULL; + const struct nlattr *a; + int rem; + + /* We only process IP packets. */ + if (key->eth.type != htons(ETH_P_IP) && + key->eth.type != htons(ETH_P_IPV6)) + return 0; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_ELEPHANT_ATTR_DETECT_MECH: + mech = nla_get_u32(a); + break; + + case OVS_ELEPHANT_ATTR_DETECT_ARG1: + arg1 = nla_get_u32(a); + break; + + case OVS_ELEPHANT_ATTR_DETECT_ARG2: + arg2 = nla_get_u32(a); + break; + + case OVS_ELEPHANT_ATTR_DETECT_DSCP: + dscp = nla_get_u8(a); + break; + + case OVS_ELEPHANT_ATTR_ACTIONS: + acts_list = a; + break; + } + } + + if (!is_elephant(skb, mech, arg1, arg2)) + return 0; + + if (dscp != U8_MAX) { + struct iphdr *nh = ip_hdr(skb); + int err; + + err = make_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + ipv4_change_dsfield(nh, 0x03, dscp<<2); + } + + /* xxx We need to make sure that only "set" or userspace actions are + * xxx provided in the verification code. */ + + /* The only action with a side-effect that is allowed is the "set" + * action. Since the do_execute_actions() never consumes 'skb', a + * skb_get(skb) call prevents consumption by do_execute_actions(). + * Thus, it is safe to simply return the error code and let the + * caller (also do_execute_actions()) free skb on error. */ + skb_get(skb); + + return do_execute_actions(dp, skb, nla_data(acts_list), + nla_len(acts_list)); +} + static void execute_hash(struct sk_buff *skb, const struct nlattr *attr) { struct sw_flow_key *key = OVS_CB(skb)->pkt_key; @@ -751,6 +821,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); break; + + case OVS_ACTION_ATTR_ELEPHANT: + err = elephant(dp, skb, a); + if (unlikely(err)) /* skb already freed. 
*/ + return err; + break; } if (unlikely(err)) { diff --git a/datapath/datapath.c b/datapath/datapath.c index 94539ebff..cbe36eff8 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -55,6 +55,7 @@ #include <net/netns/generic.h> #include "datapath.h" +#include "elephant.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" @@ -198,6 +199,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) struct datapath *dp = container_of(rcu, struct datapath, rcu); ovs_flow_tbl_destroy(&dp->table); + ovs_elephant_tbl_destroy(dp->elephant_table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -1460,10 +1462,20 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_free_dp; + /* Allocate elephant table. */ + rcu_assign_pointer(dp->elephant_table, + ovs_elephant_tbl_alloc(ELEPHANT_TBL_MIN_BUCKETS)); + if (!dp->elephant_table) + goto err_destroy_table; + + err = ovs_elephant_dp_init(dp); + if (err) + goto err_destroy_elephant_table; + dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); if (!dp->stats_percpu) { err = -ENOMEM; - goto err_destroy_table; + goto err_elephant_dp_exit; } for_each_possible_cpu(i) { @@ -1530,6 +1542,10 @@ err_destroy_ports_array: kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); +err_elephant_dp_exit: + ovs_elephant_dp_exit(dp); +err_destroy_elephant_table: + ovs_elephant_tbl_destroy(dp->elephant_table); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_free_dp: @@ -1562,6 +1578,8 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + ovs_elephant_dp_exit(dp); + /* RCU destroy the flow table */ call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1673,6 +1691,9 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_DP_CMD_NEW) < 0) break; + + ovs_elephant_print_flows(dp); + i++; } rcu_read_unlock(); @@ -2189,10 +2210,14 @@ static int __init dp_init(void) if (err) goto error_flow_exit; - err = register_pernet_device(&ovs_net_ops); + err = ovs_elephant_init(); if (err) goto error_vport_exit; + err = register_pernet_device(&ovs_net_ops); + if (err) + goto error_elephant_exit; + err = register_netdevice_notifier(&ovs_dp_device_notifier); if (err) goto error_netns_exit; @@ -2207,6 +2232,8 @@ error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: unregister_pernet_device(&ovs_net_ops); +error_elephant_exit: + ovs_elephant_exit(); error_vport_exit: ovs_vport_exit(); error_flow_exit: @@ -2221,6 +2248,7 @@ static void dp_cleanup(void) unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); rcu_barrier(); + ovs_elephant_exit(); ovs_vport_exit(); ovs_flow_exit(); } diff --git a/datapath/datapath.h b/datapath/datapath.h index d6dee50ad..82377e8d1 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -27,6 +27,7 @@ #include <linux/u64_stats_sync.h> #include "compat.h" +#include "elephant.h" #include "flow.h" #include "flow_table.h" #include "vlan.h" @@ -91,6 +92,9 @@ struct datapath { struct net *net; #endif + /* Elephant flow table. */ + struct elephant_table __rcu *elephant_table; + u32 user_features; }; diff --git a/datapath/elephant.c b/datapath/elephant.c new file mode 100644 index 000000000..4a1724d5e --- /dev/null +++ b/datapath/elephant.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "datapath.h" +#include "elephant.h" +#include "flow.h" +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <linux/workqueue.h> + +struct elephant_flow { + struct rcu_head rcu; + struct hlist_node hash_node[2]; + u32 hash; + + struct sw_flow_key key; + + spinlock_t lock; /* Lock for values below. */ + unsigned long created; /* Time created (in jiffies). */ + unsigned long used; /* Last used time (in jiffies). */ + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + u64 tso_count; /* Number of TSO-sized packets. */ +}; + +#define ELEPHANT_CHECK_INTERVAL (1 * HZ) +#define ELEPHANT_FLOW_LIFE (5 * HZ) +static void elephant_check_table(struct work_struct *work); + +static struct kmem_cache *elephant_table; + +static void ovs_elephant_tbl_insert(struct elephant_table *table, + struct elephant_flow *flow, struct sw_flow_key *key, int key_len); +static void ovs_elephant_tbl_remove(struct elephant_table *table, + struct elephant_flow *flow); + +static struct elephant_flow *ovs_elephant_tbl_lookup(struct elephant_table *table, + struct sw_flow_key *key, int key_len); + + +void ovs_elephant_free(struct elephant_flow *flow); + +static inline int ovs_elephant_tbl_need_to_expand(struct elephant_table *table) +{ + return (table->count > table->n_buckets); +} + +static struct hlist_head *find_bucket(struct elephant_table *table, u32 hash) +{ + hash = jhash_1word(hash, table->hash_seed); + return flex_array_get(table->buckets, + (hash & (table->n_buckets - 1))); +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head *), + n_buckets, GFP_ATOMIC); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_ATOMIC); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + +struct elephant_table *ovs_elephant_tbl_alloc(int new_size) +{ + struct elephant_table *table = kmalloc(sizeof(*table), GFP_ATOMIC); + + if (!table) + return NULL; + + table->buckets = alloc_buckets(new_size); + + if (!table->buckets) { + kfree(table); + return NULL; + } + table->n_buckets = new_size; + table->count = 0; + table->node_ver = 0; + get_random_bytes(&table->hash_seed, sizeof(u32)); + + return table; +} + +void ovs_elephant_tbl_destroy(struct elephant_table *table) +{ + int i; + + if (!table) + return; + + for (i = 0; i < table->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + 
struct hlist_node *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_elephant_free(flow); + } + } + + free_buckets(table->buckets); + kfree(table); +} + +static void elephant_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct elephant_table *table = container_of(rcu, struct elephant_table, rcu); + + ovs_elephant_tbl_destroy(table); +} + +void ovs_elephant_tbl_deferred_destroy(struct elephant_table *table) +{ + if (!table) + return; + + call_rcu(&table->rcu, elephant_tbl_destroy_rcu_cb); +} + +struct elephant_flow *ovs_elephant_tbl_next(struct elephant_table *table, u32 *bucket, u32 *last) +{ + struct elephant_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = table->node_ver; + while (*bucket < table->n_buckets) { + i = 0; + head = flex_array_get(table->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static void __elephant_tbl_insert(struct elephant_table *table, struct elephant_flow *flow) +{ + struct hlist_head *head; + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; +} + +static void elephant_table_copy_flows(struct elephant_table *old, struct elephant_table *new) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + hlist_for_each_entry(flow, head, hash_node[old_ver]) + __elephant_tbl_insert(new, flow); + } +} + +static struct elephant_table *__elephant_tbl_rehash(struct elephant_table *table, int n_buckets) +{ + struct elephant_table *new_table; + + new_table = ovs_elephant_tbl_alloc(n_buckets); + if (!new_table) + return ERR_PTR(-ENOMEM); + + elephant_table_copy_flows(table, new_table); + + return new_table; +} + +struct elephant_table *ovs_elephant_tbl_rehash(struct elephant_table *table) +{ + return __elephant_tbl_rehash(table, table->n_buckets); +} + +struct elephant_table *ovs_elephant_tbl_expand(struct elephant_table *table) +{ + return __elephant_tbl_rehash(table, table->n_buckets * 2); +} + +void ovs_elephant_free(struct elephant_flow *flow) +{ + if (unlikely(!flow)) + return; + + kmem_cache_free(elephant_table, flow); +} + +/* RCU callback used by ovs_elephant_flow_deferred_free. */ +static void rcu_free_elephant_flow_callback(struct rcu_head *rcu) +{ + struct elephant_flow *flow = container_of(rcu, struct elephant_flow, rcu); + + ovs_elephant_free(flow); +} + +/* Schedules 'flow' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_elephant_flow_deferred_free(struct elephant_flow *flow) +{ +/* xxx Still need this? 
*/ + call_rcu(&flow->rcu, rcu_free_elephant_flow_callback); +} + +static u32 ovs_elephant_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) +{ + return jhash2((u32 *)((u8 *)key + key_start), + DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); +} + +static int flow_key_start(struct sw_flow_key *key) +{ + if (key->tun_key.ipv4_dst) + return 0; + else + return offsetof(struct sw_flow_key, phy); +} + +static struct elephant_flow *ovs_elephant_tbl_lookup(struct elephant_table *table, + struct sw_flow_key *key, int key_len) +{ + struct elephant_flow *flow; + struct hlist_head *head; + u8 *_key; + int key_start; + u32 hash; + + key_start = flow_key_start(key); + hash = ovs_elephant_flow_hash(key, key_start, key_len); + + _key = (u8 *) key + key_start; + head = find_bucket(table, hash); + hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { + if (flow->hash == hash && + !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { + return flow; + } + } + return NULL; +} + +static void ovs_elephant_tbl_insert(struct elephant_table *table, + struct elephant_flow *flow, struct sw_flow_key *key, int key_len) +{ + flow->hash = ovs_elephant_flow_hash(key, flow_key_start(key), key_len); + memcpy(&flow->key, key, sizeof(flow->key)); + __elephant_tbl_insert(table, flow); +} + +static void ovs_elephant_tbl_remove(struct elephant_table *table, + struct elephant_flow *flow) +{ + hlist_del_rcu(&flow->hash_node[table->node_ver]); + table->count--; + BUG_ON(table->count < 0); +} + +static void elephant_check_table(struct work_struct *ws) +{ + struct elephant_table *table; + int i; + + table = container_of(ws, struct elephant_table, work.work); + + for (i = 0; i < table->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + if (time_after(jiffies, flow->used + ELEPHANT_FLOW_LIFE)) { + ovs_elephant_tbl_remove(table, flow); + ovs_elephant_flow_deferred_free(flow); + } + } + } + + schedule_delayed_work(&table->work, ELEPHANT_CHECK_INTERVAL); +} + +int ovs_elephant_dp_init(struct datapath *dp) +{ + INIT_DELAYED_WORK(&dp->elephant_table->work, elephant_check_table); + schedule_delayed_work(&dp->elephant_table->work, ELEPHANT_CHECK_INTERVAL); + + return 0; +} + +void ovs_elephant_dp_exit(struct datapath *dp) +{ + cancel_delayed_work_sync(&dp->elephant_table->work); +} + +static struct elephant_flow *ovs_elephant_flow_alloc(void) +{ + struct elephant_flow *flow; + + flow = kmem_cache_alloc(elephant_table, GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&flow->lock); + + return flow; +} + +static void clear_stats(struct elephant_flow *flow) +{ + flow->created = jiffies; + flow->used = 0; + flow->packet_count = 0; + flow->byte_count = 0; + flow->tso_count = 0; +} + +static void print_flow(struct elephant_flow *flow) +{ + /* xxx Only supports non-tunneled IPv4! 
*/ + printk("in_port(%d),ipv4(src=%#x,dst=%#x,proto=%d),tp(src=%d,dst=%d)," + " packets:%lld, bytes:%lld, tso:%lld, created:%d, used:%d\n", + flow->key.phy.in_port, ntohl(flow->key.ipv4.addr.src), + ntohl(flow->key.ipv4.addr.dst), + flow->key.ip.proto, ntohs(flow->key.tp.src), + ntohs(flow->key.tp.dst), + flow->packet_count, flow->byte_count, flow->tso_count, + jiffies_to_msecs(jiffies - flow->created), + jiffies_to_msecs(jiffies - flow->used)); +} + +void ovs_elephant_print_flows(struct datapath *dp) +{ + struct elephant_table *table = dp->elephant_table; + int i; + + printk("--- Elephant Flows ---\n"); + for (i = 0; i < table->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + int ver = table->node_ver; + + hlist_for_each_entry(flow, head, hash_node[ver]) { + print_flow(flow); + } + } +} + +void ovs_elephant_used(struct elephant_flow *flow, const struct sk_buff *skb, + bool is_tso) +{ +/* xxx Is the spin lock safe? */ + spin_lock(&flow->lock); + flow->used = jiffies; + flow->packet_count++; + flow->byte_count += skb->len; + if (is_tso) + flow->tso_count++; + spin_unlock(&flow->lock); +} + +static bool byte_check(const struct elephant_flow *flow, + uint32_t byte_count, uint32_t num_secs) + +{ + if ((flow->byte_count >= byte_count) && + time_after(jiffies, flow->created + HZ * num_secs)) { + return true; + } else + return false; +} + +static bool tso_check(const struct elephant_flow *flow, + uint32_t tso_size, uint32_t tso_count) + +{ + if (flow->tso_count >= tso_count) { + return true; + } else + return false; +} + +bool is_elephant(const struct sk_buff *skb, uint32_t mech, + uint32_t arg1, uint32_t arg2) +{ + struct elephant_table *table; + struct sw_flow_key *key = OVS_CB(skb)->pkt_key; + const struct vport *p = OVS_CB(skb)->input_vport; + struct datapath *dp = p->dp; + struct sw_flow_key elephant_key; + struct elephant_flow *flow; + + if (mech == 0) { + /* Detection disabled */ + return false; + } + + /* Make a copy, since we need to zero-out the TCP flags */ + elephant_key = *key; + elephant_key.tp.flags = 0; + +/* xxx How should I do the locking here? */ + table = dp->elephant_table; + flow = ovs_elephant_tbl_lookup(table, &elephant_key, sizeof(elephant_key)); + if (!flow) { + /* Expand table, if necessary, to make room. */ + if (ovs_elephant_tbl_need_to_expand(table)) { + struct elephant_table *new_table; + + new_table = ovs_elephant_tbl_expand(table); + if (!IS_ERR(new_table)) { + rcu_assign_pointer(dp->elephant_table, new_table); + ovs_elephant_tbl_deferred_destroy(table); + table = dp->elephant_table; + } + } + + /* Allocate flow. */ + flow = ovs_elephant_flow_alloc(); + if (IS_ERR(flow)) { + /* xxx Not the greatest error handling. */ + return false; + } + clear_stats(flow); + + /* Put flow in bucket. */ + ovs_elephant_tbl_insert(table, flow, &elephant_key, + sizeof(elephant_key)); + } + + if ((mech == 2) && (skb->len >= arg1)) + ovs_elephant_used(flow, skb, true); + else + ovs_elephant_used(flow, skb, false); + + if (mech == 1) { + /* Byte counters */ + return byte_check(flow, arg1, arg2); + } else if (mech == 2) { + /* TSO buffers */ + return tso_check(flow, arg1, arg2); + } + + return false; +} + +/* Initializes the elephant module. */ +int ovs_elephant_init(void) +{ + elephant_table = kmem_cache_create("sw_elephant", sizeof(struct sw_flow), + 0, 0, NULL); + if (elephant_table == NULL) + return -ENOMEM; + + return 0; +} + +/* Uninitializes the elephant module. 
*/ +void ovs_elephant_exit(void) +{ + kmem_cache_destroy(elephant_table); +} diff --git a/datapath/elephant.h b/datapath/elephant.h new file mode 100644 index 000000000..471ff773b --- /dev/null +++ b/datapath/elephant.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef ELEPHANT_H +#define ELEPHANT_H 1 + +#include <linux/flex_array.h> +#include <linux/skbuff.h> + +#include "flow.h" + +#define ELEPHANT_TBL_MIN_BUCKETS 1024 + +struct datapath; + +struct elephant_table { + /* xxx Need all these? */ + struct flex_array *buckets; + unsigned int count, n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + struct delayed_work work; +}; + +int ovs_elephant_dp_init(struct datapath *); +void ovs_elephant_dp_exit(struct datapath *); + +struct elephant_table *ovs_elephant_tbl_alloc(int new_size); +void ovs_elephant_tbl_destroy(struct elephant_table *); + +void ovs_elephant_print_flows(struct datapath *dp); +bool is_elephant(const struct sk_buff *, uint32_t mech, uint32_t arg1, + uint32_t arg2); + +int ovs_elephant_init(void); +void ovs_elephant_exit(void); + +#endif /* elephant.h */ diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c index e1eadbbb2..4593103d8 100644 --- a/datapath/flow_netlink.c +++ b/datapath/flow_netlink.c @@ -1433,6 +1433,81 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } +static int validate_and_copy_elephant(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) +{ + const struct nlattr *attrs[OVS_ELEPHANT_ATTR_MAX + 1]; + const struct nlattr *mech, *arg1, *arg2, *dscp, *actions; + const struct nlattr *a; + int rem, start, err, st_acts; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_ELEPHANT_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + mech = attrs[OVS_ELEPHANT_ATTR_DETECT_MECH]; + if (!mech || nla_len(mech) != sizeof(u32)) + return -EINVAL; + + arg1 = attrs[OVS_ELEPHANT_ATTR_DETECT_ARG1]; + if (!arg1 || nla_len(arg1) != sizeof(u32)) + return -EINVAL; + + arg2 = attrs[OVS_ELEPHANT_ATTR_DETECT_ARG2]; + if (!arg2 || nla_len(arg2) != sizeof(u32)) + return -EINVAL; + + dscp = attrs[OVS_ELEPHANT_ATTR_DETECT_DSCP]; + if (!dscp || nla_len(dscp) != sizeof(u8)) + return -EINVAL; + + actions = attrs[OVS_ELEPHANT_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy elephant action. 
*/ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_ELEPHANT); + if (start < 0) + return start; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_MECH, + nla_data(mech), sizeof(u32)); + if (err) + return err; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_ARG1, + nla_data(arg1), sizeof(u32)); + if (err) + return err; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_ARG2, + nla_data(arg2), sizeof(u32)); + if (err) + return err; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_DSCP, + nla_data(dscp), sizeof(u8)); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_ELEPHANT_ATTR_ACTIONS); + if (st_acts < 0) + return st_acts; + + err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa, + eth_type, vlan_tci); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; +} + static int validate_tp_port(const struct sw_flow_key *flow_key, __be16 eth_type) { @@ -1670,6 +1745,7 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr, const struct nlattr *a; int rem, err; + /* xxx What do we need to do for elephants? */ if (depth >= SAMPLE_ACTION_DEPTH) return -EOVERFLOW; @@ -1685,7 +1761,8 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr, [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_ELEPHANT] = (u32)-1 }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1791,6 +1868,14 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_ELEPHANT: + err = validate_and_copy_elephant(a, key, depth, sfa, + eth_type, vlan_tci); + if (err) + return err; + skip_copy = true; + break; + default: return -EINVAL; } @@ -1851,6 +1936,58 @@ static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) return err; } +static int elephant_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_ELEPHANT); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_elephant; + + switch (type) { + case OVS_ELEPHANT_ATTR_DETECT_MECH: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_MECH, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_DETECT_ARG1: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_ARG1, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_DETECT_ARG2: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_ARG2, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_DETECT_DSCP: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_DSCP, + sizeof(u8), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_ACTIONS: + st_elephant = nla_nest_start(skb, OVS_ELEPHANT_ATTR_ACTIONS); + if (!st_elephant) + return -EMSGSIZE; + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_elephant); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -1904,6 +2041,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case 
OVS_ACTION_ATTR_ELEPHANT: + err = elephant_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/datapath/linux/.gitignore b/datapath/linux/.gitignore index be233fcc3..6c14295ed 100644 --- a/datapath/linux/.gitignore +++ b/datapath/linux/.gitignore @@ -11,6 +11,7 @@ /datapath.c /dp_dev.c /dp_notify.c +/elephant.c /exthdrs_core.c /flex_array.c /flow.c diff --git a/datapath/linux/compat/include/linux/kernel.h b/datapath/linux/compat/include/linux/kernel.h index 5dfe08e91..bbb04f18f 100644 --- a/datapath/linux/compat/include/linux/kernel.h +++ b/datapath/linux/compat/include/linux/kernel.h @@ -46,6 +46,12 @@ #endif +#ifndef U8_MAX +#define U8_MAX ((u8)(~0U)) +#define S8_MAX ((s8)(U8_MAX>>1)) +#define S8_MIN ((s8)(-S8_MAX - 1)) +#endif + #ifndef USHRT_MAX #define USHRT_MAX ((u16)(~0U)) #define SHRT_MAX ((s16)(USHRT_MAX>>1)) |
