diff options
author | Justin Pettit <jpettit@nicira.com> | 2014-04-28 14:25:06 -0700 |
---|---|---|
committer | Justin Pettit <jpettit@nicira.com> | 2014-07-25 12:05:20 -0700 |
commit | 816f3bca9f5d23a0cf3b2ec922382f72b7b1b0d6 (patch) | |
tree | d0c40aa9be6af9ab84fb2d0b93d8152787806861 | |
parent | df0e5f55763289e37f90d1f2464423f07478f372 (diff) | |
download | openvswitch-elephant.tar.gz |
Initial check-in of kernel-based elephant flow detection.elephant
Areas to work on:
- Doesn't populate "elephant-flows" field.
- Doesn't properly handle tunnels.
- Doesn't have clean way to query elephant table.
- Double-check locking.
- Should use names instead of number for mechanism.
- When changing detection mechanism, should clear old table.
- Breaks unit tests
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | datapath/Modules.mk | 2 | ||||
-rw-r--r-- | datapath/actions.c | 76 | ||||
-rw-r--r-- | datapath/datapath.c | 32 | ||||
-rw-r--r-- | datapath/datapath.h | 4 | ||||
-rw-r--r-- | datapath/elephant.c | 517 | ||||
-rw-r--r-- | datapath/elephant.h | 54 | ||||
-rw-r--r-- | datapath/flow_netlink.c | 146 | ||||
-rw-r--r-- | datapath/linux/.gitignore | 1 | ||||
-rw-r--r-- | datapath/linux/compat/include/linux/kernel.h | 6 | ||||
-rw-r--r-- | include/linux/openvswitch.h | 29 | ||||
-rw-r--r-- | lib/dpif-netdev.c | 1 | ||||
-rw-r--r-- | lib/dpif.c | 1 | ||||
-rw-r--r-- | lib/odp-execute.c | 4 | ||||
-rw-r--r-- | lib/odp-util.c | 78 | ||||
-rw-r--r-- | ofproto/automake.mk | 2 | ||||
-rw-r--r-- | ofproto/ofproto-dpif-elephant.c | 58 | ||||
-rw-r--r-- | ofproto/ofproto-dpif-elephant.h | 40 | ||||
-rw-r--r-- | ofproto/ofproto-dpif-xlate.c | 70 | ||||
-rw-r--r-- | ofproto/ofproto-dpif-xlate.h | 6 | ||||
-rw-r--r-- | ofproto/ofproto-dpif.c | 41 | ||||
-rw-r--r-- | ofproto/ofproto-dpif.h | 2 | ||||
-rw-r--r-- | ofproto/ofproto-provider.h | 14 | ||||
-rw-r--r-- | ofproto/ofproto.c | 19 | ||||
-rw-r--r-- | ofproto/ofproto.h | 3 | ||||
-rw-r--r-- | vswitchd/bridge.c | 42 | ||||
-rw-r--r-- | vswitchd/vswitch.ovsschema | 7 | ||||
-rw-r--r-- | vswitchd/vswitch.xml | 69 |
28 files changed, 1311 insertions, 15 deletions
diff --git a/configure.ac b/configure.ac index 971c7b373..8391337c6 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. AC_PREREQ(2.63) -AC_INIT(openvswitch, 2.3.90, bugs@openvswitch.org) +AC_INIT(openvswitch, 2.3.90-ele1, bugs@openvswitch.org) AC_CONFIG_SRCDIR([datapath/datapath.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/datapath/Modules.mk b/datapath/Modules.mk index 90e158cd2..a4b4d4644 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -10,6 +10,7 @@ openvswitch_sources = \ actions.c \ datapath.c \ dp_notify.c \ + elephant.c \ flow.c \ flow_netlink.c \ flow_table.c \ @@ -24,6 +25,7 @@ openvswitch_sources = \ openvswitch_headers = \ compat.h \ datapath.h \ + elephant.h \ flow.h \ flow_netlink.h \ flow_table.h \ diff --git a/datapath/actions.c b/datapath/actions.c index 39a21f4ab..2d18c6616 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -37,6 +37,7 @@ #include "datapath.h" #include "gso.h" #include "mpls.h" +#include "elephant.h" #include "vlan.h" #include "vport.h" @@ -583,6 +584,75 @@ static int sample(struct datapath *dp, struct sk_buff *skb, return do_execute_actions(dp, sample_skb, a, rem); } +static int elephant(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + struct sw_flow_key *key = OVS_CB(skb)->pkt_key; + uint32_t mech=0, arg1=0, arg2=0; + uint8_t dscp = U8_MAX; + const struct nlattr *acts_list = NULL; + const struct nlattr *a; + int rem; + + /* We only process IP packets. */ + if (key->eth.type != htons(ETH_P_IP) && + key->eth.type != htons(ETH_P_IPV6)) + return 0; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_ELEPHANT_ATTR_DETECT_MECH: + mech = nla_get_u32(a); + break; + + case OVS_ELEPHANT_ATTR_DETECT_ARG1: + arg1 = nla_get_u32(a); + break; + + case OVS_ELEPHANT_ATTR_DETECT_ARG2: + arg2 = nla_get_u32(a); + break; + + case OVS_ELEPHANT_ATTR_DETECT_DSCP: + dscp = nla_get_u8(a); + break; + + case OVS_ELEPHANT_ATTR_ACTIONS: + acts_list = a; + break; + } + } + + if (!is_elephant(skb, mech, arg1, arg2)) + return 0; + + if (dscp != U8_MAX) { + struct iphdr *nh = ip_hdr(skb); + int err; + + err = make_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + ipv4_change_dsfield(nh, 0x03, dscp<<2); + } + + /* xxx We need to make sure that only "set" or userspace actions are + * xxx provided in the verification code. */ + + /* The only action with a side-effect that is allowed is the "set" + * action. Since the do_execute_actions() never consumes 'skb', a + * skb_get(skb) call prevents consumption by do_execute_actions(). + * Thus, it is safe to simply return the error code and let the + * caller (also do_execute_actions()) free skb on error. */ + skb_get(skb); + + return do_execute_actions(dp, skb, nla_data(acts_list), + nla_len(acts_list)); +} + static void execute_hash(struct sk_buff *skb, const struct nlattr *attr) { struct sw_flow_key *key = OVS_CB(skb)->pkt_key; @@ -751,6 +821,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); break; + + case OVS_ACTION_ATTR_ELEPHANT: + err = elephant(dp, skb, a); + if (unlikely(err)) /* skb already freed. */ + return err; + break; } if (unlikely(err)) { diff --git a/datapath/datapath.c b/datapath/datapath.c index 94539ebff..cbe36eff8 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -55,6 +55,7 @@ #include <net/netns/generic.h> #include "datapath.h" +#include "elephant.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" @@ -198,6 +199,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) struct datapath *dp = container_of(rcu, struct datapath, rcu); ovs_flow_tbl_destroy(&dp->table); + ovs_elephant_tbl_destroy(dp->elephant_table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -1460,10 +1462,20 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_free_dp; + /* Allocate elephant table. */ + rcu_assign_pointer(dp->elephant_table, + ovs_elephant_tbl_alloc(ELEPHANT_TBL_MIN_BUCKETS)); + if (!dp->elephant_table) + goto err_destroy_table; + + err = ovs_elephant_dp_init(dp); + if (err) + goto err_destroy_elephant_table; + dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); if (!dp->stats_percpu) { err = -ENOMEM; - goto err_destroy_table; + goto err_elephant_dp_exit; } for_each_possible_cpu(i) { @@ -1530,6 +1542,10 @@ err_destroy_ports_array: kfree(dp->ports); err_destroy_percpu: free_percpu(dp->stats_percpu); +err_elephant_dp_exit: + ovs_elephant_dp_exit(dp); +err_destroy_elephant_table: + ovs_elephant_tbl_destroy(dp->elephant_table); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_free_dp: @@ -1562,6 +1578,8 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + ovs_elephant_dp_exit(dp); + /* RCU destroy the flow table */ call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1673,6 +1691,9 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_DP_CMD_NEW) < 0) break; + + ovs_elephant_print_flows(dp); + i++; } rcu_read_unlock(); @@ -2189,10 +2210,14 @@ static int __init dp_init(void) if (err) goto error_flow_exit; - err = register_pernet_device(&ovs_net_ops); + err = ovs_elephant_init(); if (err) goto error_vport_exit; + err = register_pernet_device(&ovs_net_ops); + if (err) + goto error_elephant_exit; + err = register_netdevice_notifier(&ovs_dp_device_notifier); if (err) goto error_netns_exit; @@ -2207,6 +2232,8 @@ error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: unregister_pernet_device(&ovs_net_ops); +error_elephant_exit: + ovs_elephant_exit(); error_vport_exit: ovs_vport_exit(); error_flow_exit: @@ -2221,6 +2248,7 @@ static void dp_cleanup(void) unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); rcu_barrier(); + ovs_elephant_exit(); ovs_vport_exit(); ovs_flow_exit(); } diff --git a/datapath/datapath.h b/datapath/datapath.h index d6dee50ad..82377e8d1 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -27,6 +27,7 @@ #include <linux/u64_stats_sync.h> #include "compat.h" +#include "elephant.h" #include "flow.h" #include "flow_table.h" #include "vlan.h" @@ -91,6 +92,9 @@ struct datapath { struct net *net; #endif + /* Elephant flow table. */ + struct elephant_table __rcu *elephant_table; + u32 user_features; }; diff --git a/datapath/elephant.c b/datapath/elephant.c new file mode 100644 index 000000000..4a1724d5e --- /dev/null +++ b/datapath/elephant.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "datapath.h" +#include "elephant.h" +#include "flow.h" +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <linux/workqueue.h> + +struct elephant_flow { + struct rcu_head rcu; + struct hlist_node hash_node[2]; + u32 hash; + + struct sw_flow_key key; + + spinlock_t lock; /* Lock for values below. */ + unsigned long created; /* Time created (in jiffies). */ + unsigned long used; /* Last used time (in jiffies). */ + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + u64 tso_count; /* Number of TSO-sized packets. */ +}; + +#define ELEPHANT_CHECK_INTERVAL (1 * HZ) +#define ELEPHANT_FLOW_LIFE (5 * HZ) +static void elephant_check_table(struct work_struct *work); + +static struct kmem_cache *elephant_table; + +static void ovs_elephant_tbl_insert(struct elephant_table *table, + struct elephant_flow *flow, struct sw_flow_key *key, int key_len); +static void ovs_elephant_tbl_remove(struct elephant_table *table, + struct elephant_flow *flow); + +static struct elephant_flow *ovs_elephant_tbl_lookup(struct elephant_table *table, + struct sw_flow_key *key, int key_len); + + +void ovs_elephant_free(struct elephant_flow *flow); + +static inline int ovs_elephant_tbl_need_to_expand(struct elephant_table *table) +{ + return (table->count > table->n_buckets); +} + +static struct hlist_head *find_bucket(struct elephant_table *table, u32 hash) +{ + hash = jhash_1word(hash, table->hash_seed); + return flex_array_get(table->buckets, + (hash & (table->n_buckets - 1))); +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head *), + n_buckets, GFP_ATOMIC); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_ATOMIC); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + +struct elephant_table *ovs_elephant_tbl_alloc(int new_size) +{ + struct elephant_table *table = kmalloc(sizeof(*table), GFP_ATOMIC); + + if (!table) + return NULL; + + table->buckets = alloc_buckets(new_size); + + if (!table->buckets) { + kfree(table); + return NULL; + } + table->n_buckets = new_size; + table->count = 0; + table->node_ver = 0; + get_random_bytes(&table->hash_seed, sizeof(u32)); + + return table; +} + +void ovs_elephant_tbl_destroy(struct elephant_table *table) +{ + int i; + + if (!table) + return; + + for (i = 0; i < table->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_elephant_free(flow); + } + } + + free_buckets(table->buckets); + kfree(table); +} + +static void elephant_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct elephant_table *table = container_of(rcu, struct elephant_table, rcu); + + ovs_elephant_tbl_destroy(table); +} + +void ovs_elephant_tbl_deferred_destroy(struct elephant_table *table) +{ + if (!table) + return; + + call_rcu(&table->rcu, elephant_tbl_destroy_rcu_cb); +} + +struct elephant_flow *ovs_elephant_tbl_next(struct elephant_table *table, u32 *bucket, u32 *last) +{ + struct elephant_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = table->node_ver; + while (*bucket < table->n_buckets) { + i = 0; + head = flex_array_get(table->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static void __elephant_tbl_insert(struct elephant_table *table, struct elephant_flow *flow) +{ + struct hlist_head *head; + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; +} + +static void elephant_table_copy_flows(struct elephant_table *old, struct elephant_table *new) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + hlist_for_each_entry(flow, head, hash_node[old_ver]) + __elephant_tbl_insert(new, flow); + } +} + +static struct elephant_table *__elephant_tbl_rehash(struct elephant_table *table, int n_buckets) +{ + struct elephant_table *new_table; + + new_table = ovs_elephant_tbl_alloc(n_buckets); + if (!new_table) + return ERR_PTR(-ENOMEM); + + elephant_table_copy_flows(table, new_table); + + return new_table; +} + +struct elephant_table *ovs_elephant_tbl_rehash(struct elephant_table *table) +{ + return __elephant_tbl_rehash(table, table->n_buckets); +} + +struct elephant_table *ovs_elephant_tbl_expand(struct elephant_table *table) +{ + return __elephant_tbl_rehash(table, table->n_buckets * 2); +} + +void ovs_elephant_free(struct elephant_flow *flow) +{ + if (unlikely(!flow)) + return; + + kmem_cache_free(elephant_table, flow); +} + +/* RCU callback used by ovs_elephant_flow_deferred_free. */ +static void rcu_free_elephant_flow_callback(struct rcu_head *rcu) +{ + struct elephant_flow *flow = container_of(rcu, struct elephant_flow, rcu); + + ovs_elephant_free(flow); +} + +/* Schedules 'flow' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_elephant_flow_deferred_free(struct elephant_flow *flow) +{ +/* xxx Still need this? */ + call_rcu(&flow->rcu, rcu_free_elephant_flow_callback); +} + +static u32 ovs_elephant_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) +{ + return jhash2((u32 *)((u8 *)key + key_start), + DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); +} + +static int flow_key_start(struct sw_flow_key *key) +{ + if (key->tun_key.ipv4_dst) + return 0; + else + return offsetof(struct sw_flow_key, phy); +} + +static struct elephant_flow *ovs_elephant_tbl_lookup(struct elephant_table *table, + struct sw_flow_key *key, int key_len) +{ + struct elephant_flow *flow; + struct hlist_head *head; + u8 *_key; + int key_start; + u32 hash; + + key_start = flow_key_start(key); + hash = ovs_elephant_flow_hash(key, key_start, key_len); + + _key = (u8 *) key + key_start; + head = find_bucket(table, hash); + hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { + if (flow->hash == hash && + !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { + return flow; + } + } + return NULL; +} + +static void ovs_elephant_tbl_insert(struct elephant_table *table, + struct elephant_flow *flow, struct sw_flow_key *key, int key_len) +{ + flow->hash = ovs_elephant_flow_hash(key, flow_key_start(key), key_len); + memcpy(&flow->key, key, sizeof(flow->key)); + __elephant_tbl_insert(table, flow); +} + +static void ovs_elephant_tbl_remove(struct elephant_table *table, + struct elephant_flow *flow) +{ + hlist_del_rcu(&flow->hash_node[table->node_ver]); + table->count--; + BUG_ON(table->count < 0); +} + +static void elephant_check_table(struct work_struct *ws) +{ + struct elephant_table *table; + int i; + + table = container_of(ws, struct elephant_table, work.work); + + for (i = 0; i < table->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + if (time_after(jiffies, flow->used + ELEPHANT_FLOW_LIFE)) { + ovs_elephant_tbl_remove(table, flow); + ovs_elephant_flow_deferred_free(flow); + } + } + } + + schedule_delayed_work(&table->work, ELEPHANT_CHECK_INTERVAL); +} + +int ovs_elephant_dp_init(struct datapath *dp) +{ + INIT_DELAYED_WORK(&dp->elephant_table->work, elephant_check_table); + schedule_delayed_work(&dp->elephant_table->work, ELEPHANT_CHECK_INTERVAL); + + return 0; +} + +void ovs_elephant_dp_exit(struct datapath *dp) +{ + cancel_delayed_work_sync(&dp->elephant_table->work); +} + +static struct elephant_flow *ovs_elephant_flow_alloc(void) +{ + struct elephant_flow *flow; + + flow = kmem_cache_alloc(elephant_table, GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&flow->lock); + + return flow; +} + +static void clear_stats(struct elephant_flow *flow) +{ + flow->created = jiffies; + flow->used = 0; + flow->packet_count = 0; + flow->byte_count = 0; + flow->tso_count = 0; +} + +static void print_flow(struct elephant_flow *flow) +{ + /* xxx Only supports non-tunneled IPv4! */ + printk("in_port(%d),ipv4(src=%#x,dst=%#x,proto=%d),tp(src=%d,dst=%d)," + " packets:%lld, bytes:%lld, tso:%lld, created:%d, used:%d\n", + flow->key.phy.in_port, ntohl(flow->key.ipv4.addr.src), + ntohl(flow->key.ipv4.addr.dst), + flow->key.ip.proto, ntohs(flow->key.tp.src), + ntohs(flow->key.tp.dst), + flow->packet_count, flow->byte_count, flow->tso_count, + jiffies_to_msecs(jiffies - flow->created), + jiffies_to_msecs(jiffies - flow->used)); +} + +void ovs_elephant_print_flows(struct datapath *dp) +{ + struct elephant_table *table = dp->elephant_table; + int i; + + printk("--- Elephant Flows ---\n"); + for (i = 0; i < table->n_buckets; i++) { + struct elephant_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + int ver = table->node_ver; + + hlist_for_each_entry(flow, head, hash_node[ver]) { + print_flow(flow); + } + } +} + +void ovs_elephant_used(struct elephant_flow *flow, const struct sk_buff *skb, + bool is_tso) +{ +/* xxx Is the spin lock safe? */ + spin_lock(&flow->lock); + flow->used = jiffies; + flow->packet_count++; + flow->byte_count += skb->len; + if (is_tso) + flow->tso_count++; + spin_unlock(&flow->lock); +} + +static bool byte_check(const struct elephant_flow *flow, + uint32_t byte_count, uint32_t num_secs) + +{ + if ((flow->byte_count >= byte_count) && + time_after(jiffies, flow->created + HZ * num_secs)) { + return true; + } else + return false; +} + +static bool tso_check(const struct elephant_flow *flow, + uint32_t tso_size, uint32_t tso_count) + +{ + if (flow->tso_count >= tso_count) { + return true; + } else + return false; +} + +bool is_elephant(const struct sk_buff *skb, uint32_t mech, + uint32_t arg1, uint32_t arg2) +{ + struct elephant_table *table; + struct sw_flow_key *key = OVS_CB(skb)->pkt_key; + const struct vport *p = OVS_CB(skb)->input_vport; + struct datapath *dp = p->dp; + struct sw_flow_key elephant_key; + struct elephant_flow *flow; + + if (mech == 0) { + /* Detection disabled */ + return false; + } + + /* Make a copy, since we need to zero-out the TCP flags */ + elephant_key = *key; + elephant_key.tp.flags = 0; + +/* xxx How should I do the locking here? */ + table = dp->elephant_table; + flow = ovs_elephant_tbl_lookup(table, &elephant_key, sizeof(elephant_key)); + if (!flow) { + /* Expand table, if necessary, to make room. */ + if (ovs_elephant_tbl_need_to_expand(table)) { + struct elephant_table *new_table; + + new_table = ovs_elephant_tbl_expand(table); + if (!IS_ERR(new_table)) { + rcu_assign_pointer(dp->elephant_table, new_table); + ovs_elephant_tbl_deferred_destroy(table); + table = dp->elephant_table; + } + } + + /* Allocate flow. */ + flow = ovs_elephant_flow_alloc(); + if (IS_ERR(flow)) { + /* xxx Not the greatest error handling. */ + return false; + } + clear_stats(flow); + + /* Put flow in bucket. */ + ovs_elephant_tbl_insert(table, flow, &elephant_key, + sizeof(elephant_key)); + } + + if ((mech == 2) && (skb->len >= arg1)) + ovs_elephant_used(flow, skb, true); + else + ovs_elephant_used(flow, skb, false); + + if (mech == 1) { + /* Byte counters */ + return byte_check(flow, arg1, arg2); + } else if (mech == 2) { + /* TSO buffers */ + return tso_check(flow, arg1, arg2); + } + + return false; +} + +/* Initializes the elephant module. */ +int ovs_elephant_init(void) +{ + elephant_table = kmem_cache_create("sw_elephant", sizeof(struct sw_flow), + 0, 0, NULL); + if (elephant_table == NULL) + return -ENOMEM; + + return 0; +} + +/* Uninitializes the elephant module. */ +void ovs_elephant_exit(void) +{ + kmem_cache_destroy(elephant_table); +} diff --git a/datapath/elephant.h b/datapath/elephant.h new file mode 100644 index 000000000..471ff773b --- /dev/null +++ b/datapath/elephant.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef ELEPHANT_H +#define ELEPHANT_H 1 + +#include <linux/flex_array.h> +#include <linux/skbuff.h> + +#include "flow.h" + +#define ELEPHANT_TBL_MIN_BUCKETS 1024 + +struct datapath; + +struct elephant_table { + /* xxx Need all these? */ + struct flex_array *buckets; + unsigned int count, n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + struct delayed_work work; +}; + +int ovs_elephant_dp_init(struct datapath *); +void ovs_elephant_dp_exit(struct datapath *); + +struct elephant_table *ovs_elephant_tbl_alloc(int new_size); +void ovs_elephant_tbl_destroy(struct elephant_table *); + +void ovs_elephant_print_flows(struct datapath *dp); +bool is_elephant(const struct sk_buff *, uint32_t mech, uint32_t arg1, + uint32_t arg2); + +int ovs_elephant_init(void); +void ovs_elephant_exit(void); + +#endif /* elephant.h */ diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c index e1eadbbb2..4593103d8 100644 --- a/datapath/flow_netlink.c +++ b/datapath/flow_netlink.c @@ -1433,6 +1433,81 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } +static int validate_and_copy_elephant(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) +{ + const struct nlattr *attrs[OVS_ELEPHANT_ATTR_MAX + 1]; + const struct nlattr *mech, *arg1, *arg2, *dscp, *actions; + const struct nlattr *a; + int rem, start, err, st_acts; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_ELEPHANT_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + mech = attrs[OVS_ELEPHANT_ATTR_DETECT_MECH]; + if (!mech || nla_len(mech) != sizeof(u32)) + return -EINVAL; + + arg1 = attrs[OVS_ELEPHANT_ATTR_DETECT_ARG1]; + if (!arg1 || nla_len(arg1) != sizeof(u32)) + return -EINVAL; + + arg2 = attrs[OVS_ELEPHANT_ATTR_DETECT_ARG2]; + if (!arg2 || nla_len(arg2) != sizeof(u32)) + return -EINVAL; + + dscp = attrs[OVS_ELEPHANT_ATTR_DETECT_DSCP]; + if (!dscp || nla_len(dscp) != sizeof(u8)) + return -EINVAL; + + actions = attrs[OVS_ELEPHANT_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy elephant action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_ELEPHANT); + if (start < 0) + return start; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_MECH, + nla_data(mech), sizeof(u32)); + if (err) + return err; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_ARG1, + nla_data(arg1), sizeof(u32)); + if (err) + return err; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_ARG2, + nla_data(arg2), sizeof(u32)); + if (err) + return err; + err = add_action(sfa, OVS_ELEPHANT_ATTR_DETECT_DSCP, + nla_data(dscp), sizeof(u8)); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_ELEPHANT_ATTR_ACTIONS); + if (st_acts < 0) + return st_acts; + + err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa, + eth_type, vlan_tci); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; +} + static int validate_tp_port(const struct sw_flow_key *flow_key, __be16 eth_type) { @@ -1670,6 +1745,7 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr, const struct nlattr *a; int rem, err; + /* xxx What do we need to do for elephants? */ if (depth >= SAMPLE_ACTION_DEPTH) return -EOVERFLOW; @@ -1685,7 +1761,8 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr, [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_ELEPHANT] = (u32)-1 }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1791,6 +1868,14 @@ static int ovs_nla_copy_actions__(const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_ELEPHANT: + err = validate_and_copy_elephant(a, key, depth, sfa, + eth_type, vlan_tci); + if (err) + return err; + skip_copy = true; + break; + default: return -EINVAL; } @@ -1851,6 +1936,58 @@ static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) return err; } +static int elephant_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_ELEPHANT); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_elephant; + + switch (type) { + case OVS_ELEPHANT_ATTR_DETECT_MECH: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_MECH, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_DETECT_ARG1: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_ARG1, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_DETECT_ARG2: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_ARG2, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_DETECT_DSCP: + if (nla_put(skb, OVS_ELEPHANT_ATTR_DETECT_DSCP, + sizeof(u8), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_ELEPHANT_ATTR_ACTIONS: + st_elephant = nla_nest_start(skb, OVS_ELEPHANT_ATTR_ACTIONS); + if (!st_elephant) + return -EMSGSIZE; + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_elephant); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -1904,6 +2041,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case OVS_ACTION_ATTR_ELEPHANT: + err = elephant_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/datapath/linux/.gitignore b/datapath/linux/.gitignore index be233fcc3..6c14295ed 100644 --- a/datapath/linux/.gitignore +++ b/datapath/linux/.gitignore @@ -11,6 +11,7 @@ /datapath.c /dp_dev.c /dp_notify.c +/elephant.c /exthdrs_core.c /flex_array.c /flow.c diff --git a/datapath/linux/compat/include/linux/kernel.h b/datapath/linux/compat/include/linux/kernel.h index 5dfe08e91..bbb04f18f 100644 --- a/datapath/linux/compat/include/linux/kernel.h +++ b/datapath/linux/compat/include/linux/kernel.h @@ -46,6 +46,12 @@ #endif +#ifndef U8_MAX +#define U8_MAX ((u8)(~0U)) +#define S8_MAX ((s8)(U8_MAX>>1)) +#define S8_MIN ((s8)(-S8_MAX - 1)) +#endif + #ifndef USHRT_MAX #define USHRT_MAX ((u16)(~0U)) #define SHRT_MAX ((s16)(USHRT_MAX>>1)) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 271a14ebf..38e7b4228 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -566,6 +566,34 @@ struct ovs_action_hash { }; /** + * + * xxx Very ugly. Should make configuration better, too. + * + * enum ovs_elephant_attr - Attributes for %OVS_ACTION_ATTR_ELEPHANT action. + * @OVS_SAMPLE_ATTR_PROBABILITY: 32-bit fraction of packets to sample with + * @OVS_ACTION_ATTR_ELEPHANT. A value of 0 samples no packets, a value of + * %UINT32_MAX samples all packets and intermediate values sample intermediate + * fractions of packets. + * @OVS_ELEPHANT_ATTR_ACTIONS: Set of actions to execute if flow is + * determined to be an elephant. Actions are passed as nested attributes. + * + * Executes the specified actions with the given probability on a per-packet + * basis. + */ +enum ovs_elephant_attr { + OVS_ELEPHANT_ATTR_UNSPEC, + OVS_ELEPHANT_ATTR_DETECT_MECH, /* u32 detection mechanism */ + OVS_ELEPHANT_ATTR_DETECT_ARG1, /* u32 detection first argument */ + OVS_ELEPHANT_ATTR_DETECT_ARG2, /* u32 detection second argument */ + OVS_ELEPHANT_ATTR_DETECT_DSCP, /* u8 DSCP value, 0 if unchanged */ + /* xxx The actions aren't currently being used. Consider removal. */ + OVS_ELEPHANT_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + __OVS_ELEPHANT_ATTR_MAX, +}; + +#define OVS_ELEPHANT_ATTR_MAX (__OVS_ELEPHANT_ATTR_MAX - 1) + +/** * enum ovs_action_attr - Action types. * * @OVS_ACTION_ATTR_OUTPUT: Output packet to port. @@ -608,6 +636,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. */ + OVS_ACTION_ATTR_ELEPHANT, /* Nested OVS_ACTION_ATTR_*. */ __OVS_ACTION_ATTR_MAX }; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 8422c8975..c5b3a5120 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -2343,6 +2343,7 @@ dp_execute_cb(void *aux_, struct dpif_packet **packets, int cnt, case OVS_ACTION_ATTR_POP_MPLS: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SAMPLE: + case OVS_ACTION_ATTR_ELEPHANT: case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); diff --git a/lib/dpif.c b/lib/dpif.c index a3258057c..e1d9686a2 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1122,6 +1122,7 @@ dpif_execute_helper_cb(void *aux_, struct dpif_packet **packets, int cnt, case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_UNSPEC: + case OVS_ACTION_ATTR_ELEPHANT: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } diff --git a/lib/odp-execute.c b/lib/odp-execute.c index cb89e72fa..0c5a32b08 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -314,6 +314,10 @@ odp_execute_actions__(void *dp, struct dpif_packet **packets, int cnt, } break; + case OVS_ACTION_ATTR_ELEPHANT: + /* xxx This isn't supported in userspace. */ + break; + case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); diff --git a/lib/odp-util.c b/lib/odp-util.c index 162d85a70..1fc10f940 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -83,6 +83,7 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_HASH: return sizeof(struct ovs_action_hash); case OVS_ACTION_ATTR_SET: return -2; case OVS_ACTION_ATTR_SAMPLE: return -2; + case OVS_ACTION_ATTR_ELEPHANT: return -2; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: @@ -180,6 +181,40 @@ format_odp_sample_action(struct ds *ds, const struct nlattr *attr) ds_put_format(ds, "))"); } +static void +format_odp_elephant_action(struct ds *ds, const struct nlattr *attr) +{ + static const struct nl_policy ovs_elephant_policy[] = { + [OVS_ELEPHANT_ATTR_DETECT_MECH] = { .type = NL_A_U32 }, + [OVS_ELEPHANT_ATTR_DETECT_ARG1] = { .type = NL_A_U32 }, + [OVS_ELEPHANT_ATTR_DETECT_ARG2] = { .type = NL_A_U32 }, + [OVS_ELEPHANT_ATTR_DETECT_DSCP] = { .type = NL_A_U8 }, + [OVS_ELEPHANT_ATTR_ACTIONS] = { .type = NL_A_NESTED } + }; + struct nlattr *a[ARRAY_SIZE(ovs_elephant_policy)]; + const struct nlattr *nla_acts; + int len; + + ds_put_cstr(ds, "elephant"); + + if (!nl_parse_nested(attr, ovs_elephant_policy, a, ARRAY_SIZE(a))) { + ds_put_cstr(ds, "(error)"); + return; + } + + ds_put_format(ds, "(mech=%d,arg1=%d,arg2=%d,dscp=%d,", + nl_attr_get_u32(a[OVS_ELEPHANT_ATTR_DETECT_MECH]), + nl_attr_get_u32(a[OVS_ELEPHANT_ATTR_DETECT_ARG1]), + nl_attr_get_u32(a[OVS_ELEPHANT_ATTR_DETECT_ARG2]), + nl_attr_get_u8(a[OVS_ELEPHANT_ATTR_DETECT_DSCP])); + + ds_put_cstr(ds, "actions("); + nla_acts = nl_attr_get(a[OVS_ELEPHANT_ATTR_ACTIONS]); + len = nl_attr_get_size(a[OVS_ELEPHANT_ATTR_ACTIONS]); + format_odp_actions(ds, nla_acts, len); + ds_put_format(ds, "))"); +} + static const char * slow_path_reason_to_string(uint32_t reason) { @@ -467,6 +502,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a) case OVS_ACTION_ATTR_SAMPLE: format_odp_sample_action(ds, a); break; + case OVS_ACTION_ATTR_ELEPHANT: + format_odp_elephant_action(ds, a); + break; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: default: @@ -708,6 +746,46 @@ parse_odp_action(const char *s, const struct simap *port_names, } } + { + uint32_t mech, arg1, arg2; + uint8_t dscp; + int n = -1; + + if (ovs_scan(s, "elephant(mech=%"SCNi32",arg1=%"SCNi32 + ",arg2=%"SCNi32",dscp=%"SCNi8",actions(%n", + &mech, &arg1, &arg2, &dscp, &n)) { + size_t elephant_ofs, actions_ofs; + + elephant_ofs = nl_msg_start_nested(actions, + OVS_ACTION_ATTR_ELEPHANT); + nl_msg_put_u32(actions, OVS_ELEPHANT_ATTR_DETECT_MECH, mech); + nl_msg_put_u32(actions, OVS_ELEPHANT_ATTR_DETECT_ARG1, arg1); + nl_msg_put_u32(actions, OVS_ELEPHANT_ATTR_DETECT_ARG2, arg2); + nl_msg_put_u8(actions, OVS_ELEPHANT_ATTR_DETECT_DSCP, dscp); + + actions_ofs = nl_msg_start_nested(actions, + OVS_ELEPHANT_ATTR_ACTIONS); + for (;;) { + int retval; + + n += strspn(s + n, delimiters); + if (s[n] == ')') { + break; + } + + retval = parse_odp_action(s + n, port_names, actions); + if (retval < 0) { + return retval; + } + n += retval; + } + nl_msg_end_nested(actions, actions_ofs); + nl_msg_end_nested(actions, elephant_ofs); + + return s[n + 1] == ')' ? n + 2 : -EINVAL; + } + } + return -EINVAL; } diff --git a/ofproto/automake.mk b/ofproto/automake.mk index 22c50d10a..2f07f6c8d 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -25,6 +25,8 @@ ofproto_libofproto_la_SOURCES = \ ofproto/ofproto.h \ ofproto/ofproto-dpif.c \ ofproto/ofproto-dpif.h \ + ofproto/ofproto-dpif-elephant.c \ + ofproto/ofproto-dpif-elephant.h \ ofproto/ofproto-dpif-ipfix.c \ ofproto/ofproto-dpif-ipfix.h \ ofproto/ofproto-dpif-mirror.c \ diff --git a/ofproto/ofproto-dpif-elephant.c b/ofproto/ofproto-dpif-elephant.c new file mode 100644 index 000000000..d6bfd1df9 --- /dev/null +++ b/ofproto/ofproto-dpif-elephant.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <config.h> +#include "ofproto-dpif-elephant.h" +#include "ofproto.h" + +struct dpif_elephant * +dpif_elephant_create(void) +{ + struct dpif_elephant *de; + de = xzalloc(sizeof *de); + ovs_refcount_init(&de->ref_cnt); + return de; +} + +struct dpif_elephant * +dpif_elephant_ref(const struct dpif_elephant *de_) +{ + struct dpif_elephant *de = CONST_CAST(struct dpif_elephant *, de_); + if (de) { + ovs_refcount_ref(&de->ref_cnt); + } + return de; +} + +void +dpif_elephant_unref(struct dpif_elephant *de) +{ + if (de && ovs_refcount_unref(&de->ref_cnt) == 1) { + /* xxx Do we need to blow away the kernel flows? */ + free(de); + } +} + +void +dpif_elephant_set_options(struct dpif_elephant *elephant, uint64_t mech, + uint64_t arg1, uint64_t arg2, int dscp) +{ + /* xxx Do we need to blow away the kernel flows? */ + elephant->mech = mech; + elephant->arg1 = arg1; + elephant->arg2 = arg2; + elephant->dscp = dscp; +} diff --git a/ofproto/ofproto-dpif-elephant.h b/ofproto/ofproto-dpif-elephant.h new file mode 100644 index 000000000..6b7c734e4 --- /dev/null +++ b/ofproto/ofproto-dpif-elephant.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OFPROTO_ELEPHANT_H +#define OFPROTO_ELEPHANT_H 1 + +#include <stdint.h> +#include "ovs-atomic.h" + +/* xxx Do we want to define this here or in the C file? */ +struct dpif_elephant { + uint32_t mech; + uint32_t arg1; + uint32_t arg2; + int dscp; + + struct ovs_refcount ref_cnt; +}; + +struct dpif_elephant *dpif_elephant_create(void); +struct dpif_elephant *dpif_elephant_ref(const struct dpif_elephant *); +void dpif_elephant_unref(struct dpif_elephant *); + +void dpif_elephant_set_options(struct dpif_elephant *, uint64_t mech, + uint64_t arg1, uint64_t arg2, int dscp); + +#endif /* elephant.h */ diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 4aedb5914..129cf378d 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -41,6 +41,7 @@ #include "nx-match.h" #include "odp-execute.h" #include "ofp-actions.h" +#include "ofproto/ofproto-dpif-elephant.h" #include "ofproto/ofproto-dpif-ipfix.h" #include "ofproto/ofproto-dpif-mirror.h" #include "ofproto/ofproto-dpif-monitor.h" @@ -86,6 +87,7 @@ struct xbridge { struct dpif_ipfix *ipfix; /* Ipfix handle, or null. */ struct netflow *netflow; /* Netflow handle, or null. */ struct stp *stp; /* STP or null if disabled. */ + struct dpif_elephant *elephant; /* Elephant flow detection, or null. */ /* Special rules installed by ofproto-dpif. */ struct rule_dpif *miss_rule; @@ -360,6 +362,7 @@ static void xlate_xbridge_set(struct xbridge *xbridge, const struct mac_learning *ml, struct stp *stp, const struct mcast_snooping *ms, const struct mbridge *mbridge, + const struct dpif_elephant *elephant, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, const struct netflow *netflow, @@ -425,6 +428,7 @@ xlate_xbridge_set(struct xbridge *xbridge, const struct mac_learning *ml, struct stp *stp, const struct mcast_snooping *ms, const struct mbridge *mbridge, + const struct dpif_elephant *elephant, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, const struct netflow *netflow, enum ofp_config_flags frag, @@ -468,6 +472,11 @@ xlate_xbridge_set(struct xbridge *xbridge, xbridge->netflow = netflow_ref(netflow); } + if (xbridge->elephant != elephant) { + dpif_elephant_unref(xbridge->elephant); + xbridge->elephant = dpif_elephant_ref(elephant); + } + xbridge->dpif = dpif; xbridge->forward_bpdu = forward_bpdu; xbridge->has_in_band = has_in_band; @@ -548,10 +557,11 @@ xlate_xbridge_copy(struct xbridge *xbridge) xlate_xbridge_set(new_xbridge, xbridge->dpif, xbridge->miss_rule, xbridge->no_packet_in_rule, xbridge->ml, xbridge->stp, - xbridge->ms, xbridge->mbridge, xbridge->sflow, - xbridge->ipfix, xbridge->netflow, xbridge->frag, - xbridge->forward_bpdu, xbridge->has_in_band, - xbridge->enable_recirc, xbridge->variable_length_userdata, + xbridge->ms, xbridge->mbridge, xbridge->elephant, + xbridge->sflow, xbridge->ipfix, + xbridge->netflow, xbridge->frag, xbridge->forward_bpdu, + xbridge->has_in_band, xbridge->enable_recirc, + xbridge->variable_length_userdata, xbridge->max_mpls_depth); LIST_FOR_EACH (xbundle, list_node, &xbridge->xbundles) { xlate_xbundle_copy(new_xbridge, xbundle); @@ -700,6 +710,7 @@ xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name, const struct mac_learning *ml, struct stp *stp, const struct mcast_snooping *ms, const struct mbridge *mbridge, + const struct dpif_elephant *elephant, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, const struct netflow *netflow, enum ofp_config_flags frag, @@ -724,9 +735,9 @@ xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name, xbridge->name = xstrdup(name); xlate_xbridge_set(xbridge, dpif, miss_rule, no_packet_in_rule, ml, stp, - ms, mbridge, sflow, ipfix, netflow, frag, forward_bpdu, - has_in_band, enable_recirc, variable_length_userdata, - max_mpls_depth); + ms, mbridge, elephant, sflow, ipfix, netflow, frag, + forward_bpdu, has_in_band, enable_recirc, + variable_length_userdata, max_mpls_depth); } static void @@ -754,6 +765,7 @@ xlate_xbridge_remove(struct xlate_cfg *xcfg, struct xbridge *xbridge) dpif_sflow_unref(xbridge->sflow); dpif_ipfix_unref(xbridge->ipfix); stp_unref(xbridge->stp); + dpif_elephant_unref(xbridge->elephant); hmap_destroy(&xbridge->xports); free(xbridge->name); free(xbridge); @@ -2312,6 +2324,48 @@ fix_sflow_action(struct xlate_ctx *ctx) ctx->sflow_odp_port, ctx->sflow_n_outputs, cookie); } +static void +add_elephant_action(struct xlate_ctx *ctx) +{ + const struct xbridge *xbridge = ctx->xbridge; + struct ofpbuf *odp_actions = &ctx->xout->odp_actions; + size_t elephant_offset, actions_offset; +#if 0 + int cookie_offset; + uint32_t pid; +#endif + + if (!xbridge->elephant) { + return; + } + + elephant_offset = nl_msg_start_nested(odp_actions, + OVS_ACTION_ATTR_ELEPHANT); + + nl_msg_put_u32(odp_actions, OVS_ELEPHANT_ATTR_DETECT_MECH, + xbridge->elephant->mech); + nl_msg_put_u32(odp_actions, OVS_ELEPHANT_ATTR_DETECT_ARG1, + xbridge->elephant->arg1); + nl_msg_put_u32(odp_actions, OVS_ELEPHANT_ATTR_DETECT_ARG2, + xbridge->elephant->arg2); + nl_msg_put_u8(odp_actions, OVS_ELEPHANT_ATTR_DETECT_DSCP, + xbridge->elephant->dscp); + + actions_offset = nl_msg_start_nested(odp_actions, + OVS_ELEPHANT_ATTR_ACTIONS); + +#if 0 + odp_port = ofp_port_to_odp_port(xbridge, flow->in_port.ofp_port); + pid = dpif_port_get_pid(xbridge->dpif, odp_port, + flow_hash_5tuple(flow, 0)); + cookie_offset = odp_put_userspace_action(pid, cookie, cookie_size, + odp_actions); +#endif + + nl_msg_end_nested(odp_actions, actions_offset); + nl_msg_end_nested(odp_actions, elephant_offset); +} + static enum slow_path_reason process_special(struct xlate_ctx *ctx, const struct flow *flow, const struct xport *xport, const struct ofpbuf *packet) @@ -4084,6 +4138,8 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) add_ipfix_action(&ctx); sample_actions_len = ofpbuf_size(&ctx.xout->odp_actions); + add_elephant_action(&ctx); + if (tnl_may_send && (!in_port || may_receive(in_port, &ctx))) { do_xlate_actions(ofpacts, ofpacts_len, &ctx); diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index 4bdf2d3e8..ffb6018d9 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -28,6 +28,7 @@ struct bfd; struct bond; struct dpif; struct lacp; +struct dpif_elephant; struct dpif_ipfix; struct dpif_sflow; struct mac_learning; @@ -142,8 +143,9 @@ void xlate_ofproto_set(struct ofproto_dpif *, const char *name, struct rule_dpif *no_packet_in_rule, const struct mac_learning *, struct stp *, const struct mcast_snooping *, - const struct mbridge *, const struct dpif_sflow *, - const struct dpif_ipfix *, const struct netflow *, + const struct mbridge *, const struct dpif_elephant *, + const struct dpif_sflow *, const struct dpif_ipfix *, + const struct netflow *, enum ofp_config_flags, bool forward_bpdu, bool has_in_band, bool enable_recirc, bool variable_length_userdata, diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 980b04f17..d27607df9 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -51,6 +51,7 @@ #include "ofp-actions.h" #include "ofp-parse.h" #include "ofp-print.h" +#include "ofproto-dpif-elephant.h" #include "ofproto-dpif-ipfix.h" #include "ofproto-dpif-mirror.h" #include "ofproto-dpif-monitor.h" @@ -293,6 +294,7 @@ struct ofproto_dpif { bool has_bonded_bundles; bool lacp_enabled; struct mbridge *mbridge; + struct dpif_elephant *elephant; struct ovs_mutex stats_mutex; struct netdev_stats stats OVS_GUARDED; /* To account packets generated and @@ -596,6 +598,7 @@ type_run(const char *type) ofproto->backer->dpif, ofproto->miss_rule, ofproto->no_packet_in_rule, ofproto->ml, ofproto->stp, ofproto->ms, ofproto->mbridge, + ofproto->elephant, ofproto->sflow, ofproto->ipfix, ofproto->netflow, ofproto->up.frag_handling, ofproto->up.forward_bpdu, @@ -1133,6 +1136,7 @@ construct(struct ofproto *ofproto_) ofproto->sflow = NULL; ofproto->ipfix = NULL; ofproto->stp = NULL; + ofproto->elephant = NULL; ofproto->dump_seq = 0; hmap_init(&ofproto->bundles); ofproto->ml = mac_learning_create(MAC_ENTRY_DEFAULT_IDLE_TIME); @@ -3807,6 +3811,41 @@ packet_out(struct ofproto *ofproto_, struct ofpbuf *packet, ofpacts_len, packet); return 0; } + +/* Elephants. */ + +static int +set_elephant(struct ofproto *ofproto_, uint64_t mech, uint64_t arg1, + uint64_t arg2, int dscp) +{ + struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); + struct dpif_elephant *de = ofproto->elephant; + + if (mech && !de) { + de = ofproto->elephant = dpif_elephant_create(); + } + + if (de) { + dpif_elephant_set_options(de, mech, arg1, arg2, dscp); + + if (!mech) { + dpif_elephant_unref(de); + ofproto->elephant = NULL; + } + } + + return 0; +} + +static int +get_elephants(struct ofproto *ofproto_ OVS_UNUSED, struct smap *elephants) +{ + smap_init(elephants); + + /* xxx Figure out how to do this. */ + + return 0; +} /* NetFlow. */ @@ -5100,6 +5139,8 @@ const struct ofproto_class ofproto_dpif_class = { rule_modify_actions, set_frag_handling, packet_out, + set_elephant, + get_elephants, set_netflow, get_netflow_ids, set_sflow, diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index 2f150b5e1..3a0c11d51 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -157,6 +157,8 @@ struct rule_dpif *ofproto_dpif_refresh_rule(struct rule_dpif *); struct ofport_dpif *odp_port_to_ofport(const struct dpif_backer *, odp_port_t); +int ofproto_get_elephant_dscp(struct ofproto_dpif *); + /* * Recirculation * ============= diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 7e6e99bcf..f2ff2c5aa 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -1301,6 +1301,20 @@ struct ofproto_class { const struct ofpact *ofpacts, size_t ofpacts_len); + /* xxx Update this explanation. */ + + /* Configure detecting and handling of elephant flows. A flow is + * considered an elephant when it reaches 'bytes' for 'msecs' + * milliseconds. Once a flow is determined to be an elephant, it is + * never considered a mouse again. If 'dscp' is not -1, then that + * value will be used for the DSCP of packets placed on the wire. */ + int (*set_elephant)(struct ofproto *ofproto, uint64_t mech, + uint64_t arg1, uint64_t arg2, int dscp); + + /* Populates "elephants" with the key a string representation of the + * flow and the value with the byte count. */ + int (*get_elephants)(struct ofproto *ofproto, struct smap *elephants); + /* ## ------------------------- ## */ /* ## OFPP_NORMAL configuration ## */ /* ## ------------------------- ## */ diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index fca7d09e7..072da32fd 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -780,6 +780,25 @@ ofproto_set_snoops(struct ofproto *ofproto, const struct sset *snoops) return connmgr_set_snoops(ofproto->connmgr, snoops); } +/* xxx Move to better location and explain. */ +int +ofproto_set_elephant(struct ofproto *ofproto, uint64_t mech, + uint64_t arg1, uint64_t arg2, int dscp) +{ + return (ofproto->ofproto_class->set_elephant + ? ofproto->ofproto_class->set_elephant(ofproto, mech, arg1, + arg2, dscp) + : EOPNOTSUPP); +} + +int +ofproto_get_elephants(struct ofproto *ofproto, struct smap *elephants) +{ + return (ofproto->ofproto_class->get_elephants + ? ofproto->ofproto_class->get_elephants(ofproto, elephants) + : EOPNOTSUPP); +} + int ofproto_set_netflow(struct ofproto *ofproto, const struct netflow_options *nf_options) diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index c71662ebf..39c5ce653 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -256,6 +256,9 @@ int ofproto_port_set_mcast_snooping(struct ofproto *ofproto, void *aux, void ofproto_set_threads(int n_handlers, int n_revalidators); void ofproto_set_dp_desc(struct ofproto *, const char *dp_desc); int ofproto_set_snoops(struct ofproto *, const struct sset *snoops); +int ofproto_set_elephant(struct ofproto *, uint64_t mech, uint64_t arg1, + uint64_t arg2, int dscp); +int ofproto_get_elephants(struct ofproto *, struct smap *elephants); int ofproto_set_netflow(struct ofproto *, const struct netflow_options *nf_options); int ofproto_set_sflow(struct ofproto *, const struct ofproto_sflow_options *); diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 6dcc2b878..f85f9b740 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -228,6 +228,7 @@ static void bridge_configure_mcast_snooping(struct bridge *); static void bridge_configure_sflow(struct bridge *, int *sflow_bridge_number); static void bridge_configure_ipfix(struct bridge *); static void bridge_configure_stp(struct bridge *); +static void bridge_configure_elephant(struct bridge *); static void bridge_configure_tables(struct bridge *); static void bridge_configure_dp_desc(struct bridge *); static void bridge_configure_remotes(struct bridge *, @@ -266,6 +267,8 @@ static void mirror_destroy(struct mirror *); static bool mirror_configure(struct mirror *); static void mirror_refresh_stats(struct mirror *); +static void elephants_refresh_stats(struct bridge *); + static void iface_configure_lacp(struct iface *, struct lacp_slave_settings *); static bool iface_create(struct bridge *, const struct ovsrec_interface *, const struct ovsrec_port *); @@ -378,6 +381,7 @@ bridge_init(const char *remote) ovsdb_idl_omit_alert(idl, &ovsrec_bridge_col_datapath_id); ovsdb_idl_omit_alert(idl, &ovsrec_bridge_col_status); + ovsdb_idl_omit_alert(idl, &ovsrec_bridge_col_elephant_flows); ovsdb_idl_omit(idl, &ovsrec_bridge_col_external_ids); ovsdb_idl_omit_alert(idl, &ovsrec_port_col_status); @@ -622,6 +626,7 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) bridge_configure_sflow(br, &sflow_bridge_number); bridge_configure_ipfix(br); bridge_configure_stp(br); + bridge_configure_elephant(br); bridge_configure_tables(br); bridge_configure_dp_desc(br); } @@ -1383,6 +1388,29 @@ bridge_configure_stp(struct bridge *br) } } +/* Set elephant flow detection configuration on 'br'. */ +static void +bridge_configure_elephant(struct bridge *br) +{ + const char *config_str; + uint64_t mech, arg1, arg2; + int dscp; + + config_str = smap_get(&br->cfg->other_config, "elephant-mech"); + mech = config_str ? strtoul(config_str, NULL, 0) : 0; + + config_str = smap_get(&br->cfg->other_config, "elephant-arg1"); + arg1 = config_str ? strtoul(config_str, NULL, 0) : 0; + + config_str = smap_get(&br->cfg->other_config, "elephant-arg2"); + arg2 = config_str ? strtoul(config_str, NULL, 0) : 0; + + config_str = smap_get(&br->cfg->other_config, "elephant-dscp"); + dscp = config_str ? strtoul(config_str, NULL, 0) : -1; + + ofproto_set_elephant(br->ofproto, mech, arg1, arg2, dscp); +} + static bool bridge_has_bond_fake_iface(const struct bridge *br, const char *name) { @@ -2459,6 +2487,8 @@ bridge_run(void) mirror_refresh_stats(m); } + /* xxx Find better place for this, but useful timer. */ + elephants_refresh_stats(br); } refresh_controller_status(); ovsdb_idl_txn_commit(txn); @@ -4337,3 +4367,15 @@ mirror_refresh_stats(struct mirror *m) ovsrec_mirror_set_statistics(m->cfg, keys, values, stat_cnt); } + +static void +elephants_refresh_stats(struct bridge *br) +{ + struct smap elephants; + + if (!ofproto_get_elephants(br->ofproto, &elephants)) { + ovsrec_bridge_set_elephant_flows(br->cfg, &elephants); + } + + smap_destroy(&elephants); +} diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index bc9ea73c6..717e61506 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "7.8.0", - "cksum": "2676751133 20740", + "version": "7.78.0", + "cksum": "904547257 20882", "tables": { "Open_vSwitch": { "columns": { @@ -93,6 +93,9 @@ "type": {"key": {"type": "string", "enum": ["set", ["standalone", "secure"]]}, "min": 0, "max": 1}}, + "elephant_flows": { + "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}, + "ephemeral": true}, "status": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}, "ephemeral": true}, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index d47fc1a6e..18cf7db35 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -480,6 +480,75 @@ </column> </group> + <group title="Elephant Flow Detection"> + <p> + The conventional wisdom is that majority of flows in the + datacenter are short (mice), yet the majority of packets belong to + a few long-lived flows (elephants). Mice are often associated + with bursty, latency sensitive apps whereas elephants tend to be + large transfers in which throughput is far more important than + latency. + </p> + + <p> + Open vSwitch has the ability to distinguish elephant flows and + identify them through the database or mark them. Elephants are + identified by the combination of their duration and number of + bytes. + </p> + + <column name="other_config" key="elephant-mech"> + The mechanism used to detect elephants. It may be set to one of + the following: + <dl> + <dt><code>0</code></dt> + <dd>Disable elephant detection.</dd> + + <dt><code>1</code></dt> + <dd>Bytes. A flow is declared an elephant based on a + threshold of the total number of bytes sent. <ref + column="other_config" key="elephant-arg1"/> specifies the + number of bytes. <ref column="other_config" + key="elephant-arg2"/> specifies the number of seconds that the + flow must exist before being declared an elephant; set to + <code>0</code> for immediate detection when the byte threshold + is crossed.</dd> + + <dt><code>2</code></dt> + <dd>TSO. A flow is declared an elephant based on the segment + size. <ref column="other_config" key="elephant-arg1"/> + specifies the size of the segment. <ref column="other_config" + key="elephant-arg2"/> specifies the required number of + segments of the configured size that must be seen. + </dd> + </dl> + </column> + + <column name="other_config" key="elephant-arg1"> + First argument to elephant mechanism. The arguments are + described in <ref column="other_config" key="elephant-mech"/>. + </column> + + <column name="other_config" key="elephant-arg2"> + Second argument to elephant mechanism. The arguments are + described in <ref column="other_config" key="elephant-mech"/>. + </column> + + <column name="other_config" key="elephant-dscp"> + When an elephant flow is detected sets the DSCP value to use on + the wire for packets matching that flow. Not setting the key or + using a value of -1 will leave elephant flows unmarked. + </column> + + <column name="elephant_flows"> + <code>xxx Not currently supported xxx</code> + The <code>elephants</code> column contains key-value pairs that + report elephant flows. The key is a description of the flow, + and the value is size of the flow in bytes. These are updated + periodically (currently, every 5 seconds). + </column> + </group> + <group title="OpenFlow Configuration"> <column name="controller"> <p> |