MEDIUM: listener: add the "shards" bind keyword20211012-bind-shards-1

In multi-threaded mode, on operating systems supporting multiple listeners on the same IP:port, this will automatically create this number of multiple identical listeners for the same line, all bound to a fair share of the number of the threads attached to this listener. This can sometimes be useful when using very large thread counts where the in-kernel locking on a single socket starts to cause a significant overhead. In this case the incoming traffic is distributed over multiple sockets and the contention is reduced. Note that doing this can easily increase the CPU usage by making more threads work a little bit. If the number of shards is higher than the number of available threads, it will automatically be trimmed to the number of threads. A special value "by-thread" will automatically assign one shard per thread.
author: Willy Tarreau <w@1wt.eu> 2021-10-12 15:23:03 +0200
committer: Willy Tarreau <w@1wt.eu> 2021-10-12 15:37:48 +0200
commit: 55cc2ad7f18b781e6cd89af69f6aff05b46aec36 (patch)
tree: 26d08d7e26223c8d378468407e20539c265302ad
parent: 08dc93715a6256e0e2ffbc45c3481f6570d577e1 (diff)
download: haproxy-20211012-bind-shards-1.tar.gz
4 files changed, 93 insertions, 2 deletions
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 257205175..11bbc6d3c 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -14079,6 +14079,24 @@ proto <name>
   instance, it is possible to force the http/2 on clear TCP by specifying "proto
   h2" on the bind line.
 
+shards <number> | by-thread
+  In multi-threaded mode, on operating systems supporting multiple listeners on
+  the same IP:port, this will automatically create this number of multiple
+  identical listeners for the same line, all bound to a fair share of the number
+  of the threads attached to this listener. This can sometimes be useful when
+  using very large thread counts where the in-kernel locking on a single socket
+  starts to cause a significant overhead. In this case the incoming traffic is
+  distributed over multiple sockets and the contention is reduced. Note that
+  doing this can easily increase the CPU usage by making more threads work a
+  little bit.
+
+  If the number of shards is higher than the number of available threads, it
+  will automatically be trimmed to the number of threads (i.e. one shard per
+  thread). The special "by-thread" value also creates as many shards as there
+  are threads on the "bind" line. Since the system will evenly distribute the
+  incoming traffic between all these shards, it is important that this number
+  is an integral divisor of the number of threads.
+
 ssl
   This setting is only available when support for OpenSSL was built in. It
   enables SSL deciphering on connections instantiated from this listener. A
diff --git a/include/haproxy/receiver-t.h b/include/haproxy/receiver-t.h
index 481ac6443..3e1ff81db 100644
--- a/include/haproxy/receiver-t.h
+++ b/include/haproxy/receiver-t.h
@@ -49,6 +49,7 @@ struct rx_settings {
 	char *interface;                  /* interface name or NULL */
 	const struct netns_entry *netns;  /* network namespace of the listener*/
 	unsigned int options;             /* receiver options (RX_O_*) */
+	uint shards;                      /* number of shards */
 };
 
 /* This describes a receiver with all its characteristics (address, options, etc) */
diff --git a/src/cfgparse.c b/src/cfgparse.c
index ada344d17..173dcdb32 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -2566,8 +2566,54 @@ int check_config_validity()
 
 			/* apply thread masks and groups to all receivers */
 			list_for_each_entry(li, &bind_conf->listeners, by_bind) {
-				li->rx.bind_thread = bind_conf->bind_thread;
-				li->rx.bind_tgroup = bind_conf->bind_tgroup;
+				if (bind_conf->settings.shards <= 1) {
+					li->rx.bind_thread = bind_conf->bind_thread;
+					li->rx.bind_tgroup = bind_conf->bind_tgroup;
+				} else {
+					struct listener *new_li;
+					int shard, shards, todo, done, bit;
+					ulong mask;
+
+					shards = bind_conf->settings.shards;
+					todo = my_popcountl(bind_conf->bind_thread);
+
+					/* no more shards than total threads */
+					if (shards > todo)
+						shards = todo;
+
+					shard = done = bit = 0;
+					new_li = li;
+
+					while (1) {
+						mask = 0;
+						while (done < todo) {
+							/* enlarge mask to cover next bit of bind_thread */
+							while (!(bind_conf->bind_thread & (1UL << bit)))
+								bit++;
+							mask |= (1UL << bit);
+							bit++;
+							done += shards;
+						}
+
+						new_li->rx.bind_thread = bind_conf->bind_thread & mask;
+						new_li->rx.bind_tgroup = bind_conf->bind_tgroup;
+						done -= todo;
+
+						shard++;
+						if (shard >= shards)
+							break;
+
+						/* create another listener for new shards */
+						new_li = clone_listener(li);
+						if (!new_li) {
+							ha_alert("Out of memory while trying to allocate extra listener for shard %d in %s %s\n",
+								 shard, proxy_type_str(curproxy), curproxy->id);
+							cfgerr++;
+							err_code |= ERR_FATAL | ERR_ALERT;
+							goto out;
+						}
+					}
+				}
 			}
 		}
 
diff --git a/src/listener.c b/src/listener.c
index 044d22390..f16ba2d0a 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -1360,6 +1360,7 @@ struct bind_conf *bind_conf_alloc(struct proxy *fe, const char *file,
 	bind_conf->settings.ux.uid = -1;
 	bind_conf->settings.ux.gid = -1;
 	bind_conf->settings.ux.mode = 0;
+	bind_conf->settings.shards = 1;
 	bind_conf->xprt = xprt;
 	bind_conf->frontend = fe;
 	bind_conf->severity_output = CLI_SEVERITY_NONE;
@@ -1639,6 +1640,30 @@ static int bind_parse_proto(char **args, int cur_arg, struct proxy *px, struct b
 	return 0;
 }
 
+/* parse the "shards" bind keyword. Takes an integer or "by-thread" */
+static int bind_parse_shards(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	int val;
+
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing value", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	if (strcmp(args[cur_arg + 1], "by-thread") == 0) {
+		val = MAX_THREADS; /* will be trimmed later anyway */
+	} else {
+		val = atol(args[cur_arg + 1]);
+		if (val < 1 || val > MAX_THREADS) {
+			memprintf(err, "'%s' : invalid value %d, allowed range is %d..%d or 'by-thread'", args[cur_arg], val, 1, MAX_THREADS);
+			return ERR_ALERT | ERR_FATAL;
+		}
+	}
+
+	conf->settings.shards = val;
+	return 0;
+}
+
 /* parse the "thread" bind keyword */
 static int bind_parse_thread(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
 {
@@ -1734,6 +1759,7 @@ static struct bind_kw_list bind_kws = { "ALL", { }, {
 	{ "nice",         bind_parse_nice,         1 }, /* set nice of listening socket */
 	{ "process",      bind_parse_process,      1 }, /* set list of allowed process for this socket */
 	{ "proto",        bind_parse_proto,        1 }, /* set the proto to use for all incoming connections */
+	{ "shards",       bind_parse_shards,       1 }, /* set number of shards */
 	{ "thread",       bind_parse_thread,       1 }, /* set list of allowed threads for this socket */
 	{ /* END */ },
 }};
author	Willy Tarreau <w@1wt.eu>	2021-10-12 15:23:03 +0200
committer	Willy Tarreau <w@1wt.eu>	2021-10-12 15:37:48 +0200
commit	55cc2ad7f18b781e6cd89af69f6aff05b46aec36 (patch)
tree	26d08d7e26223c8d378468407e20539c265302ad
parent	08dc93715a6256e0e2ffbc45c3481f6570d577e1 (diff)
download	haproxy-20211012-bind-shards-1.tar.gz