Diffstat (limited to 'src/n-acd.c')
-rw-r--r--  src/n-acd.c | 1213
1 file changed, 1213 insertions, 0 deletions
diff --git a/src/n-acd.c b/src/n-acd.c
new file mode 100644
index 0000000000..4f8023e896
--- /dev/null
+++ b/src/n-acd.c
@@ -0,0 +1,1213 @@
+/*
+ * IPv4 Address Conflict Detection
+ *
+ * This implements the main n-acd API. It is built around an epoll-fd to
+ * encapsulate a timerfd+socket. The n-acd context has quite straightforward
+ * lifetime rules. The parameters must be set when the engine is started, and
+ * they can only be changed by stopping and restarting the engine. The engine
+ * is started on demand and stopped when no longer needed.
+ * During the entire lifetime the context can be dispatched. That is, the
+ * dispatcher does not have to be aware of the context state. After each call
+ * to dispatch(), the caller must pop all pending events until N_ACD_E_DONE
+ * (or N_ACD_E_STOPPED) is returned.
+ *
+ * If a conflict is detected, the ACD engine reports to the caller and stops
+ * the engine. The caller can now modify parameters and restart the engine, if
+ * required.
+ */
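+
+/*
+ * A minimal sketch of the intended calling sequence, assuming the public API
+ * declared in n-acd.h (illustration only; error handling elided):
+ *
+ *     NAcd *acd;
+ *     NAcdEvent *event;
+ *     int r;
+ *
+ *     n_acd_new(&acd);
+ *     n_acd_start(acd, &config);      // config filled in by the caller
+ *
+ *     for (;;) {
+ *             // wait until the fd from n_acd_get_fd() polls readable, then:
+ *             do {
+ *                     r = n_acd_dispatch(acd);
+ *             } while (r == N_ACD_E_PREEMPTED);
+ *
+ *             while (!n_acd_pop_event(acd, &event)) {
+ *                     // handle event->event (READY, USED, DEFENDED, ...)
+ *             }
+ *     }
+ *
+ *     n_acd_free(acd);
+ */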
+
+#include <assert.h>
+#include <c-list.h>
+#include <c-siphash.h>
+#include <endian.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/timerfd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include "n-acd.h"
+
+#define _public_ __attribute__((__visibility__("default")))
+
+/*
+ * These are the parameters and timing intervals specified in RFC 5227. The original
+ * values are:
+ *
+ * PROBE_NUM 3
+ * PROBE_WAIT 1s
+ * PROBE_MIN 1s
+ * PROBE_MAX 3s
+ * ANNOUNCE_NUM 3
+ * ANNOUNCE_WAIT 2s
+ * ANNOUNCE_INTERVAL 2s
+ * MAX_CONFLICTS 10
+ * RATE_LIMIT_INTERVAL 60s
+ * DEFEND_INTERVAL 10s
+ *
+ * If we assume a best-case and worst-case scenario for non-conflicted runs, we
+ * end up with a runtime between 4s and 9s to finish the probe. Then it still
+ * takes a fixed 4s to finish the announcements.
+ *
+ * RFC 5227 section 1.1:
+ * [...] (Note that the values listed here are fixed constants; they are
+ * not intended to be modifiable by implementers, operators, or end users.
+ * These constants are given symbolic names here to facilitate the writing
+ * of future standards that may want to reference this document with
+ * different values for these named constants; however, at the present time
+ * no such future standards exist.) [...]
+ *
+ * Unfortunately, no-one ever stepped up to write a "future standard" to revise
+ * the timings. A 9s timeout for successful link setups is not acceptable today.
+ * Hence, we will just go forward and ignore the proposed values. On both
+ * wired and wireless local links round-trip latencies of below 3ms are common,
+ * while latencies above 10ms are rarely seen. We require the caller to set a
+ * timeout multiplier, where a multiplier of 1 corresponds to a total probe
+ * time between 0.5 ms and 1.0 ms. On modern networks a multiplier of about
+ * 100 should be a reasonable default. To comply with the RFC, select a
+ * multiplier of 9000.
+ */
+#define N_ACD_RFC_PROBE_NUM (3)
+#define N_ACD_RFC_PROBE_WAIT_USEC (UINT64_C(111)) /* 111us */
+#define N_ACD_RFC_PROBE_MIN_USEC (UINT64_C(111)) /* 111us */
+#define N_ACD_RFC_PROBE_MAX_USEC (UINT64_C(333)) /* 333us */
+#define N_ACD_RFC_ANNOUNCE_NUM (3)
+#define N_ACD_RFC_ANNOUNCE_WAIT_USEC (UINT64_C(222)) /* 222us */
+#define N_ACD_RFC_ANNOUNCE_INTERVAL_USEC (UINT64_C(222)) /* 222us */
+#define N_ACD_RFC_MAX_CONFLICTS (10)
+#define N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC (UINT64_C(60000000)) /* 60s */
+#define N_ACD_RFC_DEFEND_INTERVAL_USEC (UINT64_C(10000000)) /* 10s */
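+
+/*
+ * Worked example for the multiplier arithmetic above: with a multiplier of
+ * 100 (i.e., timeout_msec = 100), PROBE_WAIT scales to 11.1ms, PROBE_MIN/MAX
+ * to 11.1ms/33.3ms, and ANNOUNCE_WAIT to 22.2ms. A probe run then takes
+ * between 0 + 11.1 + 11.1 + 22.2 = 44.4ms (best case) and
+ * 11.1 + 33.3 + 33.3 + 22.2 = 99.9ms (worst case), matching the 0.5-1.0ms
+ * per-unit range scaled by 100.
+ */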
+
+/*
+ * Fake ENETDOWN error-code. We use this as a replacement for the kernel error
+ * codes that signal a lost network device (ENETDOWN, ENXIO). It is explicitly
+ * chosen to be outside the known error-code range. Whenever we are deep down
+ * in a call-stack and notice an ENETDOWN error, we return this instead. It is
+ * caught by the top-level dispatcher and then properly handled.
+ * This avoids handling ENETDOWN gracefully deep in a call-stack, only for the
+ * callers to then continue with their work without noticing the soft failure.
+ */
+#define N_ACD_E_DOWN (INT_MAX)
+
+#define TIME_INFINITY ((uint64_t) -1)
+
+enum {
+ N_ACD_EPOLL_TIMER,
+ N_ACD_EPOLL_SOCKET,
+};
+
+enum {
+ N_ACD_STATE_INIT,
+ N_ACD_STATE_PROBING,
+ N_ACD_STATE_CONFIGURING,
+ N_ACD_STATE_ANNOUNCING,
+};
+
+typedef struct NAcdEventNode {
+ NAcdEvent event;
+ uint8_t sender[ETH_ALEN];
+ CList link;
+} NAcdEventNode;
+
+struct NAcd {
+ /* context */
+ unsigned int seed;
+ int fd_epoll;
+ int fd_timer;
+
+ /* configuration */
+ NAcdConfig config;
+ uint8_t mac[ETH_ALEN];
+ uint64_t timeout_multiplier;
+
+ /* runtime */
+ int fd_socket;
+ unsigned int state;
+ unsigned int n_iteration;
+ unsigned int n_conflicts;
+ unsigned int defend;
+ uint64_t last_defend;
+ uint64_t last_conflict;
+
+ /* pending events */
+ CList events;
+ NAcdEventNode *current;
+};
+
+static int n_acd_errno(void) {
+ /*
+ * Compilers continuously warn about uninitialized variables since they
+ * cannot deduce that `return -errno;` will always be negative. This
+ * small wrapper makes sure compilers figure that out. Use it as a
+ * replacement for raw `errno` read access. Yes, it generates worse code,
+ * but only marginally and only affects slow-paths.
+ */
+ return abs(errno) ? : EIO;
+}
+
+static int n_acd_event_node_new(NAcdEventNode **nodep, unsigned int event) {
+ NAcdEventNode *node;
+
+ node = calloc(1, sizeof(*node));
+ if (!node)
+ return -ENOMEM;
+
+ node->event.event = event;
+ node->link = (CList)C_LIST_INIT(node->link);
+
+ *nodep = node;
+
+ return 0;
+}
+
+static NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node) {
+ if (!node)
+ return NULL;
+
+ c_list_unlink(&node->link);
+ free(node);
+
+ return NULL;
+}
+
+static int n_acd_get_random(unsigned int *random) {
+ uint8_t hash_seed[] = { 0x3a, 0x0c, 0xa6, 0xdd, 0x44, 0xef, 0x5f, 0x7a, 0x5e, 0xd7, 0x25, 0x37, 0xbf, 0x4e, 0x80, 0xa1 };
+ CSipHash hash = C_SIPHASH_NULL;
+ struct timespec ts;
+ const uint8_t *p;
+ int r;
+
+ /*
+ * We need random jitter for all timeouts when handling ARP probes. Use
+ * AT_RANDOM to get a seed for rand_r(3p), if available (should always
+ * be available on Linux). See the time-out scheduler for details.
+ * Additionally, we include the current time in the seed. This avoids
+ * using the same jitter in case you run multiple ACD engines in the
+ * same process. Lastly, the seed is hashed with SipHash24 to avoid
+ * exposing the value of AT_RANDOM on the network.
+ */
+ c_siphash_init(&hash, hash_seed);
+
+ p = (const uint8_t *)getauxval(AT_RANDOM);
+ if (p)
+ c_siphash_append(&hash, p, 16);
+
+ r = clock_gettime(CLOCK_BOOTTIME, &ts);
+ if (r < 0)
+ return -n_acd_errno();
+
+ c_siphash_append(&hash, (const uint8_t *)&ts.tv_sec, sizeof(ts.tv_sec));
+ c_siphash_append(&hash, (const uint8_t *)&ts.tv_nsec, sizeof(ts.tv_nsec));
+
+ *random = c_siphash_finalize(&hash);
+ return 0;
+}
+
+/**
+ * n_acd_new() - create a new ACD context
+ * @acdp: output argument for context
+ *
+ * Create a new ACD context and return it in @acdp.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+_public_ int n_acd_new(NAcd **acdp) {
+ NAcd *acd;
+ int r;
+
+ acd = calloc(1, sizeof(*acd));
+ if (!acd)
+ return -ENOMEM;
+
+ acd->fd_epoll = -1;
+ acd->fd_timer = -1;
+ acd->fd_socket = -1;
+ acd->state = N_ACD_STATE_INIT;
+ acd->defend = N_ACD_DEFEND_NEVER;
+ acd->events = (CList)C_LIST_INIT(acd->events);
+ acd->last_conflict = TIME_INFINITY;
+
+ r = n_acd_get_random(&acd->seed);
+ if (r < 0)
+ goto error;
+
+ acd->fd_epoll = epoll_create1(EPOLL_CLOEXEC);
+ if (acd->fd_epoll < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ acd->fd_timer = timerfd_create(CLOCK_BOOTTIME, TFD_CLOEXEC | TFD_NONBLOCK);
+ if (acd->fd_timer < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->fd_timer,
+ &(struct epoll_event){
+ .events = EPOLLIN,
+ .data.u32 = N_ACD_EPOLL_TIMER,
+ });
+ if (r < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ *acdp = acd;
+ return 0;
+
+error:
+ n_acd_free(acd);
+ return r;
+}
+
+/**
+ * n_acd_free() - free an ACD context
+ * @acd: ACD context to free, or NULL
+ *
+ * Frees all resources held by the context. This may be called at any time,
+ * but doing so invalidates all data owned by the context.
+ *
+ * Return: NULL.
+ */
+_public_ NAcd *n_acd_free(NAcd *acd) {
+ NAcdEventNode *node;
+
+ if (!acd)
+ return NULL;
+
+ n_acd_stop(acd);
+
+ while ((node = c_list_first_entry(&acd->events, NAcdEventNode, link)))
+ n_acd_event_node_free(node);
+
+ assert(acd->fd_socket < 0);
+
+ if (acd->fd_timer >= 0) {
+ assert(acd->fd_epoll >= 0);
+ epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->fd_timer, NULL);
+ close(acd->fd_timer);
+ acd->fd_timer = -1;
+ }
+
+ if (acd->fd_epoll >= 0) {
+ close(acd->fd_epoll);
+ acd->fd_epoll = -1;
+ }
+
+ free(acd);
+
+ return NULL;
+}
+
+/**
+ * n_acd_get_fd() - get pollable file descriptor
+ * @acd: ACD context
+ * @fdp: output argument for file descriptor
+ *
+ * Returns a file descriptor in @fdp. This file descriptor can be polled by
+ * the caller to indicate when the ACD context can be dispatched.
+ */
+_public_ void n_acd_get_fd(NAcd *acd, int *fdp) {
+ *fdp = acd->fd_epoll;
+}
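+
+/*
+ * Example (sketch): the returned fd is the internal epoll-fd; wait for it to
+ * become readable before dispatching, e.g. via poll(2) from <poll.h>:
+ *
+ *     struct pollfd pfd = { .events = POLLIN };
+ *
+ *     n_acd_get_fd(acd, &pfd.fd);
+ *     if (poll(&pfd, 1, -1) > 0)
+ *             r = n_acd_dispatch(acd);
+ */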
+
+static int n_acd_push_event(NAcd *acd, unsigned int event, uint16_t *operation, uint8_t (*sender)[6], uint8_t (*target)[4]) {
+ NAcdEventNode *node;
+ int r;
+
+ r = n_acd_event_node_new(&node, event);
+ if (r < 0)
+ return r;
+
+ switch (event) {
+ case N_ACD_EVENT_USED:
+ node->event.used.operation = be16toh(*operation);
+ memcpy(node->sender, sender, sizeof(node->sender));
+ node->event.used.sender = node->sender;
+ node->event.used.n_sender = sizeof(node->sender);
+ memcpy(&node->event.used.target, target, sizeof(node->event.used.target));
+ break;
+ case N_ACD_EVENT_CONFLICT:
+ node->event.conflict.operation = be16toh(*operation);
+ memcpy(node->sender, sender, sizeof(node->sender));
+ node->event.conflict.sender = node->sender;
+ node->event.conflict.n_sender = sizeof(node->sender);
+ memcpy(&node->event.conflict.target, target, sizeof(node->event.conflict.target));
+ break;
+ case N_ACD_EVENT_DEFENDED:
+ node->event.defended.operation = be16toh(*operation);
+ memcpy(node->sender, sender, sizeof(node->sender));
+ node->event.defended.sender = node->sender;
+ node->event.defended.n_sender = sizeof(node->sender);
+ memcpy(&node->event.defended.target, target, sizeof(node->event.defended.target));
+ break;
+ case N_ACD_EVENT_READY:
+ case N_ACD_EVENT_DOWN:
+ break;
+ default:
+ assert(0);
+ }
+
+ c_list_link_tail(&acd->events, &node->link);
+
+ return 0;
+}
+
+static int n_acd_now(uint64_t *nowp) {
+ struct timespec ts;
+ int r;
+
+ r = clock_gettime(CLOCK_BOOTTIME, &ts);
+ if (r < 0)
+ return -n_acd_errno();
+
+ *nowp = ts.tv_sec * UINT64_C(1000000) + ts.tv_nsec / UINT64_C(1000);
+ return 0;
+}
+
+static int n_acd_schedule(NAcd *acd, uint64_t u_timeout, unsigned int u_jitter) {
+ uint64_t u_next = u_timeout;
+ int r;
+
+ /*
+ * ACD specifies jitter values to reduce packet storms on the local
+ * link. This call accepts the maximum relative jitter value in
+ * microseconds as @u_jitter. We then use rand_r(3p) to get a
+ * pseudo-random jitter on top of the real timeout given as @u_timeout.
+ * Note that rand_r() is fine for this. Before you try to improve the
+ * RNG, you better spend some time securing ARP.
+ */
+ if (u_jitter)
+ u_next += rand_r(&acd->seed) % u_jitter;
+
+ /*
+ * Setting .it_value to 0 in timerfd_settime() disarms the timer. Avoid
+ * this and always schedule at least 1us. Otherwise, we'd have to
+ * recursively call into the time-out handler, which we really want to
+ * avoid. No reason to optimize performance here.
+ */
+ if (!u_next)
+ u_next = 1;
+
+ r = timerfd_settime(acd->fd_timer, 0,
+ &(struct itimerspec){ .it_value = {
+ .tv_sec = u_next / UINT64_C(1000000),
+ .tv_nsec = u_next % UINT64_C(1000000) * UINT64_C(1000),
+ } }, NULL);
+ if (r < 0)
+ return -n_acd_errno();
+
+ return 0;
+}
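+
+/*
+ * For example, n_acd_schedule(acd, 1000, 500) arms the timer to fire after
+ * 1000us plus a pseudo-random jitter of 0-499us, i.e., anywhere in
+ * [1000us, 1500us).
+ */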
+
+static int n_acd_send(NAcd *acd, const struct in_addr *spa) {
+ struct sockaddr_ll address = {
+ .sll_family = AF_PACKET,
+ .sll_protocol = htobe16(ETH_P_ARP),
+ .sll_ifindex = acd->config.ifindex,
+ .sll_halen = ETH_ALEN,
+ .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ };
+ struct ether_arp arp = {
+ .ea_hdr.ar_hrd = htobe16(ARPHRD_ETHER),
+ .ea_hdr.ar_pro = htobe16(ETHERTYPE_IP),
+ .ea_hdr.ar_hln = sizeof(acd->mac),
+ .ea_hdr.ar_pln = sizeof(uint32_t),
+ .ea_hdr.ar_op = htobe16(ARPOP_REQUEST),
+ };
+ ssize_t l;
+
+ memcpy(arp.arp_sha, acd->mac, sizeof(acd->mac));
+ memcpy(arp.arp_tpa, &acd->config.ip.s_addr, sizeof(uint32_t));
+
+ if (spa)
+ memcpy(arp.arp_spa, &spa->s_addr, sizeof(spa->s_addr));
+
+ l = sendto(acd->fd_socket, &arp, sizeof(arp), MSG_NOSIGNAL, (struct sockaddr *)&address, sizeof(address));
+ if (l == (ssize_t)sizeof(arp)) {
+ /* Packet was properly sent. */
+ return 0;
+ } else if (l >= 0) {
+ /*
+ * Ugh. The packet was truncated. This should not happen, but
+ * let's just pretend the packet was dropped.
+ */
+ return 0;
+ } else if (errno == EAGAIN || errno == ENOBUFS) {
+ /*
+ * In case the output buffer is full, the packet is silently
+ * dropped. This is just as if the physical layer happened to
+ * drop the packet. We are not on a reliable medium, so no
+ * reason to pretend we are.
+ */
+ return 0;
+ } else if (errno == ENETDOWN || errno == ENXIO) {
+ /*
+ * We get ENETDOWN if the network-device goes down or is
+ * removed. ENXIO might happen on async send-operations if the
+ * network-device was unplugged and thus the kernel is no
+ * longer aware of it.
+ * In any case, we do not allow proceeding with this socket. We
+ * stop the engine and notify the user gracefully.
+ */
+ return -N_ACD_E_DOWN;
+ }
+
+ return -n_acd_errno();
+}
+
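+/*
+ * Record a conflict for the rate-limit logic: the counter saturates at
+ * MAX_CONFLICTS, and only once saturated do we remember the timestamp.
+ * n_acd_start() then delays the next probe until RATE_LIMIT_INTERVAL has
+ * passed since that timestamp, and n_acd_announce() resets the counter once
+ * that interval has elapsed.
+ */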
+static void n_acd_remember_conflict(NAcd *acd, uint64_t now) {
+ if (++acd->n_conflicts >= N_ACD_RFC_MAX_CONFLICTS) {
+ acd->n_conflicts = N_ACD_RFC_MAX_CONFLICTS;
+ acd->last_conflict = now;
+ }
+}
+
+static int n_acd_handle_timeout(NAcd *acd) {
+ int r;
+
+ switch (acd->state) {
+ case N_ACD_STATE_PROBING:
+ /*
+ * We are still PROBING. We send 3 probes with a random timeout
+ * scheduled between each. If, after a fixed timeout, we did
+ * not receive any conflict we consider the probing successful.
+ */
+ if (acd->n_iteration >= N_ACD_RFC_PROBE_NUM) {
+ /*
+ * All 3 probes succeeded and we waited enough to
+ * consider this address usable by now. Do not announce
+ * the address, yet. We must first give the caller a
+ * chance to configure the address (so they can answer
+ * ARP requests), before announcing it. But our
+ * callbacks are not necessarily synchronous (we want
+ * to allow IPC there), so just notify the caller and
+ * wait for further instructions, thus effectively
+ * increasing the probe-wait.
+ */
+ r = n_acd_push_event(acd, N_ACD_EVENT_READY, NULL, NULL, NULL);
+ if (r)
+ return r;
+
+ acd->state = N_ACD_STATE_CONFIGURING;
+ } else {
+ /*
+ * We have not sent all 3 probes, yet. A timer fired,
+ * so we are ready to send the next probe. If this is
+ * the third probe, schedule a timer for ANNOUNCE_WAIT
+ * to give other peers a chance to answer. If this is
+ * not the third probe, wait between PROBE_MIN and
+ * PROBE_MAX for the next probe.
+ */
+
+ r = n_acd_send(acd, NULL);
+ if (r < 0)
+ return r;
+
+ if (++acd->n_iteration >= N_ACD_RFC_PROBE_NUM)
+ r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_ANNOUNCE_WAIT_USEC, 0);
+ else
+ r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_PROBE_MIN_USEC,
+ acd->timeout_multiplier * (N_ACD_RFC_PROBE_MAX_USEC - N_ACD_RFC_PROBE_MIN_USEC));
+ if (r < 0)
+ return r;
+ }
+
+ break;
+
+ case N_ACD_STATE_ANNOUNCING:
+ /*
+ * We are ANNOUNCING, meaning the caller configured the address
+ * on the interface and is actively using it. We send 3
+ * announcements out, in a short interval, and then just
+ * perform passive conflict detection.
+ * Note that once all 3 announcements are sent, we no longer
+ * schedule a timer, so this part should not trigger, anymore.
+ */
+
+ r = n_acd_send(acd, &acd->config.ip);
+ if (r < 0)
+ return r;
+
+ if (++acd->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) {
+ r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_ANNOUNCE_INTERVAL_USEC, 0);
+ if (r < 0)
+ return r;
+ }
+
+ break;
+
+ case N_ACD_STATE_INIT:
+ case N_ACD_STATE_CONFIGURING:
+ default:
+ /*
+ * There are no timeouts in these states. If we trigger one,
+ * something is fishy. Let the caller deal with this.
+ */
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int n_acd_handle_packet(NAcd *acd, struct ether_arp *packet) {
+ bool hard_conflict;
+ uint64_t now;
+ int r;
+
+ /*
+ * Via BPF we discard any non-conflict packets. There are only 2 types
+ * that can pass: A conflict on the Sender Protocol Address, or a
+ * conflict on the Target Protocol Address.
+ *
+ * The former we call a hard-conflict. It implies that the sender uses
+ * the address already. We must always catch this and in some way react
+ * to it. Any kind, REQUEST or REPLY must be caught (though it is
+ * unlikely that we ever catch REPLIES since they tend to be unicasts).
+ *
+ * However, in case the Target Protocol Address matches, we just know
+ * that somebody is looking for the address. Hence, we must also check
+ * that the packet is an ARP-Probe (Sender Protocol Address is 0). If
+ * it is, it means someone else does ACD on our address. We call this a
+ * soft conflict.
+ */
+ if (!memcmp(packet->arp_spa, (uint8_t[4]){ }, sizeof(packet->arp_spa)) &&
+ !memcmp(packet->arp_tpa, &acd->config.ip.s_addr, sizeof(packet->arp_tpa)) &&
+ packet->ea_hdr.ar_op == htobe16(ARPOP_REQUEST)) {
+ hard_conflict = false;
+ } else if (!memcmp(packet->arp_spa, &acd->config.ip.s_addr, sizeof(packet->arp_spa))) {
+ hard_conflict = true;
+ } else {
+ /*
+ * Ignore anything that is specific enough to match the BPF
+ * filter, but is none of the conflicts described above.
+ */
+ return 0;
+ }
+
+ r = n_acd_now(&now);
+ if (r < 0)
+ return r;
+
+ switch (acd->state) {
+ case N_ACD_STATE_PROBING:
+ /*
+ * Regardless whether this is a hard or soft conflict, we must
+ * treat this as a probe failure. That is, notify the caller of
+ * the conflict and wait for further instructions. We do not
+ * react to this, until the caller tells us what to do. But we
+ * immediately stop the engine, since there is no point in
+ * continuing the probing.
+ */
+ n_acd_remember_conflict(acd, now);
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_USED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+
+ break;
+
+ case N_ACD_STATE_CONFIGURING:
+ /*
+ * We are waiting for the caller to configure the interface and
+ * start ANNOUNCING. In this state, we cannot defend the address
+ * as that would indicate that it is ready to be used, and we
+ * cannot signal CONFLICT or USED as the caller may already have
+ * started to use the address (and may have configured the engine
+ * to always defend it, which means they should be able to rely on
+ * never losing it after READY). Simply drop the event, and rely
+ * on the anticipated ANNOUNCE to trigger it again.
+ */
+
+ break;
+
+ case N_ACD_STATE_ANNOUNCING:
+ /*
+ * We were already instructed to announce the address, which
+ * means the address is configured and in use. Hence, the
+ * caller is responsible to serve regular ARP queries. Meaning,
+ * we can ignore any soft conflicts (other peers doing ACD).
+ *
+ * But if we see a hard-conflict, we either defend the address
+ * according to the caller's instructions, or we report the
+ * conflict and bail out.
+ */
+
+ if (!hard_conflict)
+ break;
+
+ if (acd->defend == N_ACD_DEFEND_NEVER) {
+ n_acd_remember_conflict(acd, now);
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_CONFLICT, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ } else {
+ if (now > acd->last_defend + N_ACD_RFC_DEFEND_INTERVAL_USEC) {
+ r = n_acd_send(acd, &acd->config.ip);
+ if (r < 0)
+ return r;
+
+ acd->last_defend = now;
+ r = n_acd_push_event(acd, N_ACD_EVENT_DEFENDED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ } else if (acd->defend == N_ACD_DEFEND_ONCE) {
+ n_acd_remember_conflict(acd, now);
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_CONFLICT, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ } else {
+ r = n_acd_push_event(acd, N_ACD_EVENT_DEFENDED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ }
+ }
+
+ break;
+
+ case N_ACD_STATE_INIT:
+ default:
+ /*
+ * The socket should not be dispatched in those states, since
+ * it is neither allocated nor added to epoll. Fail hard if we
+ * trigger this somehow.
+ */
+ return -EIO;
+ }
+
+ return 0;
+}
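+
+/*
+ * Summary of the defend policies handled above: with N_ACD_DEFEND_NEVER, a
+ * hard conflict while announcing stops the engine immediately. With
+ * N_ACD_DEFEND_ONCE, the engine defends the address, but a second conflict
+ * within DEFEND_INTERVAL stops it. With N_ACD_DEFEND_ALWAYS, the engine keeps
+ * defending, sending at most one defensive announcement per DEFEND_INTERVAL.
+ */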
+
+static int n_acd_dispatch_timer(NAcd *acd, struct epoll_event *event) {
+ uint64_t v;
+ int r;
+
+ if (event->events & (EPOLLHUP | EPOLLERR)) {
+ /*
+ * There is no way to handle either gracefully. If we ignored
+ * them, we would busy-loop, so let's rather forward the error
+ * to the caller.
+ */
+ return -EIO;
+ }
+
+ if (event->events & EPOLLIN) {
+ for (unsigned int i = 0; i < 128; ++i) {
+ r = read(acd->fd_timer, &v, sizeof(v));
+ if (r == sizeof(v)) {
+ /*
+ * We successfully read a timer-value. Handle it and
+ * return. We do NOT fall-through to EPOLLHUP handling,
+ * as we always must drain buffers first.
+ */
+ return n_acd_handle_timeout(acd);
+ } else if (r >= 0) {
+ /*
+ * Kernel guarantees 8-byte reads; fail hard if it
+ * suddenly starts doing weird shit. No clue what to do
+ * with those values, anyway.
+ */
+ return -EIO;
+ } else if (errno == EAGAIN) {
+ /*
+ * No more pending events.
+ */
+ return 0;
+ } else {
+ /*
+ * Something failed. We use CLOCK_BOOTTIME, so
+ * ECANCELED cannot happen. Hence, there is no error
+ * that we could gracefully handle. Fail hard and let
+ * the caller deal with it.
+ */
+ return -n_acd_errno();
+ }
+ }
+
+ return N_ACD_E_PREEMPTED;
+ }
+
+ return 0;
+}
+
+static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) {
+ struct ether_arp packet;
+ ssize_t l;
+
+ for (unsigned int i = 0; i < 128; ++i) {
+ /*
+ * Regardless whether EPOLLIN is set in @event->events, we always
+ * invoke recv(2). This is a safety-net for sockets, which always fetch
+ * queued errors on all syscalls. That means, if anything failed on the
+ * socket, we will be notified via recv(2). This simplifies the code
+ * and avoids magic EPOLLIN/ERR/HUP juggling.
+ *
+ * Note that we must use recv(2) over read(2), since the latter cannot
+ * deal with empty packets properly.
+ */
+ l = recv(acd->fd_socket, &packet, sizeof(packet), MSG_TRUNC);
+ if (l == (ssize_t)sizeof(packet)) {
+ /*
+ * We read a full ARP packet. We never fall-through to EPOLLHUP
+ * handling, as we always must drain buffers first.
+ */
+ return n_acd_handle_packet(acd, &packet);
+ } else if (l >= 0) {
+ /*
+ * The BPF filter discards wrong packets, so error out
+ * if something slips through for any reason. Don't silently
+ * ignore it, since we explicitly want to know if something
+ * went fishy.
+ */
+ return -EIO;
+ } else if (errno == ENETDOWN || errno == ENXIO) {
+ /*
+ * We get ENETDOWN if the network-device goes down or is
+ * removed. ENXIO might happen on async send-operations if the
+ * network-device was unplugged and thus the kernel is no
+ * longer aware of it.
+ * In any case, we do not allow proceeding with this socket. We
+ * stop the engine and notify the user gracefully.
+ */
+ return -N_ACD_E_DOWN;
+ } else if (errno == EAGAIN) {
+ /*
+ * We cannot read data from the socket (we got EAGAIN). As a safety net
+ * check for EPOLLHUP/ERR. Those cannot be disabled with epoll, so we
+ * must make sure to not busy-loop by ignoring them. Note that we know
+ * recv(2) on sockets to return an error if either of these epoll-flags
+ * is set. Hence, if we did not handle it above, we have no other way
+ * but treating those flags as fatal errors and returning them to the
+ * caller.
+ */
+ if (event->events & (EPOLLHUP | EPOLLERR))
+ return -EIO;
+
+ return 0;
+ } else {
+ /*
+ * Cannot dispatch the packet. This might be due to OOM, HUP,
+ * or something else. We cannot handle it gracefully so forward
+ * to the caller.
+ */
+ return -n_acd_errno();
+ }
+ }
+
+ return N_ACD_E_PREEMPTED;
+}
+
+/**
+ * n_acd_dispatch() - dispatch ACD context
+ * @acd: ACD context
+ *
+ * Return: 0 on successful dispatch of all pending events, N_ACD_E_PREEMPTED
+ * in case there are still more events to be dispatched, or a
+ * negative error code on failure.
+ */
+_public_ int n_acd_dispatch(NAcd *acd) {
+ struct epoll_event events[2];
+ int n, i, r = 0;
+ bool preempted = false;
+
+ n = epoll_wait(acd->fd_epoll, events, sizeof(events) / sizeof(*events), 0);
+ if (n < 0) {
+ return -n_acd_errno();
+ }
+
+ for (i = 0; i < n; ++i) {
+ switch (events[i].data.u32) {
+ case N_ACD_EPOLL_TIMER:
+ r = n_acd_dispatch_timer(acd, events + i);
+ break;
+ case N_ACD_EPOLL_SOCKET:
+ r = n_acd_dispatch_socket(acd, events + i);
+ break;
+ default:
+ r = 0;
+ break;
+ }
+
+ if (r == N_ACD_E_PREEMPTED)
+ preempted = true;
+ else if (r != 0)
+ break;
+ }
+
+ if (r == -N_ACD_E_DOWN) {
+ /*
+ * N_ACD_E_DOWN is synthesized whenever we notice
+ * ENETDOWN-related errors on the network interface. This
+ * allows bailing out of deep call-paths and then handling the
+ * error gracefully here.
+ */
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_DOWN, NULL, NULL, NULL);
+ if (r)
+ return r;
+
+ return 0;
+ }
+
+ if (preempted)
+ return N_ACD_E_PREEMPTED;
+ else
+ return r;
+}
+
+/**
+ * n_acd_pop_event() - get the next pending event
+ * @acd: ACD context
+ * @eventp: output argument for the event
+ *
+ * Returns a pointer to the next pending event. The event is still owned by
+ * the context, and is only valid until the next call to n_acd_pop_event()
+ * or until the context is freed.
+ *
+ * The possible events are:
+ * * N_ACD_EVENT_READY: The configured IP address was probed successfully
+ * and is ready to be used. Once configured on the
+ * interface, the caller must call n_acd_announce()
+ * to announce and start defending the address.
+ * No further events may be received before
+ * n_acd_announce() has been called.
+ * * N_ACD_EVENT_USED: Someone is already using the IP address being
+ * probed. The engine was stopped, and the caller
+ * may restart it to try again.
+ * * N_ACD_EVENT_DEFENDED: A conflict was detected for the announced IP
+ * address, and the engine attempted to defend it.
+ * This is purely informational, and no action is
+ * required by the caller.
+ * * N_ACD_EVENT_CONFLICT: A conflict was detected for the announced IP
+ * address, and the engine was not able to defend
+ * it (according to the configured policy). The
+ * engine has stopped, the caller must stop using
+ * the address immediately, and may restart the
+ * engine to retry.
+ * * N_ACD_EVENT_DOWN: A network error was detected. The engine was
+ * stopped and it is the responsibility of the
+ * caller to restart it once the network may be
+ * functional again.
+ *
+ * Return: 0 on success, N_ACD_E_STOPPED if there are no more events and
+ * the engine has been stopped, N_ACD_E_DONE if there are no more
+ * events, but the engine is still running, or a negative error
+ * code on failure.
+ */
+_public_ int n_acd_pop_event(NAcd *acd, NAcdEvent **eventp) {
+ acd->current = n_acd_event_node_free(acd->current);
+
+ if (c_list_is_empty(&acd->events)) {
+ if (acd->state == N_ACD_STATE_INIT)
+ return N_ACD_E_STOPPED;
+ else
+ return N_ACD_E_DONE;
+ }
+
+ acd->current = c_list_first_entry(&acd->events, NAcdEventNode, link);
+ c_list_unlink(&acd->current->link);
+
+ if (eventp)
+ *eventp = &acd->current->event;
+
+ return 0;
+}
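+
+/*
+ * Typical usage (sketch): pop all events after each dispatch, copying any
+ * event data that must outlive the next pop:
+ *
+ *     NAcdEvent *event;
+ *
+ *     while (!n_acd_pop_event(acd, &event)) {
+ *             if (event->event == N_ACD_EVENT_READY) {
+ *                     // configure the address on the interface, then:
+ *                     n_acd_announce(acd, N_ACD_DEFEND_ONCE);
+ *             }
+ *     }
+ */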
+
+static int n_acd_bind_socket(NAcd *acd, int s) {
+ /*
+ * Due to strict aliasing, we cannot get uint32_t/uint16_t pointers to
+ * acd->config.mac, so provide a union accessor.
+ */
+ const union {
+ uint8_t u8[6];
+ uint16_t u16[3];
+ uint32_t u32[1];
+ } mac = {
+ .u8 = {
+ acd->mac[0],
+ acd->mac[1],
+ acd->mac[2],
+ acd->mac[3],
+ acd->mac[4],
+ acd->mac[5],
+ },
+ };
+ struct sock_filter filter[] = {
+ /*
+ * Basic ARP header validation. Make sure the packet-length,
+ * wire type, protocol type, and address lengths are correct.
+ */
+ BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0), /* A <- packet length */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct ether_arp), 1, 0), /* packet == arp packet ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hrd)), /* A <- header */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPHRD_ETHER, 1, 0), /* header == ethernet ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pro)), /* A <- protocol */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 1, 0), /* protocol == IP ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hln)), /* A <- hardware address length */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct ether_addr), 1, 0), /* length == sizeof(ether_addr)? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pln)), /* A <- protocol address length */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct in_addr), 1, 0), /* length == sizeof(in_addr) ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_op)), /* A <- operation */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REQUEST, 2, 0), /* operation == request ? */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REPLY, 1, 0), /* operation == reply ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+
+ /*
+ * Sender hardware address must be different from ours. Note
+ * that BPF runs in big-endian mode, but assumes immediates are
+ * given in native-endian. This might look weird on 6-byte mac
+ * addresses, but is needed to revert the BPF magic.
+ */
+ BPF_STMT(BPF_LD + BPF_IMM, be32toh(mac.u32[0])), /* A <- 4 bytes of client's MAC */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_sha)), /* A <- 4 bytes of SHA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* A xor X */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 6), /* A == 0 ? */
+ BPF_STMT(BPF_LD + BPF_IMM, be16toh(mac.u16[2])), /* A <- remainder of client's MAC */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, arp_sha) + 4), /* A <- remainder of SHA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* A xor X */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+
+ /*
+ * Sender protocol address or target protocol address must be
+ * equal to the one we care about. Again, immediates must be
+ * given in native-endian.
+ */
+ BPF_STMT(BPF_LD + BPF_IMM, be32toh(acd->config.ip.s_addr)), /* A <- client's IP */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_spa)), /* A <- SPA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* X xor A */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */
+ BPF_STMT(BPF_RET + BPF_K, 65535), /* return all */
+ BPF_STMT(BPF_LD + BPF_IMM, be32toh(acd->config.ip.s_addr)), /* A <- client's IP */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_tpa)), /* A <- TPA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* X xor A */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */
+ BPF_STMT(BPF_RET + BPF_K, 65535), /* return all */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ };
+ const struct sock_fprog fprog = {
+ .len = sizeof(filter) / sizeof(*filter),
+ .filter = filter,
+ };
+ const struct sockaddr_ll address = {
+ .sll_family = AF_PACKET,
+ .sll_protocol = htobe16(ETH_P_ARP),
+ .sll_ifindex = acd->config.ifindex,
+ .sll_halen = ETH_ALEN,
+ .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ };
+ int r;
+
+ /*
+ * Install a packet filter that matches on the ARP header and
+ * addresses, to reduce the number of wake-ups to a minimum.
+ */
+ r = setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
+ if (r < 0)
+ return -n_acd_errno();
+
+ /*
+ * Bind the packet-socket to ETH_P_ARP and the specified network
+ * interface.
+ */
+ r = bind(s, (struct sockaddr *)&address, sizeof(address));
+ if (r < 0)
+ return -n_acd_errno();
+
+ return 0;
+}
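+
+/*
+ * Worked example for the BPF endianness handling above (illustration only):
+ * for a MAC of 00:11:22:33:44:55 on a little-endian host, mac.u32[0] holds
+ * 0x33221100 and be32toh() turns it into 0x00112233. The BPF_LD|BPF_W|BPF_ABS
+ * instruction loads the first four SHA bytes from the packet as the
+ * big-endian value 0x00112233, so the XOR comparison matches exactly. On
+ * big-endian hosts, be32toh() is a no-op and the values match directly.
+ */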
+
+static int n_acd_setup_socket(NAcd *acd) {
+ int r, s;
+
+ s = socket(PF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
+ if (s < 0)
+ return -n_acd_errno();
+
+ r = n_acd_bind_socket(acd, s);
+ if (r < 0)
+ goto error;
+
+ r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, s,
+ &(struct epoll_event){
+ .events = EPOLLIN,
+ .data.u32 = N_ACD_EPOLL_SOCKET,
+ });
+ if (r < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ acd->fd_socket = s;
+ return 0;
+
+error:
+ close(s);
+ return r;
+}
+
+/**
+ * n_acd_start() - start the ACD engine
+ * @acd: ACD context
+ * @config: description of interface and desired IP address
+ *
+ * Start probing the given address on the given interface.
+ *
+ * The engine must not already be running, and there must not be
+ * any pending events.
+ *
+ * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the configuration
+ * was invalid, N_ACD_E_BUSY if the engine is running or there are
+ * pending events, or a negative error code on failure.
+ */
+_public_ int n_acd_start(NAcd *acd, NAcdConfig *config) {
+ uint64_t now, delay;
+ int r;
+
+ if (config->ifindex <= 0 ||
+ config->transport != N_ACD_TRANSPORT_ETHERNET ||
+ config->n_mac != ETH_ALEN ||
+ !memcmp(config->mac, (uint8_t[ETH_ALEN]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ETH_ALEN) ||
+ !config->ip.s_addr ||
+ !config->timeout_msec)
+ return N_ACD_E_INVALID_ARGUMENT;
+
+ if (acd->state != N_ACD_STATE_INIT || !c_list_is_empty(&acd->events))
+ return N_ACD_E_BUSY;
+
+ acd->config = *config;
+ memcpy(acd->mac, config->mac, config->n_mac);
+ acd->config.mac = acd->mac;
+ acd->timeout_multiplier = config->timeout_msec;
+
+ r = n_acd_setup_socket(acd);
+ if (r < 0)
+ goto error;
+
+ delay = 0;
+ if (acd->last_conflict != TIME_INFINITY) {
+ r = n_acd_now(&now);
+ if (r < 0)
+ goto error;
+
+ if (now < acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC)
+ delay = acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC - now;
+ }
+
+ r = n_acd_schedule(acd, delay, acd->timeout_multiplier * N_ACD_RFC_PROBE_WAIT_USEC);
+ if (r < 0)
+ goto error;
+
+ acd->state = N_ACD_STATE_PROBING;
+ acd->defend = N_ACD_DEFEND_NEVER;
+ acd->n_iteration = 0;
+ acd->last_defend = 0;
+ return 0;
+
+error:
+ n_acd_stop(acd);
+ return r;
+}
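+
+/*
+ * Example configuration (sketch; the concrete values are placeholders chosen
+ * by the caller, the field names follow NAcdConfig from n-acd.h):
+ *
+ *     NAcdConfig config = {
+ *             .ifindex = ifindex,     // e.g. from if_nametoindex()
+ *             .transport = N_ACD_TRANSPORT_ETHERNET,
+ *             .mac = mac,             // interface hardware address
+ *             .n_mac = ETH_ALEN,
+ *             .ip = (struct in_addr){ .s_addr = htobe32(0xa9fe0102) },  // 169.254.1.2
+ *             .timeout_msec = 100,
+ *     };
+ *
+ *     r = n_acd_start(acd, &config);
+ */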
+
+/**
+ * n_acd_stop() - stop the ACD engine
+ * @acd: ACD context
+ *
+ * Stop the engine. No new events may be triggered, but pending events are not
+ * flushed. Before calling n_acd_start() again, all pending events must be popped.
+ */
+_public_ void n_acd_stop(NAcd *acd) {
+ acd->state = N_ACD_STATE_INIT;
+ acd->defend = N_ACD_DEFEND_NEVER;
+ acd->n_iteration = 0;
+ acd->last_defend = 0;
+ timerfd_settime(acd->fd_timer, 0, &(struct itimerspec){}, NULL);
+
+ if (acd->fd_socket >= 0) {
+ assert(acd->fd_epoll >= 0);
+ epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->fd_socket, NULL);
+ close(acd->fd_socket);
+ acd->fd_socket = -1;
+ }
+}
+
+/**
+ * n_acd_announce() - announce the configured IP address
+ * @acd: ACD context
+ * @defend: defence policy
+ *
+ * Announce the IP address on the local link, and start defending it according
+ * to the given policy, which must be one of N_ACD_DEFEND_ONCE,
+ * N_ACD_DEFEND_NEVER, or N_ACD_DEFEND_ALWAYS.
+ *
+ * This must be called in response to an N_ACD_EVENT_READY event, and only
+ * after the given address has been configured on the given interface.
+ *
+ * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the defence policy
+ * is invalid, N_ACD_E_BUSY if this is not in response to an
+ * N_ACD_EVENT_READY event, or a negative error code on failure.
+ */
+_public_ int n_acd_announce(NAcd *acd, unsigned int defend) {
+ uint64_t now;
+ int r;
+
+ if (defend >= _N_ACD_DEFEND_N)
+ return N_ACD_E_INVALID_ARGUMENT;
+ if (acd->state != N_ACD_STATE_CONFIGURING)
+ return N_ACD_E_BUSY;
+
+ /*
+ * Sending announcements means we finished probing and use the address
+ * now. We therefore reset the conflict counter in case we adhered to
+ * the rate-limit. Since probing is properly delayed, a well-behaving
+ * client will always reset the conflict counter here. However, if you
+ * force-use an address regardless of conflicts, then this will not
+ * trigger and the conflict counter stays untouched.
+ */
+ if (acd->last_conflict != TIME_INFINITY) {
+ r = n_acd_now(&now);
+ if (r < 0)
+ return r;
+
+ if (now >= acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC)
+ acd->n_conflicts = 0;
+ }
+
+ /*
+ * Instead of sending the first announcement here, we schedule an idle
+ * timer. This avoids possibly recursing into the user callback. We
+ * should never trigger callbacks from arbitrary stacks, but always
+ * restrict them to the dispatcher.
+ */
+ r = n_acd_schedule(acd, 0, 0);
+ if (r < 0)
+ return r;
+
+ acd->state = N_ACD_STATE_ANNOUNCING;
+ acd->defend = defend;
+ acd->n_iteration = 0;
+ return 0;
+}