Diffstat (limited to 'src/n-acd.c')
-rw-r--r--  src/n-acd.c | 1213
1 file changed, 1213 insertions, 0 deletions
diff --git a/src/n-acd.c b/src/n-acd.c
new file mode 100644
index 0000000000..4f8023e896
--- /dev/null
+++ b/src/n-acd.c
@@ -0,0 +1,1213 @@
+/*
+ * IPv4 Address Conflict Detection
+ *
+ * This implements the main n-acd API. It is built around an epoll-fd to
+ * encapsulate a timerfd+socket. The n-acd context has quite straightforward
+ * lifetime rules. The parameters must be set when the engine is started, and
+ * they can only be changed by stopping and restarting the engine. The engine
+ * is started on demand and stopped when no longer needed.
+ * During the entire lifetime the context can be dispatched. That is, the
+ * dispatcher does not have to be aware of the context state. After each call
+ * to dispatch(), the caller must pop all pending events until N_ACD_E_DONE
+ * (or N_ACD_E_STOPPED) is returned.
+ *
+ * If a conflict is detected, the ACD engine reports to the caller and stops
+ * the engine. The caller can now modify parameters and restart the engine, if
+ * required.
+ */
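+
+/*
+ * A minimal sketch of the intended calling sequence, assuming the public API
+ * declared in n-acd.h (illustration only; error handling elided):
+ *
+ *     NAcd *acd;
+ *     NAcdEvent *event;
+ *     int r;
+ *
+ *     n_acd_new(&acd);
+ *     n_acd_start(acd, &config);      // config filled in by the caller
+ *
+ *     for (;;) {
+ *             // wait until the fd from n_acd_get_fd() polls readable, then:
+ *             do {
+ *                     r = n_acd_dispatch(acd);
+ *             } while (r == N_ACD_E_PREEMPTED);
+ *
+ *             while (!n_acd_pop_event(acd, &event)) {
+ *                     // handle event->event (READY, USED, DEFENDED, ...)
+ *             }
+ *     }
+ *
+ *     n_acd_free(acd);
+ */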
+
+#include <assert.h>
+#include <c-list.h>
+#include <c-siphash.h>
+#include <endian.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/timerfd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include "n-acd.h"
+
+#define _public_ __attribute__((__visibility__("default")))
+
+/*
+ * These are the parameters and timing intervals specified in RFC 5227. The original
+ * values are:
+ *
+ * PROBE_NUM 3
+ * PROBE_WAIT 1s
+ * PROBE_MIN 1s
+ * PROBE_MAX 3s
+ * ANNOUNCE_NUM 3
+ * ANNOUNCE_WAIT 2s
+ * ANNOUNCE_INTERVAL 2s
+ * MAX_CONFLICTS 10
+ * RATE_LIMIT_INTERVAL 60s
+ * DEFEND_INTERVAL 10s
+ *
+ * If we assume a best-case and worst-case scenario for non-conflicted runs, we
+ * end up with a runtime between 4s and 9s to finish the probe. Then it still
+ * takes a fixed 4s to finish the announcements.
+ *
+ * RFC 5227 section 1.1:
+ * [...] (Note that the values listed here are fixed constants; they are
+ * not intended to be modifiable by implementers, operators, or end users.
+ * These constants are given symbolic names here to facilitate the writing
+ * of future standards that may want to reference this document with
+ * different values for these named constants; however, at the present time
+ * no such future standards exist.) [...]
+ *
+ * Unfortunately, no-one ever stepped up to write a "future standard" to revise
+ * the timings. A 9s timeout for successful link setups is not acceptable today.
+ * Hence, we will just go forward and ignore the proposed values. On both
+ * wired and wireless local links round-trip latencies of below 3ms are common,
+ * while latencies above 10ms are rarely seen. We require the caller to set a
+ * timeout multiplier, where a multiplier of 1 corresponds to a total probe
+ * time between 0.5 ms and 1.0 ms. On modern networks a multiplier of about
+ * 100 should be a reasonable default. To comply with the RFC, select a
+ * multiplier of 9000.
+ */
+#define N_ACD_RFC_PROBE_NUM (3)
+#define N_ACD_RFC_PROBE_WAIT_USEC (UINT64_C(111)) /* 111us */
+#define N_ACD_RFC_PROBE_MIN_USEC (UINT64_C(111)) /* 111us */
+#define N_ACD_RFC_PROBE_MAX_USEC (UINT64_C(333)) /* 333us */
+#define N_ACD_RFC_ANNOUNCE_NUM (3)
+#define N_ACD_RFC_ANNOUNCE_WAIT_USEC (UINT64_C(222)) /* 222us */
+#define N_ACD_RFC_ANNOUNCE_INTERVAL_USEC (UINT64_C(222)) /* 222us */
+#define N_ACD_RFC_MAX_CONFLICTS (10)
+#define N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC (UINT64_C(60000000)) /* 60s */
+#define N_ACD_RFC_DEFEND_INTERVAL_USEC (UINT64_C(10000000)) /* 10s */
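+
+/*
+ * Worked example for the multiplier arithmetic above: with a multiplier of
+ * 100 (i.e., timeout_msec = 100), PROBE_WAIT scales to 11.1ms, PROBE_MIN/MAX
+ * to 11.1ms/33.3ms, and ANNOUNCE_WAIT to 22.2ms. A probe run then takes
+ * between 0 + 11.1 + 11.1 + 22.2 = 44.4ms (best case) and
+ * 11.1 + 33.3 + 33.3 + 22.2 = 99.9ms (worst case), matching the 0.5-1.0ms
+ * per-unit range scaled by 100.
+ */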
+
+/*
+ * Fake ENETDOWN error-code. We use this as a replacement for the kernel error
+ * codes that signal a lost network device (ENETDOWN, ENXIO). It is explicitly
+ * chosen to be outside the known error-code range. Whenever we are deep down
+ * in a call-stack and notice an ENETDOWN error, we return this instead. It is
+ * caught by the top-level dispatcher and then properly handled.
+ * This avoids handling ENETDOWN gracefully deep in a call-stack, only for the
+ * callers to then continue with their work without noticing the soft failure.
+ */
+#define N_ACD_E_DOWN (INT_MAX)
+
+#define TIME_INFINITY ((uint64_t) -1)
+
+enum {
+ N_ACD_EPOLL_TIMER,
+ N_ACD_EPOLL_SOCKET,
+};
+
+enum {
+ N_ACD_STATE_INIT,
+ N_ACD_STATE_PROBING,
+ N_ACD_STATE_CONFIGURING,
+ N_ACD_STATE_ANNOUNCING,
+};
+
+typedef struct NAcdEventNode {
+ NAcdEvent event;
+ uint8_t sender[ETH_ALEN];
+ CList link;
+} NAcdEventNode;
+
+struct NAcd {
+ /* context */
+ unsigned int seed;
+ int fd_epoll;
+ int fd_timer;
+
+ /* configuration */
+ NAcdConfig config;
+ uint8_t mac[ETH_ALEN];
+ uint64_t timeout_multiplier;
+
+ /* runtime */
+ int fd_socket;
+ unsigned int state;
+ unsigned int n_iteration;
+ unsigned int n_conflicts;
+ unsigned int defend;
+ uint64_t last_defend;
+ uint64_t last_conflict;
+
+ /* pending events */
+ CList events;
+ NAcdEventNode *current;
+};
+
+static int n_acd_errno(void) {
+ /*
+ * Compilers continuously warn about uninitialized variables since they
+ * cannot deduce that `return -errno;` will always be negative. This
+ * small wrapper makes sure compilers figure that out. Use it as a
+ * replacement for raw `errno` read access. Yes, it generates worse code,
+ * but only marginally and only affects slow-paths.
+ */
+ return abs(errno) ? : EIO;
+}
+
+static int n_acd_event_node_new(NAcdEventNode **nodep, unsigned int event) {
+ NAcdEventNode *node;
+
+ node = calloc(1, sizeof(*node));
+ if (!node)
+ return -ENOMEM;
+
+ node->event.event = event;
+ node->link = (CList)C_LIST_INIT(node->link);
+
+ *nodep = node;
+
+ return 0;
+}
+
+static NAcdEventNode *n_acd_event_node_free(NAcdEventNode *node) {
+ if (!node)
+ return NULL;
+
+ c_list_unlink(&node->link);
+ free(node);
+
+ return NULL;
+}
+
+static int n_acd_get_random(unsigned int *random) {
+ uint8_t hash_seed[] = { 0x3a, 0x0c, 0xa6, 0xdd, 0x44, 0xef, 0x5f, 0x7a, 0x5e, 0xd7, 0x25, 0x37, 0xbf, 0x4e, 0x80, 0xa1 };
+ CSipHash hash = C_SIPHASH_NULL;
+ struct timespec ts;
+ const uint8_t *p;
+ int r;
+
+ /*
+ * We need random jitter for all timeouts when handling ARP probes. Use
+ * AT_RANDOM to get a seed for rand_r(3p), if available (should always
+ * be available on Linux). See the time-out scheduler for details.
+ * Additionally, we include the current time in the seed. This avoids
+ * using the same jitter in case you run multiple ACD engines in the
+ * same process. Lastly, the seed is hashed with SipHash24 to avoid
+ * exposing the value of AT_RANDOM on the network.
+ */
+ c_siphash_init(&hash, hash_seed);
+
+ p = (const uint8_t *)getauxval(AT_RANDOM);
+ if (p)
+ c_siphash_append(&hash, p, 16);
+
+ r = clock_gettime(CLOCK_BOOTTIME, &ts);
+ if (r < 0)
+ return -n_acd_errno();
+
+ c_siphash_append(&hash, (const uint8_t *)&ts.tv_sec, sizeof(ts.tv_sec));
+ c_siphash_append(&hash, (const uint8_t *)&ts.tv_nsec, sizeof(ts.tv_nsec));
+
+ *random = c_siphash_finalize(&hash);
+ return 0;
+}
+
+/**
+ * n_acd_new() - create a new ACD context
+ * @acdp: output argument for context
+ *
+ * Create a new ACD context and return it in @acdp.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+_public_ int n_acd_new(NAcd **acdp) {
+ NAcd *acd;
+ int r;
+
+ acd = calloc(1, sizeof(*acd));
+ if (!acd)
+ return -ENOMEM;
+
+ acd->fd_epoll = -1;
+ acd->fd_timer = -1;
+ acd->fd_socket = -1;
+ acd->state = N_ACD_STATE_INIT;
+ acd->defend = N_ACD_DEFEND_NEVER;
+ acd->events = (CList)C_LIST_INIT(acd->events);
+ acd->last_conflict = TIME_INFINITY;
+
+ r = n_acd_get_random(&acd->seed);
+ if (r < 0)
+ goto error;
+
+ acd->fd_epoll = epoll_create1(EPOLL_CLOEXEC);
+ if (acd->fd_epoll < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ acd->fd_timer = timerfd_create(CLOCK_BOOTTIME, TFD_CLOEXEC | TFD_NONBLOCK);
+ if (acd->fd_timer < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, acd->fd_timer,
+ &(struct epoll_event){
+ .events = EPOLLIN,
+ .data.u32 = N_ACD_EPOLL_TIMER,
+ });
+ if (r < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ *acdp = acd;
+ return 0;
+
+error:
+ n_acd_free(acd);
+ return r;
+}
+
+/**
+ * n_acd_free() - free an ACD context
+ * @acd: ACD context to free, or NULL
+ *
+ * Frees all resources held by the context. This may be called at any time,
+ * but doing so invalidates all data owned by the context.
+ *
+ * Return: NULL.
+ */
+_public_ NAcd *n_acd_free(NAcd *acd) {
+ NAcdEventNode *node;
+
+ if (!acd)
+ return NULL;
+
+ n_acd_stop(acd);
+
+ while ((node = c_list_first_entry(&acd->events, NAcdEventNode, link)))
+ n_acd_event_node_free(node);
+
+ assert(acd->fd_socket < 0);
+
+ if (acd->fd_timer >= 0) {
+ assert(acd->fd_epoll >= 0);
+ epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->fd_timer, NULL);
+ close(acd->fd_timer);
+ acd->fd_timer = -1;
+ }
+
+ if (acd->fd_epoll >= 0) {
+ close(acd->fd_epoll);
+ acd->fd_epoll = -1;
+ }
+
+ free(acd);
+
+ return NULL;
+}
+
+/**
+ * n_acd_get_fd() - get pollable file descriptor
+ * @acd: ACD context
+ * @fdp: output argument for file descriptor
+ *
+ * Returns a file descriptor in @fdp. This file descriptor can be polled by
+ * the caller to indicate when the ACD context can be dispatched.
+ */
+_public_ void n_acd_get_fd(NAcd *acd, int *fdp) {
+ *fdp = acd->fd_epoll;
+}
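+
+/*
+ * Example (sketch): the returned fd is the internal epoll-fd; wait for it to
+ * become readable before dispatching, e.g. via poll(2) from <poll.h>:
+ *
+ *     struct pollfd pfd = { .events = POLLIN };
+ *
+ *     n_acd_get_fd(acd, &pfd.fd);
+ *     if (poll(&pfd, 1, -1) > 0)
+ *             r = n_acd_dispatch(acd);
+ */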
+
+static int n_acd_push_event(NAcd *acd, unsigned int event, uint16_t *operation, uint8_t (*sender)[6], uint8_t (*target)[4]) {
+ NAcdEventNode *node;
+ int r;
+
+ r = n_acd_event_node_new(&node, event);
+ if (r < 0)
+ return r;
+
+ switch (event) {
+ case N_ACD_EVENT_USED:
+ node->event.used.operation = be16toh(*operation);
+ memcpy(node->sender, sender, sizeof(node->sender));
+ node->event.used.sender = node->sender;
+ node->event.used.n_sender = sizeof(node->sender);
+ memcpy(&node->event.used.target, target, sizeof(node->event.used.target));
+ break;
+ case N_ACD_EVENT_CONFLICT:
+ node->event.conflict.operation = be16toh(*operation);
+ memcpy(node->sender, sender, sizeof(node->sender));
+ node->event.conflict.sender = node->sender;
+ node->event.conflict.n_sender = sizeof(node->sender);
+ memcpy(&node->event.conflict.target, target, sizeof(node->event.conflict.target));
+ break;
+ case N_ACD_EVENT_DEFENDED:
+ node->event.defended.operation = be16toh(*operation);
+ memcpy(node->sender, sender, sizeof(node->sender));
+ node->event.defended.sender = node->sender;
+ node->event.defended.n_sender = sizeof(node->sender);
+ memcpy(&node->event.defended.target, target, sizeof(node->event.defended.target));
+ break;
+ case N_ACD_EVENT_READY:
+ case N_ACD_EVENT_DOWN:
+ break;
+ default:
+ assert(0);
+ }
+
+ c_list_link_tail(&acd->events, &node->link);
+
+ return 0;
+}
+
+static int n_acd_now(uint64_t *nowp) {
+ struct timespec ts;
+ int r;
+
+ r = clock_gettime(CLOCK_BOOTTIME, &ts);
+ if (r < 0)
+ return -n_acd_errno();
+
+ *nowp = ts.tv_sec * UINT64_C(1000000) + ts.tv_nsec / UINT64_C(1000);
+ return 0;
+}
+
+static int n_acd_schedule(NAcd *acd, uint64_t u_timeout, unsigned int u_jitter) {
+ uint64_t u_next = u_timeout;
+ int r;
+
+ /*
+ * ACD specifies jitter values to reduce packet storms on the local
+ * link. This call accepts the maximum relative jitter value in
+ * microseconds as @u_jitter. We then use rand_r(3p) to get a
+ * pseudo-random jitter on top of the real timeout given as @u_timeout.
+ * Note that rand_r() is fine for this. Before you try to improve the
+ * RNG, you better spend some time securing ARP.
+ */
+ if (u_jitter)
+ u_next += rand_r(&acd->seed) % u_jitter;
+
+ /*
+ * Setting .it_value to 0 in timerfd_settime() disarms the timer. Avoid
+ * this and always schedule at least 1us. Otherwise, we'd have to
+ * recursively call into the time-out handler, which we really want to
+ * avoid. No reason to optimize performance here.
+ */
+ if (!u_next)
+ u_next = 1;
+
+ r = timerfd_settime(acd->fd_timer, 0,
+ &(struct itimerspec){ .it_value = {
+ .tv_sec = u_next / UINT64_C(1000000),
+ .tv_nsec = u_next % UINT64_C(1000000) * UINT64_C(1000),
+ } }, NULL);
+ if (r < 0)
+ return -n_acd_errno();
+
+ return 0;
+}
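+
+/*
+ * For example, n_acd_schedule(acd, 1000, 500) arms the timer to fire after
+ * 1000us plus a pseudo-random jitter of 0-499us, i.e., anywhere in
+ * [1000us, 1500us).
+ */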
+
+static int n_acd_send(NAcd *acd, const struct in_addr *spa) {
+ struct sockaddr_ll address = {
+ .sll_family = AF_PACKET,
+ .sll_protocol = htobe16(ETH_P_ARP),
+ .sll_ifindex = acd->config.ifindex,
+ .sll_halen = ETH_ALEN,
+ .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ };
+ struct ether_arp arp = {
+ .ea_hdr.ar_hrd = htobe16(ARPHRD_ETHER),
+ .ea_hdr.ar_pro = htobe16(ETHERTYPE_IP),
+ .ea_hdr.ar_hln = sizeof(acd->mac),
+ .ea_hdr.ar_pln = sizeof(uint32_t),
+ .ea_hdr.ar_op = htobe16(ARPOP_REQUEST),
+ };
+ ssize_t l;
+
+ memcpy(arp.arp_sha, acd->mac, sizeof(acd->mac));
+ memcpy(arp.arp_tpa, &acd->config.ip.s_addr, sizeof(uint32_t));
+
+ if (spa)
+ memcpy(arp.arp_spa, &spa->s_addr, sizeof(spa->s_addr));
+
+ l = sendto(acd->fd_socket, &arp, sizeof(arp), MSG_NOSIGNAL, (struct sockaddr *)&address, sizeof(address));
+ if (l == (ssize_t)sizeof(arp)) {
+ /* Packet was properly sent. */
+ return 0;
+ } else if (l >= 0) {
+ /*
+ * Ugh. The packet was truncated. This should not happen, but
+ * let's just pretend the packet was dropped.
+ */
+ return 0;
+ } else if (errno == EAGAIN || errno == ENOBUFS) {
+ /*
+ * In case the output buffer is full, the packet is silently
+ * dropped. This is just as if the physical layer happened to
+ * drop the packet. We are not on a reliable medium, so no
+ * reason to pretend we are.
+ */
+ return 0;
+ } else if (errno == ENETDOWN || errno == ENXIO) {
+ /*
+ * We get ENETDOWN if the network-device goes down or is
+ * removed. ENXIO might happen on async send-operations if the
+ * network-device was unplugged and thus the kernel is no
+ * longer aware of it.
+ * In any case, we do not allow proceeding with this socket. We
+ * stop the engine and notify the user gracefully.
+ */
+ return -N_ACD_E_DOWN;
+ }
+
+ return -n_acd_errno();
+}
+
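+/*
+ * Record a conflict for the rate-limit logic: the counter saturates at
+ * MAX_CONFLICTS, and only once saturated do we remember the timestamp.
+ * n_acd_start() then delays the next probe until RATE_LIMIT_INTERVAL has
+ * passed since that timestamp, and n_acd_announce() resets the counter once
+ * that interval has elapsed.
+ */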
+static void n_acd_remember_conflict(NAcd *acd, uint64_t now) {
+ if (++acd->n_conflicts >= N_ACD_RFC_MAX_CONFLICTS) {
+ acd->n_conflicts = N_ACD_RFC_MAX_CONFLICTS;
+ acd->last_conflict = now;
+ }
+}
+
+static int n_acd_handle_timeout(NAcd *acd) {
+ int r;
+
+ switch (acd->state) {
+ case N_ACD_STATE_PROBING:
+ /*
+ * We are still PROBING. We send 3 probes with a random timeout
+ * scheduled between each. If, after a fixed timeout, we did
+ * not receive any conflict we consider the probing successful.
+ */
+ if (acd->n_iteration >= N_ACD_RFC_PROBE_NUM) {
+ /*
+ * All 3 probes succeeded and we waited enough to
+ * consider this address usable by now. Do not announce
+ * the address, yet. We must first give the caller a
+ * chance to configure the address (so they can answer
+ * ARP requests), before announcing it. But our
+ * callbacks are not necessarily synchronous (we want
+ * to allow IPC there), so just notify the caller and
+ * wait for further instructions, thus effectively
+ * increasing the probe-wait.
+ */
+ r = n_acd_push_event(acd, N_ACD_EVENT_READY, NULL, NULL, NULL);
+ if (r)
+ return r;
+
+ acd->state = N_ACD_STATE_CONFIGURING;
+ } else {
+ /*
+ * We have not sent all 3 probes, yet. A timer fired,
+ * so we are ready to send the next probe. If this is
+ * the third probe, schedule a timer for ANNOUNCE_WAIT
+ * to give other peers a chance to answer. If this is
+ * not the third probe, wait between PROBE_MIN and
+ * PROBE_MAX for the next probe.
+ */
+
+ r = n_acd_send(acd, NULL);
+ if (r < 0)
+ return r;
+
+ if (++acd->n_iteration >= N_ACD_RFC_PROBE_NUM)
+ r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_ANNOUNCE_WAIT_USEC, 0);
+ else
+ r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_PROBE_MIN_USEC,
+ acd->timeout_multiplier * (N_ACD_RFC_PROBE_MAX_USEC - N_ACD_RFC_PROBE_MIN_USEC));
+ if (r < 0)
+ return r;
+ }
+
+ break;
+
+ case N_ACD_STATE_ANNOUNCING:
+ /*
+ * We are ANNOUNCING, meaning the caller configured the address
+ * on the interface and is actively using it. We send 3
+ * announcements out, in a short interval, and then just
+ * perform passive conflict detection.
+ * Note that once all 3 announcements are sent, we no longer
+ * schedule a timer, so this part should not trigger, anymore.
+ */
+
+ r = n_acd_send(acd, &acd->config.ip);
+ if (r < 0)
+ return r;
+
+ if (++acd->n_iteration < N_ACD_RFC_ANNOUNCE_NUM) {
+ r = n_acd_schedule(acd, acd->timeout_multiplier * N_ACD_RFC_ANNOUNCE_INTERVAL_USEC, 0);
+ if (r < 0)
+ return r;
+ }
+
+ break;
+
+ case N_ACD_STATE_INIT:
+ case N_ACD_STATE_CONFIGURING:
+ default:
+ /*
+ * There are no timeouts in these states. If we trigger one,
+ * something is fishy. Let the caller deal with this.
+ */
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int n_acd_handle_packet(NAcd *acd, struct ether_arp *packet) {
+ bool hard_conflict;
+ uint64_t now;
+ int r;
+
+ /*
+ * Via BPF we discard any non-conflict packets. There are only 2 types
+ * that can pass: A conflict on the Sender Protocol Address, or a
+ * conflict on the Target Protocol Address.
+ *
+ * The former we call a hard-conflict. It implies that the sender uses
+ * the address already. We must always catch this and in some way react
+ * to it. Any kind, REQUEST or REPLY must be caught (though it is
+ * unlikely that we ever catch REPLIES since they tend to be unicasts).
+ *
+ * However, in case the Target Protocol Address matches, we just know
+ * that somebody is looking for the address. Hence, we must also check
+ * that the packet is an ARP-Probe (Sender Protocol Address is 0). If
+ * it is, it means someone else does ACD on our address. We call this a
+ * soft conflict.
+ */
+ if (!memcmp(packet->arp_spa, (uint8_t[4]){ }, sizeof(packet->arp_spa)) &&
+ !memcmp(packet->arp_tpa, &acd->config.ip.s_addr, sizeof(packet->arp_tpa)) &&
+ packet->ea_hdr.ar_op == htobe16(ARPOP_REQUEST)) {
+ hard_conflict = false;
+ } else if (!memcmp(packet->arp_spa, &acd->config.ip.s_addr, sizeof(packet->arp_spa))) {
+ hard_conflict = true;
+ } else {
+ /*
+ * Ignore anything that is specific enough to match the BPF
+ * filter, but is none of the conflicts described above.
+ */
+ return 0;
+ }
+
+ r = n_acd_now(&now);
+ if (r < 0)
+ return r;
+
+ switch (acd->state) {
+ case N_ACD_STATE_PROBING:
+ /*
+ * Regardless whether this is a hard or soft conflict, we must
+ * treat this as a probe failure. That is, notify the caller of
+ * the conflict and wait for further instructions. We do not
+ * react to this, until the caller tells us what to do. But we
+ * immediately stop the engine, since there is no point in
+ * continuing the probing.
+ */
+ n_acd_remember_conflict(acd, now);
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_USED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+
+ break;
+
+ case N_ACD_STATE_CONFIGURING:
+ /*
+ * We are waiting for the caller to configure the interface and
+ * start ANNOUNCING. In this state, we cannot defend the address
+ * as that would indicate that it is ready to be used, and we
+ * cannot signal CONFLICT or USED as the caller may already have
+ * started to use the address (and may have configured the engine
+ * to always defend it, which means they should be able to rely on
+ * never losing it after READY). Simply drop the event, and rely
+ * on the anticipated ANNOUNCE to trigger it again.
+ */
+
+ break;
+
+ case N_ACD_STATE_ANNOUNCING:
+ /*
+ * We were already instructed to announce the address, which
+ * means the address is configured and in use. Hence, the
+ * caller is responsible to serve regular ARP queries. Meaning,
+ * we can ignore any soft conflicts (other peers doing ACD).
+ *
+ * But if we see a hard-conflict, we either defend the address
+ * according to the caller's instructions, or we report the
+ * conflict and bail out.
+ */
+
+ if (!hard_conflict)
+ break;
+
+ if (acd->defend == N_ACD_DEFEND_NEVER) {
+ n_acd_remember_conflict(acd, now);
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_CONFLICT, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ } else {
+ if (now > acd->last_defend + N_ACD_RFC_DEFEND_INTERVAL_USEC) {
+ r = n_acd_send(acd, &acd->config.ip);
+ if (r < 0)
+ return r;
+
+ acd->last_defend = now;
+ r = n_acd_push_event(acd, N_ACD_EVENT_DEFENDED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ } else if (acd->defend == N_ACD_DEFEND_ONCE) {
+ n_acd_remember_conflict(acd, now);
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_CONFLICT, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ } else {
+ r = n_acd_push_event(acd, N_ACD_EVENT_DEFENDED, &packet->ea_hdr.ar_op, &packet->arp_sha, &packet->arp_tpa);
+ if (r)
+ return r;
+ }
+ }
+
+ break;
+
+ case N_ACD_STATE_INIT:
+ default:
+ /*
+ * The socket should not be dispatched in those states, since
+ * it is neither allocated nor added to epoll. Fail hard if we
+ * trigger this somehow.
+ */
+ return -EIO;
+ }
+
+ return 0;
+}
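+
+/*
+ * Summary of the defend policies handled above: with N_ACD_DEFEND_NEVER, a
+ * hard conflict while announcing stops the engine immediately. With
+ * N_ACD_DEFEND_ONCE, the engine defends the address, but a second conflict
+ * within DEFEND_INTERVAL stops it. With N_ACD_DEFEND_ALWAYS, the engine keeps
+ * defending, sending at most one defensive announcement per DEFEND_INTERVAL.
+ */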
+
+static int n_acd_dispatch_timer(NAcd *acd, struct epoll_event *event) {
+ uint64_t v;
+ int r;
+
+ if (event->events & (EPOLLHUP | EPOLLERR)) {
+ /*
+ * There is no way to handle either gracefully. If we ignored
+ * them, we would busy-loop, so let's rather forward the error
+ * to the caller.
+ */
+ return -EIO;
+ }
+
+ if (event->events & EPOLLIN) {
+ for (unsigned int i = 0; i < 128; ++i) {
+ r = read(acd->fd_timer, &v, sizeof(v));
+ if (r == sizeof(v)) {
+ /*
+ * We successfully read a timer-value. Handle it and
+ * return. We do NOT fall-through to EPOLLHUP handling,
+ * as we always must drain buffers first.
+ */
+ return n_acd_handle_timeout(acd);
+ } else if (r >= 0) {
+ /*
+ * Kernel guarantees 8-byte reads; fail hard if it
+ * suddenly starts doing weird shit. No clue what to do
+ * with those values, anyway.
+ */
+ return -EIO;
+ } else if (errno == EAGAIN) {
+ /*
+ * No more pending events.
+ */
+ return 0;
+ } else {
+ /*
+ * Something failed. We use CLOCK_BOOTTIME, so
+ * ECANCELED cannot happen. Hence, there is no error
+ * that we could gracefully handle. Fail hard and let
+ * the caller deal with it.
+ */
+ return -n_acd_errno();
+ }
+ }
+
+ return N_ACD_E_PREEMPTED;
+ }
+
+ return 0;
+}
+
+static int n_acd_dispatch_socket(NAcd *acd, struct epoll_event *event) {
+ struct ether_arp packet;
+ ssize_t l;
+
+ for (unsigned int i = 0; i < 128; ++i) {
+ /*
+ * Regardless whether EPOLLIN is set in @event->events, we always
+ * invoke recv(2). This is a safety-net for sockets, which always fetch
+ * queued errors on all syscalls. That means, if anything failed on the
+ * socket, we will be notified via recv(2). This simplifies the code
+ * and avoids magic EPOLLIN/ERR/HUP juggling.
+ *
+ * Note that we must use recv(2) over read(2), since the latter cannot
+ * deal with empty packets properly.
+ */
+ l = recv(acd->fd_socket, &packet, sizeof(packet), MSG_TRUNC);
+ if (l == (ssize_t)sizeof(packet)) {
+ /*
+ * We read a full ARP packet. We never fall-through to EPOLLHUP
+ * handling, as we always must drain buffers first.
+ */
+ return n_acd_handle_packet(acd, &packet);
+ } else if (l >= 0) {
+ /*
+ * The BPF filter discards wrong packets, so error out
+ * if something slips through for any reason. Don't silently
+ * ignore it, since we explicitly want to know if something
+ * went fishy.
+ */
+ return -EIO;
+ } else if (errno == ENETDOWN || errno == ENXIO) {
+ /*
+ * We get ENETDOWN if the network-device goes down or is
+ * removed. ENXIO might happen on async send-operations if the
+ * network-device was unplugged and thus the kernel is no
+ * longer aware of it.
+ * In any case, we do not allow proceeding with this socket. We
+ * stop the engine and notify the user gracefully.
+ */
+ return -N_ACD_E_DOWN;
+ } else if (errno == EAGAIN) {
+ /*
+ * We cannot read data from the socket (we got EAGAIN). As a safety net
+ * check for EPOLLHUP/ERR. Those cannot be disabled with epoll, so we
+ * must make sure to not busy-loop by ignoring them. Note that we know
+ * recv(2) on sockets to return an error if either of these epoll-flags
+ * is set. Hence, if we did not handle it above, we have no other way
+ * but treating those flags as fatal errors and returning them to the
+ * caller.
+ */
+ if (event->events & (EPOLLHUP | EPOLLERR))
+ return -EIO;
+
+ return 0;
+ } else {
+ /*
+ * Cannot dispatch the packet. This might be due to OOM, HUP,
+ * or something else. We cannot handle it gracefully so forward
+ * to the caller.
+ */
+ return -n_acd_errno();
+ }
+ }
+
+ return N_ACD_E_PREEMPTED;
+}
+
+/**
+ * n_acd_dispatch() - dispatch ACD context
+ * @acd: ACD context
+ *
+ * Return: 0 on successful dispatch of all pending events, N_ACD_E_PREEMPTED
+ * in case there are still more events to be dispatched, or a
+ * negative error code on failure.
+ */
+_public_ int n_acd_dispatch(NAcd *acd) {
+ struct epoll_event events[2];
+ int n, i, r = 0;
+ bool preempted = false;
+
+ n = epoll_wait(acd->fd_epoll, events, sizeof(events) / sizeof(*events), 0);
+ if (n < 0) {
+ return -n_acd_errno();
+ }
+
+ for (i = 0; i < n; ++i) {
+ switch (events[i].data.u32) {
+ case N_ACD_EPOLL_TIMER:
+ r = n_acd_dispatch_timer(acd, events + i);
+ break;
+ case N_ACD_EPOLL_SOCKET:
+ r = n_acd_dispatch_socket(acd, events + i);
+ break;
+ default:
+ r = 0;
+ break;
+ }
+
+ if (r == N_ACD_E_PREEMPTED)
+ preempted = true;
+ else if (r != 0)
+ break;
+ }
+
+ if (r == -N_ACD_E_DOWN) {
+ /*
+ * N_ACD_E_DOWN is synthesized whenever we notice
+ * ENETDOWN-related errors on the network interface. This
+ * allows bailing out of deep call-paths and then handling the
+ * error gracefully here.
+ */
+ n_acd_stop(acd);
+ r = n_acd_push_event(acd, N_ACD_EVENT_DOWN, NULL, NULL, NULL);
+ if (r)
+ return r;
+
+ return 0;
+ }
+
+ if (preempted)
+ return N_ACD_E_PREEMPTED;
+ else
+ return r;
+}
+
+/**
+ * n_acd_pop_event() - get the next pending event
+ * @acd: ACD context
+ * @eventp: output argument for the event
+ *
+ * Returns a pointer to the next pending event. The event is still owned by
+ * the context, and is only valid until the next call to n_acd_pop_event()
+ * or until the context is freed.
+ *
+ * The possible events are:
+ * * N_ACD_EVENT_READY: The configured IP address was probed successfully
+ * and is ready to be used. Once configured on the
+ * interface, the caller must call n_acd_announce()
+ * to announce and start defending the address.
+ * No further events may be received before
+ * n_acd_announce() has been called.
+ * * N_ACD_EVENT_USED: Someone is already using the IP address being
+ * probed. The engine was stopped, and the caller
+ * may restart it to try again.
+ * * N_ACD_EVENT_DEFENDED: A conflict was detected for the announced IP
+ * address, and the engine attempted to defend it.
+ * This is purely informational, and no action is
+ * required by the caller.
+ * * N_ACD_EVENT_CONFLICT: A conflict was detected for the announced IP
+ * address, and the engine was not able to defend
+ * it (according to the configured policy). The
+ * engine has stopped, the caller must stop using
+ * the address immediately, and may restart the
+ * engine to retry.
+ * * N_ACD_EVENT_DOWN: A network error was detected. The engine was
+ * stopped and it is the responsibility of the
+ * caller to restart it once the network may be
+ * functional again.
+ *
+ * Return: 0 on success, N_ACD_E_STOPPED if there are no more events and
+ * the engine has been stopped, N_ACD_E_DONE if there are no more
+ * events, but the engine is still running, or a negative error
+ * code on failure.
+ */
+_public_ int n_acd_pop_event(NAcd *acd, NAcdEvent **eventp) {
+ acd->current = n_acd_event_node_free(acd->current);
+
+ if (c_list_is_empty(&acd->events)) {
+ if (acd->state == N_ACD_STATE_INIT)
+ return N_ACD_E_STOPPED;
+ else
+ return N_ACD_E_DONE;
+ }
+
+ acd->current = c_list_first_entry(&acd->events, NAcdEventNode, link);
+ c_list_unlink(&acd->current->link);
+
+ if (eventp)
+ *eventp = &acd->current->event;
+
+ return 0;
+}
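+
+/*
+ * Typical usage (sketch): pop all events after each dispatch, copying any
+ * event data that must outlive the next pop:
+ *
+ *     NAcdEvent *event;
+ *
+ *     while (!n_acd_pop_event(acd, &event)) {
+ *             if (event->event == N_ACD_EVENT_READY) {
+ *                     // configure the address on the interface, then:
+ *                     n_acd_announce(acd, N_ACD_DEFEND_ONCE);
+ *             }
+ *     }
+ */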
+
+static int n_acd_bind_socket(NAcd *acd, int s) {
+ /*
+ * Due to strict aliasing, we cannot get uint32_t/uint16_t pointers to
+ * acd->config.mac, so provide a union accessor.
+ */
+ const union {
+ uint8_t u8[6];
+ uint16_t u16[3];
+ uint32_t u32[1];
+ } mac = {
+ .u8 = {
+ acd->mac[0],
+ acd->mac[1],
+ acd->mac[2],
+ acd->mac[3],
+ acd->mac[4],
+ acd->mac[5],
+ },
+ };
+ struct sock_filter filter[] = {
+ /*
+ * Basic ARP header validation. Make sure the packet-length,
+ * wire type, protocol type, and address lengths are correct.
+ */
+ BPF_STMT(BPF_LD + BPF_W + BPF_LEN, 0), /* A <- packet length */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct ether_arp), 1, 0), /* packet == arp packet ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hrd)), /* A <- header */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPHRD_ETHER, 1, 0), /* header == ethernet ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pro)), /* A <- protocol */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 1, 0), /* protocol == IP ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_hln)), /* A <- hardware address length */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct ether_addr), 1, 0), /* length == sizeof(ether_addr)? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_B + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_pln)), /* A <- protocol address length */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, sizeof(struct in_addr), 1, 0), /* length == sizeof(in_addr) ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, ea_hdr.ar_op)), /* A <- operation */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REQUEST, 2, 0), /* operation == request ? */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ARPOP_REPLY, 1, 0), /* operation == reply ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+
+ /*
+ * Sender hardware address must be different from ours. Note
+ * that BPF runs in big-endian mode, but assumes immediates are
+ * given in native-endian. This might look weird on 6-byte mac
+ * addresses, but is needed to revert the BPF magic.
+ */
+ BPF_STMT(BPF_LD + BPF_IMM, be32toh(mac.u32[0])), /* A <- 4 bytes of client's MAC */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_sha)), /* A <- 4 bytes of SHA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* A xor X */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 6), /* A == 0 ? */
+ BPF_STMT(BPF_LD + BPF_IMM, be16toh(mac.u16[2])), /* A <- remainder of client's MAC */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_H + BPF_ABS, offsetof(struct ether_arp, arp_sha) + 4), /* A <- remainder of SHA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* A xor X */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+
+ /*
+ * Sender protocol address or target protocol address must be
+ * equal to the one we care about. Again, immediates must be
+ * given in native-endian.
+ */
+ BPF_STMT(BPF_LD + BPF_IMM, be32toh(acd->config.ip.s_addr)), /* A <- client's IP */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_spa)), /* A <- SPA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* X xor A */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */
+ BPF_STMT(BPF_RET + BPF_K, 65535), /* return all */
+ BPF_STMT(BPF_LD + BPF_IMM, be32toh(acd->config.ip.s_addr)), /* A <- client's IP */
+ BPF_STMT(BPF_MISC + BPF_TAX, 0), /* X <- A */
+ BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct ether_arp, arp_tpa)), /* A <- TPA */
+ BPF_STMT(BPF_ALU + BPF_XOR + BPF_X, 0), /* X xor A */
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1), /* A == 0 ? */
+ BPF_STMT(BPF_RET + BPF_K, 65535), /* return all */
+ BPF_STMT(BPF_RET + BPF_K, 0), /* ignore */
+ };
+ const struct sock_fprog fprog = {
+ .len = sizeof(filter) / sizeof(*filter),
+ .filter = filter,
+ };
+ const struct sockaddr_ll address = {
+ .sll_family = AF_PACKET,
+ .sll_protocol = htobe16(ETH_P_ARP),
+ .sll_ifindex = acd->config.ifindex,
+ .sll_halen = ETH_ALEN,
+ .sll_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ };
+ int r;
+
+ /*
+ * Install a packet filter that matches on the ARP header and
+ * addresses, to reduce the number of wake-ups to a minimum.
+ */
+ r = setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
+ if (r < 0)
+ return -n_acd_errno();
+
+ /*
+ * Bind the packet-socket to ETH_P_ARP and the specified network
+ * interface.
+ */
+ r = bind(s, (struct sockaddr *)&address, sizeof(address));
+ if (r < 0)
+ return -n_acd_errno();
+
+ return 0;
+}
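+
+/*
+ * Worked example for the BPF endianness handling above (illustration only):
+ * for a MAC of 00:11:22:33:44:55 on a little-endian host, mac.u32[0] holds
+ * 0x33221100 and be32toh() turns it into 0x00112233. The BPF_LD|BPF_W|BPF_ABS
+ * instruction loads the first four SHA bytes from the packet as the
+ * big-endian value 0x00112233, so the XOR comparison matches exactly. On
+ * big-endian hosts, be32toh() is a no-op and the values match directly.
+ */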
+
+static int n_acd_setup_socket(NAcd *acd) {
+ int r, s;
+
+ s = socket(PF_PACKET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
+ if (s < 0)
+ return -n_acd_errno();
+
+ r = n_acd_bind_socket(acd, s);
+ if (r < 0)
+ goto error;
+
+ r = epoll_ctl(acd->fd_epoll, EPOLL_CTL_ADD, s,
+ &(struct epoll_event){
+ .events = EPOLLIN,
+ .data.u32 = N_ACD_EPOLL_SOCKET,
+ });
+ if (r < 0) {
+ r = -n_acd_errno();
+ goto error;
+ }
+
+ acd->fd_socket = s;
+ return 0;
+
+error:
+ close(s);
+ return r;
+}
+
+/**
+ * n_acd_start() - start the ACD engine
+ * @acd: ACD context
+ * @config: description of interface and desired IP address
+ *
+ * Start probing the given address on the given interface.
+ *
+ * The engine must not already be running, and there must not be
+ * any pending events.
+ *
+ * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the configuration
+ * was invalid, N_ACD_E_BUSY if the engine is running or there are
+ * pending events, or a negative error code on failure.
+ */
+_public_ int n_acd_start(NAcd *acd, NAcdConfig *config) {
+ uint64_t now, delay;
+ int r;
+
+ if (config->ifindex <= 0 ||
+ config->transport != N_ACD_TRANSPORT_ETHERNET ||
+ config->n_mac != ETH_ALEN ||
+ !memcmp(config->mac, (uint8_t[ETH_ALEN]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ETH_ALEN) ||
+ !config->ip.s_addr ||
+ !config->timeout_msec)
+ return N_ACD_E_INVALID_ARGUMENT;
+
+ if (acd->state != N_ACD_STATE_INIT || !c_list_is_empty(&acd->events))
+ return N_ACD_E_BUSY;
+
+ acd->config = *config;
+ memcpy(acd->mac, config->mac, config->n_mac);
+ acd->config.mac = acd->mac;
+ acd->timeout_multiplier = config->timeout_msec;
+
+ r = n_acd_setup_socket(acd);
+ if (r < 0)
+ goto error;
+
+ delay = 0;
+ if (acd->last_conflict != TIME_INFINITY) {
+ r = n_acd_now(&now);
+ if (r < 0)
+ goto error;
+
+ if (now < acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC)
+ delay = acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC - now;
+ }
+
+ r = n_acd_schedule(acd, delay, acd->timeout_multiplier * N_ACD_RFC_PROBE_WAIT_USEC);
+ if (r < 0)
+ goto error;
+
+ acd->state = N_ACD_STATE_PROBING;
+ acd->defend = N_ACD_DEFEND_NEVER;
+ acd->n_iteration = 0;
+ acd->last_defend = 0;
+ return 0;
+
+error:
+ n_acd_stop(acd);
+ return r;
+}
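+
+/*
+ * Example configuration (sketch; the concrete values are placeholders chosen
+ * by the caller, the field names follow NAcdConfig from n-acd.h):
+ *
+ *     NAcdConfig config = {
+ *             .ifindex = ifindex,     // e.g. from if_nametoindex()
+ *             .transport = N_ACD_TRANSPORT_ETHERNET,
+ *             .mac = mac,             // interface hardware address
+ *             .n_mac = ETH_ALEN,
+ *             .ip = (struct in_addr){ .s_addr = htobe32(0xa9fe0102) },  // 169.254.1.2
+ *             .timeout_msec = 100,
+ *     };
+ *
+ *     r = n_acd_start(acd, &config);
+ */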
+
+/**
+ * n_acd_stop() - stop the ACD engine
+ * @acd: ACD context
+ *
+ * Stop the engine. No new events may be triggered, but pending events are not
+ * flushed. Before calling n_acd_start() again, all pending events must be popped.
+ */
+_public_ void n_acd_stop(NAcd *acd) {
+ acd->state = N_ACD_STATE_INIT;
+ acd->defend = N_ACD_DEFEND_NEVER;
+ acd->n_iteration = 0;
+ acd->last_defend = 0;
+ timerfd_settime(acd->fd_timer, 0, &(struct itimerspec){}, NULL);
+
+ if (acd->fd_socket >= 0) {
+ assert(acd->fd_epoll >= 0);
+ epoll_ctl(acd->fd_epoll, EPOLL_CTL_DEL, acd->fd_socket, NULL);
+ close(acd->fd_socket);
+ acd->fd_socket = -1;
+ }
+}
+
+/**
+ * n_acd_announce() - announce the configured IP address
+ * @acd: ACD context
+ * @defend: defence policy
+ *
+ * Announce the IP address on the local link, and start defending it according
+ * to the given policy, which must be one of N_ACD_DEFEND_ONCE,
+ * N_ACD_DEFEND_NEVER, or N_ACD_DEFEND_ALWAYS.
+ *
+ * This must be called in response to an N_ACD_EVENT_READY event, and only
+ * after the given address has been configured on the given interface.
+ *
+ * Return: 0 on success, N_ACD_E_INVALID_ARGUMENT in case the defence policy
+ * is invalid, N_ACD_E_BUSY if this is not in response to an
+ * N_ACD_EVENT_READY event, or a negative error code on failure.
+ */
+_public_ int n_acd_announce(NAcd *acd, unsigned int defend) {
+ uint64_t now;
+ int r;
+
+ if (defend >= _N_ACD_DEFEND_N)
+ return N_ACD_E_INVALID_ARGUMENT;
+ if (acd->state != N_ACD_STATE_CONFIGURING)
+ return N_ACD_E_BUSY;
+
+ /*
+ * Sending announcements means we finished probing and use the address
+ * now. We therefore reset the conflict counter in case we adhered to
+ * the rate-limit. Since probing is properly delayed, a well-behaving
+ * client will always reset the conflict counter here. However, if you
+ * force-use an address regardless of conflicts, then this will not
+ * trigger and the conflict counter stays untouched.
+ */
+ if (acd->last_conflict != TIME_INFINITY) {
+ r = n_acd_now(&now);
+ if (r < 0)
+ return r;
+
+ if (now >= acd->last_conflict + N_ACD_RFC_RATE_LIMIT_INTERVAL_USEC)
+ acd->n_conflicts = 0;
+ }
+
+ /*
+ * Instead of sending the first announcement here, we schedule an idle
+ * timer. This avoids possibly recursing into the user callback. We
+ * should never trigger callbacks from arbitrary stacks, but always
+ * restrict them to the dispatcher.
+ */
+ r = n_acd_schedule(acd, 0, 0);
+ if (r < 0)
+ return r;
+
+ acd->state = N_ACD_STATE_ANNOUNCING;
+ acd->defend = defend;
+ acd->n_iteration = 0;
+ return 0;
+}