/*
    Simple prototype Xen Store Daemon providing simple tree-like database.
    Copyright (C) 2005 Rusty Russell IBM Corporation

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; If not, see <http://www.gnu.org/licenses/>.
*/

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#ifndef NO_SOCKETS
#include <sys/socket.h>
#include <sys/un.h>
#endif
#include <sys/time.h>
#include <time.h>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <syslog.h>
#include <assert.h>
#include <poll.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <xenevtchn.h>

#include "utils.h"
#include "list.h"
#include "talloc.h"
#include "xs_lib.h"
#include "xenstored_core.h"
#include "xenstored_watch.h"
#include "xenstored_transaction.h"
#include "xenstored_domain.h"
#include "xenstored_control.h"
#include "tdb.h"

#ifndef NO_SOCKETS
#if defined(HAVE_SYSTEMD)
#define XEN_SYSTEMD_ENABLED 1
#endif
#endif

#if defined(XEN_SYSTEMD_ENABLED)
#include <systemd/sd-daemon.h>
#endif

extern xenevtchn_handle *xce_handle; /* in xenstored_domain.c */
static int xce_pollfd_idx = -1;
static struct pollfd *fds;
static unsigned int current_array_size;
static unsigned int nr_fds;
static unsigned int delayed_requests;
static int sock = -1;

int orig_argc;
char **orig_argv;

static bool verbose = false;
LIST_HEAD(connections);
int tracefd = -1;
static bool recovery = true;
bool keep_orphans = false;
static int reopen_log_pipe[2];
static int reopen_log_pipe0_pollfd_idx = -1;
char *tracefile = NULL;
TDB_CONTEXT *tdb_ctx = NULL;
unsigned int trace_flags = TRACE_OBJ | TRACE_IO;

static const char *sockmsg_string(enum xsd_sockmsg_type type);

int quota_nb_entry_per_domain = 1000;
int quota_nb_watch_per_domain = 128;
int quota_max_entry_size = 2048; /* 2K */
int quota_max_transaction = 10;
int quota_nb_perms_per_node = 5;
int quota_trans_nodes = 1024;
int quota_max_path_len = XENSTORE_REL_PATH_MAX;
int quota_req_outstanding = 20;
int quota_memory_per_domain_soft = 2 * 1024 * 1024; /* 2 MB */
int quota_memory_per_domain_hard = 2 * 1024 * 1024 + 512 * 1024; /* 2.5 MB */

unsigned int timeout_watch_event_msec = 20000;

void trace(const char *fmt, ...)
{
	va_list arglist;
	char *str;
	char sbuf[1024];
	int ret, dummy;

	if (tracefd < 0)
		return;

	/* try to use a static buffer */
	va_start(arglist, fmt);
	ret = vsnprintf(sbuf, 1024, fmt, arglist);
	va_end(arglist);

	if (ret <= 1024) {
		dummy = write(tracefd, sbuf, ret);
		return;
	}

	/* fall back to dynamic allocation */
	va_start(arglist, fmt);
	str = talloc_vasprintf(NULL, fmt, arglist);
	va_end(arglist);

	if (str) {
		dummy = write(tracefd, str, strlen(str));
		talloc_free(str);
	}
}

static void trace_io(const struct connection *conn,
		     const struct buffered_data *data, int out)
{
	unsigned int i;
	time_t now;
	struct tm *tm;

	if (tracefd < 0 || !(trace_flags & TRACE_IO))
		return;

	now = time(NULL);
	tm = localtime(&now);

	trace("io: %s %p %04d%02d%02d %02d:%02d:%02d %s (",
	      out ? "OUT" : "IN", conn,
	      tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
	      tm->tm_min, tm->tm_sec, sockmsg_string(data->hdr.msg.type));

	for (i = 0; i < data->hdr.msg.len; i++)
		trace("%c", (data->buffer[i] != '\0') ?
data->buffer[i] : ' '); trace(")\n"); } void trace_create(const void *data, const char *type) { if (trace_flags & TRACE_OBJ) trace("obj: CREATE %s %p\n", type, data); } void trace_destroy(const void *data, const char *type) { if (trace_flags & TRACE_OBJ) trace("obj: DESTROY %s %p\n", type, data); } /** * Signal handler for SIGHUP, which requests that the trace log is reopened * (in the main loop). A single byte is written to reopen_log_pipe, to awaken * the poll() in the main loop. */ static void trigger_reopen_log(int signal __attribute__((unused))) { char c = 'A'; int dummy; dummy = write(reopen_log_pipe[1], &c, 1); } void close_log(void) { if (tracefd >= 0) close(tracefd); tracefd = -1; } void reopen_log(void) { if (tracefile) { close_log(); tracefd = open(tracefile, O_WRONLY | O_CREAT | O_APPEND | O_CLOEXEC, 0600); if (tracefd < 0) perror("Could not open tracefile"); else trace("\n***\n"); } } static uint64_t get_now_msec(void) { struct timespec now_ts; if (clock_gettime(CLOCK_MONOTONIC, &now_ts)) barf_perror("Could not find time (clock_gettime failed)"); return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; } /* * Remove a struct buffered_data from the list of outgoing data. * A struct buffered_data related to a request having caused watch events to be * sent is kept until all those events have been written out. * Each watch event is referencing the related request via pend.req, while the * number of watch events caused by a request is kept in pend.ref.event_cnt * (those two cases are mutually exclusive, so the two fields can share memory * via a union). * The struct buffered_data is freed only if no related watch event is * referencing it. The related return data can be freed right away. */ static void free_buffered_data(struct buffered_data *out, struct connection *conn) { struct buffered_data *req; list_del(&out->list); out->on_out_list = false; /* * Update conn->timeout_msec with the next found timeout value in the * queued pending requests. */ if (out->timeout_msec) { conn->timeout_msec = 0; list_for_each_entry(req, &conn->out_list, list) { if (req->timeout_msec) { conn->timeout_msec = req->timeout_msec; break; } } } domain_memory_add_nochk(conn->id, -out->hdr.msg.len - sizeof(out->hdr)); if (out->hdr.msg.type == XS_WATCH_EVENT) { req = out->pend.req; if (req) { req->pend.ref.event_cnt--; if (!req->pend.ref.event_cnt && !req->on_out_list) { if (req->on_ref_list) { domain_outstanding_domid_dec( req->pend.ref.domid); list_del(&req->list); } talloc_free(req); } } } else if (out->pend.ref.event_cnt) { /* Hang out off from conn. */ talloc_steal(NULL, out); if (out->buffer != out->default_buffer) talloc_free(out->buffer); list_add(&out->list, &conn->ref_list); out->on_ref_list = true; return; } else domain_outstanding_dec(conn); talloc_free(out); } static void check_event_timeout(struct connection *conn, uint64_t msecs, int *ptimeout) { uint64_t delta; struct buffered_data *out, *tmp; if (!conn->timeout_msec) return; delta = conn->timeout_msec - msecs; if (conn->timeout_msec <= msecs) { delta = 0; list_for_each_entry_safe(out, tmp, &conn->out_list, list) { /* * Only look at buffers with timeout and no data * already written to the ring. */ if (out->timeout_msec && out->inhdr && !out->used) { if (out->timeout_msec > msecs) { conn->timeout_msec = out->timeout_msec; delta = conn->timeout_msec - msecs; break; } /* * Free out without updating conn->timeout_msec, * as the update is done in this loop already. 
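				 * Zeroing out->timeout_msec first also keeps
				 * free_buffered_data() from rescanning the
				 * queue and overwriting the value computed
				 * here.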
*/ out->timeout_msec = 0; trace("watch event path %s for domain %u timed out\n", out->buffer, conn->id); free_buffered_data(out, conn); } } if (!delta) { conn->timeout_msec = 0; return; } } if (*ptimeout == -1 || *ptimeout > delta) *ptimeout = delta; } void conn_free_buffered_data(struct connection *conn) { struct buffered_data *out; while ((out = list_top(&conn->out_list, struct buffered_data, list))) free_buffered_data(out, conn); conn->timeout_msec = 0; } static bool write_messages(struct connection *conn) { int ret; struct buffered_data *out; out = list_top(&conn->out_list, struct buffered_data, list); if (out == NULL) return true; if (out->inhdr) { if (verbose) xprintf("Writing msg %s (%.*s) out to %p\n", sockmsg_string(out->hdr.msg.type), out->hdr.msg.len, out->buffer, conn); ret = conn->funcs->write(conn, out->hdr.raw + out->used, sizeof(out->hdr) - out->used); if (ret < 0) return false; out->used += ret; if (out->used < sizeof(out->hdr)) return true; out->inhdr = false; out->used = 0; /* Second write might block if non-zero. */ if (out->hdr.msg.len && !conn->domain) return true; } ret = conn->funcs->write(conn, out->buffer + out->used, out->hdr.msg.len - out->used); if (ret < 0) return false; out->used += ret; if (out->used != out->hdr.msg.len) return true; trace_io(conn, out, 1); free_buffered_data(out, conn); return true; } static int undelay_request(void *_req) { struct delayed_request *req = _req; list_del(&req->list); delayed_requests--; return 0; } static void call_delayed(struct delayed_request *req) { if (req->func(req)) { undelay_request(req); talloc_set_destructor(req, NULL); } } int delay_request(struct connection *conn, struct buffered_data *in, bool (*func)(struct delayed_request *), void *data, bool no_quota_check) { struct delayed_request *req; /* * Only allow one request can be delayed for an unprivileged * connection. */ if (!no_quota_check && domain_is_unprivileged(conn) && !list_empty(&conn->delayed)) return ENOSPC; req = talloc(in, struct delayed_request); if (!req) return ENOMEM; /* For the case of connection being closed. */ talloc_set_destructor(req, undelay_request); req->in = in; req->func = func; req->data = data; delayed_requests++; list_add(&req->list, &conn->delayed); /* Unlink the request from conn if this is the current one */ if (conn->in == in) conn->in = NULL; return 0; } static int destroy_conn(void *_conn) { struct connection *conn = _conn; struct buffered_data *req; /* Flush outgoing if possible, but don't block. */ if (!conn->domain) { struct pollfd pfd; pfd.fd = conn->fd; pfd.events = POLLOUT; while (!list_empty(&conn->out_list) && poll(&pfd, 1, 0) == 1) if (!write_messages(conn)) break; close(conn->fd); } conn_free_buffered_data(conn); conn_delete_all_watches(conn); list_for_each_entry(req, &conn->ref_list, list) req->on_ref_list = false; if (conn->target) talloc_unlink(conn, conn->target); list_del(&conn->list); trace_destroy(conn, "connection"); return 0; } static bool conn_can_read(struct connection *conn) { if (conn->is_ignored) return false; if (!conn->funcs->can_read(conn)) return false; /* * For stalled connection, we want to process the pending * command as soon as live-update has aborted. 
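	 * Such a connection only becomes readable again once
	 * lu_is_pending() returns false.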
*/ if (conn->is_stalled) return !lu_is_pending(); return true; } static bool conn_can_write(struct connection *conn) { return !conn->is_ignored && conn->funcs->can_write(conn); } /* This function returns index inside the array if succeed, -1 if fail */ static int set_fd(int fd, short events) { int ret; if (current_array_size < nr_fds + 1) { struct pollfd *new_fds = NULL; unsigned long newsize; /* Round up to 2^8 boundary, in practice this just * make newsize larger than current_array_size. */ newsize = ROUNDUP(nr_fds + 1, 8); new_fds = realloc(fds, sizeof(struct pollfd)*newsize); if (!new_fds) goto fail; fds = new_fds; memset(&fds[0] + current_array_size, 0, sizeof(struct pollfd ) * (newsize-current_array_size)); current_array_size = newsize; } fds[nr_fds].fd = fd; fds[nr_fds].events = events; ret = nr_fds; nr_fds++; return ret; fail: syslog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd); return -1; } static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) { struct connection *conn; struct wrl_timestampt now; uint64_t msecs; if (fds) memset(fds, 0, sizeof(struct pollfd) * current_array_size); nr_fds = 0; /* In case of delayed requests pause for max 1 second. */ *ptimeout = delayed_requests ? 1000 : -1; if (sock != -1) *p_sock_pollfd_idx = set_fd(sock, POLLIN|POLLPRI); if (reopen_log_pipe[0] != -1) reopen_log_pipe0_pollfd_idx = set_fd(reopen_log_pipe[0], POLLIN|POLLPRI); if (xce_handle != NULL) xce_pollfd_idx = set_fd(xenevtchn_fd(xce_handle), POLLIN|POLLPRI); wrl_gettime_now(&now); wrl_log_periodic(now); msecs = get_now_msec(); list_for_each_entry(conn, &connections, list) { if (conn->domain) { wrl_check_timeout(conn->domain, now, ptimeout); check_event_timeout(conn, msecs, ptimeout); if (conn_can_read(conn) || (conn_can_write(conn) && !list_empty(&conn->out_list))) *ptimeout = 0; } else { short events = POLLIN|POLLPRI; if (!list_empty(&conn->out_list)) events |= POLLOUT; conn->pollfd_idx = set_fd(conn->fd, events); /* * For stalled connection, we want to process the * pending command as soon as live-update has aborted. */ if (conn->is_stalled && !lu_is_pending()) *ptimeout = 0; } } } void set_tdb_key(const char *name, TDB_DATA *key) { /* * Dropping const is fine here, as the key will never be modified * by TDB. */ key->dptr = (char *)name; key->dsize = strlen(name); } static void get_acc_data(TDB_DATA *key, struct node_account_data *acc) { TDB_DATA old_data; struct xs_tdb_record_hdr *hdr; if (acc->memory < 0) { old_data = tdb_fetch(tdb_ctx, *key); /* No check for error, as the node might not exist. */ if (old_data.dptr == NULL) { acc->memory = 0; } else { hdr = (void *)old_data.dptr; acc->memory = old_data.dsize; acc->domid = hdr->perms[0].id; } talloc_free(old_data.dptr); } } /* * Per-transaction nodes need to be accounted for the transaction owner. * Those nodes are stored in the data base with the transaction generation * count prepended (e.g. 123/local/domain/...). So testing for the node's * key not to start with "/" or "@" is sufficient. */ static unsigned int get_acc_domid(struct connection *conn, TDB_DATA *key, unsigned int domid) { return (!conn || key->dptr[0] == '/' || key->dptr[0] == '@') ? 
domid : conn->id; } int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, struct node_account_data *acc, bool no_quota_check) { struct xs_tdb_record_hdr *hdr = (void *)data->dptr; struct node_account_data old_acc = {}; unsigned int old_domid, new_domid; int ret; if (!acc) old_acc.memory = -1; else old_acc = *acc; get_acc_data(key, &old_acc); old_domid = get_acc_domid(conn, key, old_acc.domid); new_domid = get_acc_domid(conn, key, hdr->perms[0].id); /* * Don't check for ENOENT, as we want to be able to switch orphaned * nodes to new owners. */ if (old_acc.memory) domain_memory_add_nochk(old_domid, -old_acc.memory - key->dsize); ret = domain_memory_add(new_domid, data->dsize + key->dsize, no_quota_check); if (ret) { /* Error path, so no quota check. */ if (old_acc.memory) domain_memory_add_nochk(old_domid, old_acc.memory + key->dsize); return ret; } /* TDB should set errno, but doesn't even set ecode AFAICT. */ if (tdb_store(tdb_ctx, *key, *data, TDB_REPLACE) != 0) { domain_memory_add_nochk(new_domid, -data->dsize - key->dsize); /* Error path, so no quota check. */ if (old_acc.memory) domain_memory_add_nochk(old_domid, old_acc.memory + key->dsize); errno = EIO; return errno; } if (acc) { /* Don't use new_domid, as it might be a transaction node. */ acc->domid = hdr->perms[0].id; acc->memory = data->dsize; } return 0; } int do_tdb_delete(struct connection *conn, TDB_DATA *key, struct node_account_data *acc) { struct node_account_data tmp_acc; unsigned int domid; if (!acc) { acc = &tmp_acc; acc->memory = -1; } get_acc_data(key, acc); if (tdb_delete(tdb_ctx, *key)) { errno = EIO; return errno; } if (acc->memory) { domid = get_acc_domid(conn, key, acc->domid); domain_memory_add_nochk(domid, -acc->memory - key->dsize); } return 0; } /* * If it fails, returns NULL and sets errno. * Temporary memory allocations will be done with ctx. */ struct node *read_node(struct connection *conn, const void *ctx, const char *name) { TDB_DATA key, data; struct xs_tdb_record_hdr *hdr; struct node *node; int err; node = talloc(ctx, struct node); if (!node) { errno = ENOMEM; return NULL; } node->name = talloc_strdup(node, name); if (!node->name) { talloc_free(node); errno = ENOMEM; return NULL; } transaction_prepend(conn, name, &key); data = tdb_fetch(tdb_ctx, key); if (data.dptr == NULL) { if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) { node->generation = NO_GENERATION; err = access_node(conn, node, NODE_ACCESS_READ, NULL); errno = err ? : ENOENT; } else { log("TDB error on read: %s", tdb_errorstr(tdb_ctx)); errno = EIO; } goto error; } node->parent = NULL; talloc_steal(node, data.dptr); /* Datalen, childlen, number of permissions */ hdr = (void *)data.dptr; node->generation = hdr->generation; node->perms.num = hdr->num_perms; node->datalen = hdr->datalen; node->childlen = hdr->childlen; /* Permissions are struct xs_permissions. */ node->perms.p = hdr->perms; node->acc.domid = get_node_owner(node); node->acc.memory = data.dsize; if (domain_adjust_node_perms(node)) goto error; /* If owner is gone reset currently accounted memory size. */ if (node->acc.domid != get_node_owner(node)) node->acc.memory = 0; /* Data is binary blob (usually ascii, no nul). */ node->data = node->perms.p + hdr->num_perms; /* Children is strings, nul separated. 
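	 * The TDB record layout is: header, permission array, data blob,
	 * child names.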
*/ node->children = node->data + node->datalen; if (access_node(conn, node, NODE_ACCESS_READ, NULL)) goto error; return node; error: talloc_free(node); return NULL; } static bool read_node_can_propagate_errno(void) { /* * 2 error cases for read_node() can always be propagated up: * ENOMEM, because this has nothing to do with the node being in the * data base or not, but is caused by a general lack of memory. * ENOSPC, because this is related to hitting quota limits which need * to be respected. */ return errno == ENOMEM || errno == ENOSPC; } int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, bool no_quota_check) { TDB_DATA data; void *p; struct xs_tdb_record_hdr *hdr; if (domain_adjust_node_perms(node)) return errno; data.dsize = sizeof(*hdr) + node->perms.num * sizeof(node->perms.p[0]) + node->datalen + node->childlen; if (!no_quota_check && domain_is_unprivileged(conn) && data.dsize >= quota_max_entry_size) { errno = ENOSPC; return errno; } data.dptr = talloc_size(node, data.dsize); if (!data.dptr) { errno = ENOMEM; return errno; } hdr = (void *)data.dptr; hdr->generation = node->generation; hdr->num_perms = node->perms.num; hdr->datalen = node->datalen; hdr->childlen = node->childlen; memcpy(hdr->perms, node->perms.p, node->perms.num * sizeof(*node->perms.p)); p = hdr->perms + node->perms.num; memcpy(p, node->data, node->datalen); p += node->datalen; memcpy(p, node->children, node->childlen); if (do_tdb_write(conn, key, &data, &node->acc, no_quota_check)) return EIO; return 0; } /* * Write the node. If the node is written, caller can find the key used in * node->key. This can later be used if the change needs to be reverted. */ static int write_node(struct connection *conn, struct node *node, bool no_quota_check) { int ret; if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) return errno; ret = write_node_raw(conn, &node->key, node, no_quota_check); if (ret && conn && conn->transaction) { /* * Reverting access_node() is hard, so just fail the * transaction. */ fail_transaction(conn->transaction); } return ret; } unsigned int perm_for_conn(struct connection *conn, const struct node_perms *perms) { unsigned int i; unsigned int mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER; /* Owners and tools get it all... */ if (!domain_is_unprivileged(conn) || perms->p[0].id == conn->id || (conn->target && perms->p[0].id == conn->target->id)) return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask; for (i = 1; i < perms->num; i++) if (!(perms->p[i].perms & XS_PERM_IGNORE) && (perms->p[i].id == conn->id || (conn->target && perms->p[i].id == conn->target->id))) return perms->p[i].perms & mask; return perms->p[0].perms & mask; } /* * Get name of node parent. * Temporary memory allocations are done with ctx. */ char *get_parent(const void *ctx, const char *node) { char *parent; char *slash = strrchr(node + 1, '/'); parent = slash ? talloc_asprintf(ctx, "%.*s", (int)(slash - node), node) : talloc_strdup(ctx, "/"); if (!parent) errno = ENOMEM; return parent; } /* * What do parents say? * Temporary memory allocations are done with ctx. */ static int ask_parents(struct connection *conn, const void *ctx, const char *name, unsigned int *perm) { struct node *node; do { name = get_parent(ctx, name); if (!name) return errno; node = read_node(conn, ctx, name); if (node) break; if (read_node_can_propagate_errno()) return errno; } while (!streq(name, "/")); /* No permission at root? We're in trouble. 
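	 * (Getting here means even the root node could not be read.)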
*/ if (!node) { corrupt(conn, "No permissions file at root"); *perm = XS_PERM_NONE; return 0; } *perm = perm_for_conn(conn, &node->perms); return 0; } /* * We have a weird permissions system. You can allow someone into a * specific node without allowing it in the parents. If it's going to * fail, however, we don't want the errno to indicate any information * about the node. * Temporary memory allocations are done with ctx. */ static int errno_from_parents(struct connection *conn, const void *ctx, const char *node, int errnum, unsigned int perm) { unsigned int parent_perm = XS_PERM_NONE; /* We always tell them about memory failures. */ if (errnum == ENOMEM) return errnum; if (ask_parents(conn, ctx, node, &parent_perm)) return errno; if (parent_perm & perm) return errnum; return EACCES; } /* * If it fails, returns NULL and sets errno. * Temporary memory allocations are done with ctx. */ static struct node *get_node(struct connection *conn, const void *ctx, const char *name, unsigned int perm) { struct node *node; node = read_node(conn, ctx, name); /* If we don't have permission, we don't have node. */ if (node) { if ((perm_for_conn(conn, &node->perms) & perm) != perm) { errno = EACCES; node = NULL; } } /* Clean up errno if they weren't supposed to know. */ if (!node && !read_node_can_propagate_errno()) errno = errno_from_parents(conn, ctx, name, errno, perm); return node; } static struct buffered_data *new_buffer(void *ctx) { struct buffered_data *data; data = talloc_zero(ctx, struct buffered_data); if (data == NULL) return NULL; data->inhdr = true; return data; } /* Return length of string (including nul) at this offset. * If there is no nul, returns 0 for failure. */ unsigned int get_string(const struct buffered_data *data, unsigned int offset) { const char *nul; if (offset >= data->used) return 0; nul = memchr(data->buffer + offset, 0, data->used - offset); if (!nul) return 0; return nul - (data->buffer + offset) + 1; } /* Break input into vectors, return the number, fill in up to num of them. * Always returns the actual number of nuls in the input. Stores the * positions of the starts of the nul-terminated strings in vec. * Callers who use this and then rely only on vec[] will * ignore any data after the final nul. */ unsigned int get_strings(struct buffered_data *data, char *vec[], unsigned int num) { unsigned int off, i, len; off = i = 0; while ((len = get_string(data, off)) != 0) { if (i < num) vec[i] = data->buffer + off; i++; off += len; } return i; } static void send_error(struct connection *conn, int error) { unsigned int i; for (i = 0; error != xsd_errors[i].errnum; i++) { if (i == ARRAY_SIZE(xsd_errors) - 1) { eprintf("xenstored: error %i untranslatable", error); i = 0; /* EINVAL */ break; } } send_reply(conn, XS_ERROR, xsd_errors[i].errstring, strlen(xsd_errors[i].errstring) + 1); } void send_reply(struct connection *conn, enum xsd_sockmsg_type type, const void *data, unsigned int len) { struct buffered_data *bdata = conn->in; assert(type != XS_WATCH_EVENT); if ( len > XENSTORE_PAYLOAD_MAX ) { send_error(conn, E2BIG); return; } if (!bdata) return; bdata->inhdr = true; bdata->used = 0; bdata->timeout_msec = 0; bdata->watch_event = false; if (len <= DEFAULT_BUFFER_SIZE) { bdata->buffer = bdata->default_buffer; /* Don't check quota, path might be used for returning error. 
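		 * Small replies reuse the pre-allocated default_buffer,
		 * so no talloc allocation is needed here.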
*/ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); } else { bdata->buffer = talloc_array(bdata, char, len); if (!bdata->buffer || domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { send_error(conn, ENOMEM); return; } } conn->in = NULL; /* Update relevant header fields and fill in the message body. */ bdata->hdr.msg.type = type; bdata->hdr.msg.len = len; memcpy(bdata->buffer, data, len); /* Queue for later transmission. */ list_add_tail(&bdata->list, &conn->out_list); bdata->on_out_list = true; domain_outstanding_inc(conn); } /* * Send a watch event. * As this is not directly related to the current command, errors can't be * reported. */ void send_event(struct buffered_data *req, struct connection *conn, const char *path, const char *token) { struct buffered_data *bdata, *bd; unsigned int len; len = strlen(path) + 1 + strlen(token) + 1; /* Don't try to send over-long events. */ if (len > XENSTORE_PAYLOAD_MAX) return; bdata = new_buffer(conn); if (!bdata) return; bdata->buffer = talloc_array(bdata, char, len); if (!bdata->buffer) { talloc_free(bdata); return; } strcpy(bdata->buffer, path); strcpy(bdata->buffer + strlen(path) + 1, token); bdata->hdr.msg.type = XS_WATCH_EVENT; bdata->hdr.msg.len = len; /* * Check whether an identical event is pending already. * Special events are excluded from that check. */ if (path[0] != '@') { list_for_each_entry(bd, &conn->out_list, list) { if (bd->watch_event && bd->hdr.msg.len == len && !memcmp(bdata->buffer, bd->buffer, len)) { trace("dropping duplicate watch %s %s for domain %u\n", path, token, conn->id); talloc_free(bdata); return; } } } if (domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { talloc_free(bdata); return; } if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; if (!conn->timeout_msec) conn->timeout_msec = bdata->timeout_msec; } bdata->watch_event = true; bdata->pend.req = req; if (req) req->pend.ref.event_cnt++; /* Queue for later transmission. */ list_add_tail(&bdata->list, &conn->out_list); bdata->on_out_list = true; } /* Some routines (write, mkdir, etc) just need a non-error return */ void send_ack(struct connection *conn, enum xsd_sockmsg_type type) { send_reply(conn, type, "OK", sizeof("OK")); } static bool valid_chars(const char *node) { /* Nodes can have lots of crap. */ return (strspn(node, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789-/_@") == strlen(node)); } bool is_valid_nodename(const char *node) { int local_off = 0; unsigned int domid; /* Must start in /. */ if (!strstarts(node, "/")) return false; /* Cannot end in / (unless it's just "/"). */ if (strends(node, "/") && !streq(node, "/")) return false; /* No double //. */ if (strstr(node, "//")) return false; if (sscanf(node, "/local/domain/%5u/%n", &domid, &local_off) != 1) local_off = 0; if (strlen(node) > local_off + quota_max_path_len) return false; return valid_chars(node); } /* We expect one arg in the input: return NULL otherwise. * The payload must contain exactly one nul, at the end. 
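 * Example: a payload of "/some/path\0" is accepted, while "a\0b\0" or an
 * unterminated string yields NULL.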
*/ const char *onearg(struct buffered_data *in) { if (!in->used || get_string(in, 0) != in->used) return NULL; return in->buffer; } static char *perms_to_strings(const void *ctx, const struct node_perms *perms, unsigned int *len) { unsigned int i; char *strings = NULL; char buffer[MAX_STRLEN(unsigned int) + 1]; for (*len = 0, i = 0; i < perms->num; i++) { if (!xs_perm_to_string(&perms->p[i], buffer, sizeof(buffer))) return NULL; strings = talloc_realloc(ctx, strings, char, *len + strlen(buffer) + 1); if (!strings) return NULL; strcpy(strings + *len, buffer); *len += strlen(buffer) + 1; } return strings; } char *canonicalize(struct connection *conn, const void *ctx, const char *node) { const char *prefix; if (!node || (node[0] == '/') || (node[0] == '@')) return (char *)node; prefix = get_implicit_path(conn); if (prefix) return talloc_asprintf(ctx, "%s/%s", prefix, node); return (char *)node; } static struct node *get_node_canonicalized(struct connection *conn, const void *ctx, const char *name, char **canonical_name, unsigned int perm) { char *tmp_name; if (!canonical_name) canonical_name = &tmp_name; *canonical_name = canonicalize(conn, ctx, name); if (!*canonical_name) return NULL; if (!is_valid_nodename(*canonical_name)) { errno = EINVAL; return NULL; } return get_node(conn, ctx, *canonical_name, perm); } static struct node *get_spec_node(struct connection *conn, const void *ctx, const char *name, char **canonical_name, unsigned int perm) { if (name[0] == '@') return get_node(conn, ctx, name, perm); return get_node_canonicalized(conn, ctx, name, canonical_name, perm); } static int send_directory(const void *ctx, struct connection *conn, struct buffered_data *in) { struct node *node; node = get_node_canonicalized(conn, ctx, onearg(in), NULL, XS_PERM_READ); if (!node) return errno; send_reply(conn, XS_DIRECTORY, node->children, node->childlen); return 0; } static int send_directory_part(const void *ctx, struct connection *conn, struct buffered_data *in) { unsigned int off, len, maxlen, genlen; char *child, *data; struct node *node; char gen[24]; if (xs_count_strings(in->buffer, in->used) != 2) return EINVAL; /* First arg is node name. */ node = get_node_canonicalized(conn, ctx, in->buffer, NULL, XS_PERM_READ); if (!node) return errno; /* Second arg is childlist offset. */ off = atoi(in->buffer + strlen(in->buffer) + 1); genlen = snprintf(gen, sizeof(gen), "%"PRIu64, node->generation) + 1; /* Offset behind list: just return a list with an empty string. 
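	 * The reply then consists of the generation count string followed
	 * by an empty string.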
*/ if (off >= node->childlen) { gen[genlen] = 0; send_reply(conn, XS_DIRECTORY_PART, gen, genlen + 1); return 0; } len = 0; maxlen = XENSTORE_PAYLOAD_MAX - genlen - 1; child = node->children + off; while (len + strlen(child) < maxlen) { len += strlen(child) + 1; child += strlen(child) + 1; if (off + len == node->childlen) break; } data = talloc_array(ctx, char, genlen + len + 1); if (!data) return ENOMEM; memcpy(data, gen, genlen); memcpy(data + genlen, node->children + off, len); if (off + len == node->childlen) { data[genlen + len] = 0; len++; } send_reply(conn, XS_DIRECTORY_PART, data, genlen + len); return 0; } static int do_read(const void *ctx, struct connection *conn, struct buffered_data *in) { struct node *node; node = get_node_canonicalized(conn, ctx, onearg(in), NULL, XS_PERM_READ); if (!node) return errno; send_reply(conn, XS_READ, node->data, node->datalen); return 0; } /* Must not be / */ static char *basename(const char *name) { return strrchr(name, '/') + 1; } static int add_child(const void *ctx, struct node *parent, const char *name) { const char *base; unsigned int baselen; char *children; base = basename(name); baselen = strlen(base) + 1; children = talloc_array(ctx, char, parent->childlen + baselen); if (!children) return ENOMEM; memcpy(children, parent->children, parent->childlen); memcpy(children + parent->childlen, base, baselen); parent->children = children; parent->childlen += baselen; return 0; } static struct node *construct_node(struct connection *conn, const void *ctx, const char *name) { const char **names = NULL; unsigned int levels = 0; struct node *node = NULL; struct node *parent = NULL; const char *parentname = talloc_strdup(ctx, name); if (!parentname) return NULL; /* Walk the path up until an existing node is found. */ while (!parent) { names = talloc_realloc(ctx, names, const char *, levels + 1); if (!names) goto nomem; /* * names[0] is the name of the node to construct initially, * names[1] is its parent, and so on. */ names[levels] = parentname; parentname = get_parent(ctx, parentname); if (!parentname) return NULL; /* Try to read parent node until we found an existing one. */ parent = read_node(conn, ctx, parentname); if (!parent && (errno != ENOENT || !strcmp(parentname, "/"))) return NULL; levels++; } /* Walk the path down again constructing the missing nodes. */ for (; levels > 0; levels--) { /* Add child to parent. */ if (add_child(ctx, parent, names[levels - 1])) goto nomem; /* Allocate node */ node = talloc(ctx, struct node); if (!node) goto nomem; node->name = talloc_steal(node, names[levels - 1]); /* Inherit permissions, unpriv domains own what they create. */ node->perms.num = parent->perms.num; node->perms.p = talloc_memdup(node, parent->perms.p, node->perms.num * sizeof(*node->perms.p)); if (!node->perms.p) goto nomem; if (domain_is_unprivileged(conn)) node->perms.p[0].id = conn->id; /* No children, no data */ node->children = node->data = NULL; node->childlen = node->datalen = 0; node->acc.memory = 0; node->parent = parent; parent = node; } return node; nomem: errno = ENOMEM; return NULL; } static void destroy_node_rm(struct connection *conn, struct node *node) { if (streq(node->name, "/")) corrupt(NULL, "Destroying root node!"); do_tdb_delete(conn, &node->key, &node->acc); } static int destroy_node(struct connection *conn, struct node *node) { destroy_node_rm(conn, node); domain_nbentry_dec(conn, get_node_owner(node)); /* * It is not possible to easily revert the changes in a transaction. 
* So if the failure happens in a transaction, mark it as fail to * prevent any commit. */ if ( conn->transaction ) fail_transaction(conn->transaction); return 0; } static struct node *create_node(struct connection *conn, const void *ctx, const char *name, void *data, unsigned int datalen) { struct node *node, *i, *j; int ret; node = construct_node(conn, ctx, name); if (!node) return NULL; if (conn && conn->transaction) ta_node_created(conn->transaction); node->data = data; node->datalen = datalen; /* * We write out the nodes bottom up. * All new created nodes will have i->parent set, while the final * node will be already existing and won't have i->parent set. * New nodes are subject to quota handling. */ for (i = node; i; i = i->parent) { /* i->parent is set for each new node, so check quota. */ if (i->parent && domain_nbentry(conn) >= quota_nb_entry_per_domain) { ret = ENOSPC; goto err; } ret = write_node(conn, i, false); if (ret) goto err; /* Account for new node */ if (i->parent) { if (domain_nbentry_inc(conn, get_node_owner(i))) { destroy_node_rm(conn, i); return NULL; } } } return node; err: /* * We failed to update TDB for some of the nodes. Undo any work that * have already been done. */ for (j = node; j != i; j = j->parent) destroy_node(conn, j); /* We don't need to keep the nodes around, so free them. */ i = node; while (i) { j = i; i = i->parent; talloc_free(j); } errno = ret; return NULL; } /* path, data... */ static int do_write(const void *ctx, struct connection *conn, struct buffered_data *in) { unsigned int offset, datalen; struct node *node; char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */ char *name; /* Extra "strings" can be created by binary data. */ if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) return EINVAL; offset = strlen(vec[0]) + 1; datalen = in->used - offset; node = get_node_canonicalized(conn, ctx, vec[0], &name, XS_PERM_WRITE); if (!node) { /* No permissions, invalid input? */ if (errno != ENOENT) return errno; node = create_node(conn, ctx, name, in->buffer + offset, datalen); if (!node) return errno; } else { node->data = in->buffer + offset; node->datalen = datalen; if (write_node(conn, node, false)) return errno; } fire_watches(conn, ctx, name, node, false, NULL); send_ack(conn, XS_WRITE); return 0; } static int do_mkdir(const void *ctx, struct connection *conn, struct buffered_data *in) { struct node *node; char *name; node = get_node_canonicalized(conn, ctx, onearg(in), &name, XS_PERM_WRITE); /* If it already exists, fine. */ if (!node) { /* No permissions? */ if (errno != ENOENT) return errno; if (!name) return ENOMEM; node = create_node(conn, ctx, name, NULL, 0); if (!node) return errno; fire_watches(conn, ctx, name, node, false, NULL); } send_ack(conn, XS_MKDIR); return 0; } /* Delete memory using memmove. */ static void memdel(void *mem, unsigned off, unsigned len, unsigned total) { memmove(mem + off, mem + off + len, total - off - len); } static int remove_child_entry(struct connection *conn, struct node *node, size_t offset) { size_t childlen = strlen(node->children + offset); memdel(node->children, offset, childlen + 1, node->childlen); node->childlen -= childlen + 1; return write_node(conn, node, true); } static int delete_child(struct connection *conn, struct node *node, const char *childname) { unsigned int i; for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { if (streq(node->children+i, childname)) { errno = remove_child_entry(conn, node, i) ? 
EIO : 0; return errno; } } corrupt(conn, "Can't find child '%s' in %s", childname, node->name); errno = EIO; return errno; } static int delnode_sub(const void *ctx, struct connection *conn, struct node *node, void *arg) { const char *root = arg; bool watch_exact; int ret; TDB_DATA key; /* Any error here will probably be repeated for all following calls. */ ret = access_node(conn, node, NODE_ACCESS_DELETE, &key); if (ret > 0) return WALK_TREE_SUCCESS_STOP; /* In case of error stop the walk. */ if (!ret && do_tdb_delete(conn, &key, &node->acc)) return WALK_TREE_SUCCESS_STOP; /* * Fire the watches now, when we can still see the node permissions. * This fine as we are single threaded and the next possible read will * be handled only after the node has been really removed. */ watch_exact = strcmp(root, node->name); fire_watches(conn, ctx, node->name, node, watch_exact, NULL); domain_nbentry_dec(conn, get_node_owner(node)); return WALK_TREE_RM_CHILDENTRY; } int rm_node(struct connection *conn, const void *ctx, const char *name) { struct node *parent; char *parentname = get_parent(ctx, name); struct walk_funcs walkfuncs = { .exit = delnode_sub }; int ret; if (!parentname) return errno; parent = read_node(conn, ctx, parentname); if (!parent) return read_node_can_propagate_errno() ? errno : EINVAL; ret = walk_node_tree(ctx, conn, name, &walkfuncs, (void *)name); if (ret < 0) { if (ret == WALK_TREE_ERROR_STOP) { corrupt(conn, "error when deleting sub-nodes of %s\n", name); errno = EIO; } return errno; } if (delete_child(conn, parent, basename(name))) return errno; return 0; } static int do_rm(const void *ctx, struct connection *conn, struct buffered_data *in) { struct node *node; int ret; char *name; char *parentname; node = get_node_canonicalized(conn, ctx, onearg(in), &name, XS_PERM_WRITE); if (!node) { /* Didn't exist already? Fine, if parent exists. */ if (errno == ENOENT) { if (!name) return ENOMEM; parentname = get_parent(ctx, name); if (!parentname) return errno; node = read_node(conn, ctx, parentname); if (node) { send_ack(conn, XS_RM); return 0; } /* Restore errno, just in case. */ if (!read_node_can_propagate_errno()) errno = ENOENT; } return errno; } if (streq(name, "/")) return EINVAL; ret = rm_node(conn, ctx, name); if (ret) return ret; send_ack(conn, XS_RM); return 0; } static int do_get_perms(const void *ctx, struct connection *conn, struct buffered_data *in) { struct node *node; char *strings; unsigned int len; node = get_spec_node(conn, ctx, onearg(in), NULL, XS_PERM_READ); if (!node) return errno; strings = perms_to_strings(node, &node->perms, &len); if (!strings) return errno; send_reply(conn, XS_GET_PERMS, strings, len); return 0; } static int do_set_perms(const void *ctx, struct connection *conn, struct buffered_data *in) { struct node_perms perms, old_perms; char *name, *permstr; struct node *node; perms.num = xs_count_strings(in->buffer, in->used); if (perms.num < 2) return EINVAL; perms.num--; if (domain_is_unprivileged(conn) && perms.num > quota_nb_perms_per_node) return ENOSPC; permstr = in->buffer + strlen(in->buffer) + 1; perms.p = talloc_array(ctx, struct xs_permissions, perms.num); if (!perms.p) return ENOMEM; if (!xs_strings_to_perms(perms.p, perms.num, permstr)) return errno; if (domain_alloc_permrefs(&perms) < 0) return ENOMEM; if (perms.p[0].perms & XS_PERM_IGNORE) return ENOENT; /* We must own node to do this (tools can do this too). 
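	 * Requesting XS_PERM_WRITE together with XS_PERM_OWNER below keeps
	 * plain writers from changing permissions.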
*/ node = get_spec_node(conn, ctx, in->buffer, &name, XS_PERM_WRITE | XS_PERM_OWNER); if (!node) return errno; /* Unprivileged domains may not change the owner. */ if (domain_is_unprivileged(conn) && perms.p[0].id != get_node_owner(node)) return EPERM; old_perms = node->perms; domain_nbentry_dec(conn, get_node_owner(node)); node->perms = perms; if (domain_nbentry_inc(conn, get_node_owner(node))) { node->perms = old_perms; /* * This should never fail because we had a reference on the * domain before and Xenstored is single-threaded. */ domain_nbentry_inc(conn, get_node_owner(node)); return ENOMEM; } if (write_node(conn, node, false)) { int saved_errno = errno; domain_nbentry_dec(conn, get_node_owner(node)); node->perms = old_perms; /* No failure possible as above. */ domain_nbentry_inc(conn, get_node_owner(node)); errno = saved_errno; return errno; } fire_watches(conn, ctx, name, node, false, &old_perms); send_ack(conn, XS_SET_PERMS); return 0; } static char *child_name(const void *ctx, const char *s1, const char *s2) { if (strcmp(s1, "/")) return talloc_asprintf(ctx, "%s/%s", s1, s2); return talloc_asprintf(ctx, "/%s", s2); } static int rm_from_parent(struct connection *conn, struct node *parent, const char *name) { size_t off; if (!parent) return WALK_TREE_ERROR_STOP; for (off = parent->childoff - 1; off && parent->children[off - 1]; off--); if (remove_child_entry(conn, parent, off)) { log("treewalk: child entry could not be removed from '%s'", parent->name); return WALK_TREE_ERROR_STOP; } parent->childoff = off; return WALK_TREE_OK; } static int walk_call_func(const void *ctx, struct connection *conn, struct node *node, struct node *parent, void *arg, int (*func)(const void *ctx, struct connection *conn, struct node *node, void *arg)) { int ret; if (!func) return WALK_TREE_OK; ret = func(ctx, conn, node, arg); if (ret == WALK_TREE_RM_CHILDENTRY && parent) ret = rm_from_parent(conn, parent, node->name); return ret; } int walk_node_tree(const void *ctx, struct connection *conn, const char *root, struct walk_funcs *funcs, void *arg) { int ret = 0; void *tmpctx; char *name; struct node *node = NULL; struct node *parent = NULL; tmpctx = talloc_new(ctx); if (!tmpctx) { errno = ENOMEM; return WALK_TREE_ERROR_STOP; } name = talloc_strdup(tmpctx, root); if (!name) { errno = ENOMEM; talloc_free(tmpctx); return WALK_TREE_ERROR_STOP; } /* Continue the walk until an error is returned. */ while (ret >= 0) { /* node == NULL possible only for the initial loop iteration. */ if (node) { /* Go one step up if ret or if last child finished. */ if (ret || node->childoff >= node->childlen) { parent = node->parent; /* Call function AFTER processing a node. */ ret = walk_call_func(ctx, conn, node, parent, arg, funcs->exit); /* Last node, so exit loop. */ if (!parent) break; talloc_free(node); /* Continue with parent. */ node = parent; continue; } /* Get next child of current node. */ name = child_name(tmpctx, node->name, node->children + node->childoff); if (!name) { ret = WALK_TREE_ERROR_STOP; break; } /* Point to next child. */ node->childoff += strlen(node->children + node->childoff) + 1; /* Descent into children. */ parent = node; } /* Read next node (root node or next child). */ node = read_node(conn, tmpctx, name); if (!node) { /* Child not found - should not happen! */ /* ENOENT case can be handled by supplied function. 
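			 * check_store(), for instance, uses this hook to
			 * drop dangling child entries during recovery.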
*/ if (errno == ENOENT && funcs->enoent) ret = funcs->enoent(ctx, conn, parent, name, arg); else ret = WALK_TREE_ERROR_STOP; if (!parent) break; if (ret == WALK_TREE_RM_CHILDENTRY) ret = rm_from_parent(conn, parent, name); if (ret < 0) break; talloc_free(name); node = parent; continue; } talloc_free(name); node->parent = parent; node->childoff = 0; /* Call function BEFORE processing a node. */ ret = walk_call_func(ctx, conn, node, parent, arg, funcs->enter); } talloc_free(tmpctx); return ret < 0 ? ret : WALK_TREE_OK; } static struct { const char *str; int (*func)(const void *ctx, struct connection *conn, struct buffered_data *in); unsigned int flags; #define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */ #define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. */ } const wire_funcs[XS_TYPE_COUNT] = { [XS_CONTROL] = { "CONTROL", do_control, XS_FLAG_PRIV }, [XS_DIRECTORY] = { "DIRECTORY", send_directory }, [XS_READ] = { "READ", do_read }, [XS_GET_PERMS] = { "GET_PERMS", do_get_perms }, [XS_WATCH] = { "WATCH", do_watch, XS_FLAG_NOTID }, [XS_UNWATCH] = { "UNWATCH", do_unwatch, XS_FLAG_NOTID }, [XS_TRANSACTION_START] = { "TRANSACTION_START", do_transaction_start }, [XS_TRANSACTION_END] = { "TRANSACTION_END", do_transaction_end }, [XS_INTRODUCE] = { "INTRODUCE", do_introduce, XS_FLAG_PRIV }, [XS_RELEASE] = { "RELEASE", do_release, XS_FLAG_PRIV }, [XS_GET_DOMAIN_PATH] = { "GET_DOMAIN_PATH", do_get_domain_path }, [XS_WRITE] = { "WRITE", do_write }, [XS_MKDIR] = { "MKDIR", do_mkdir }, [XS_RM] = { "RM", do_rm }, [XS_SET_PERMS] = { "SET_PERMS", do_set_perms }, [XS_WATCH_EVENT] = { "WATCH_EVENT", NULL }, [XS_ERROR] = { "ERROR", NULL }, [XS_IS_DOMAIN_INTRODUCED] = { "IS_DOMAIN_INTRODUCED", do_is_domain_introduced, XS_FLAG_PRIV }, [XS_RESUME] = { "RESUME", do_resume, XS_FLAG_PRIV }, [XS_SET_TARGET] = { "SET_TARGET", do_set_target, XS_FLAG_PRIV }, [XS_RESET_WATCHES] = { "RESET_WATCHES", do_reset_watches }, [XS_DIRECTORY_PART] = { "DIRECTORY_PART", send_directory_part }, }; static const char *sockmsg_string(enum xsd_sockmsg_type type) { if ((unsigned int)type < ARRAY_SIZE(wire_funcs) && wire_funcs[type].str) return wire_funcs[type].str; return "**UNKNOWN**"; } /* Process "in" for conn: "in" will vanish after this conversation, so * we can talloc off it for temporary variables. May free "conn". */ static void process_message(struct connection *conn, struct buffered_data *in) { struct transaction *trans; enum xsd_sockmsg_type type = in->hdr.msg.type; int ret; void *ctx; /* At least send_error() and send_reply() expects conn->in == in */ assert(conn->in == in); trace_io(conn, in, 0); if ((unsigned int)type >= XS_TYPE_COUNT || !wire_funcs[type].func) { eprintf("Client unknown operation %i", type); send_error(conn, ENOSYS); return; } if ((wire_funcs[type].flags & XS_FLAG_PRIV) && domain_is_unprivileged(conn)) { send_error(conn, EACCES); return; } trans = (wire_funcs[type].flags & XS_FLAG_NOTID) ? 
NULL : transaction_lookup(conn, in->hdr.msg.tx_id); if (IS_ERR(trans)) { send_error(conn, -PTR_ERR(trans)); return; } ctx = talloc_new(NULL); if (!ctx) { send_error(conn, ENOMEM); return; } assert(conn->transaction == NULL); conn->transaction = trans; ret = wire_funcs[type].func(ctx, conn, in); talloc_free(ctx); if (ret) send_error(conn, ret); conn->transaction = NULL; } static bool process_delayed_message(struct delayed_request *req) { struct connection *conn = req->data; struct buffered_data *saved_in = conn->in; if (lu_is_pending()) return false; /* * Part of process_message() expects conn->in to contains the * processed response. So save the current conn->in and restore it * afterwards. */ conn->in = req->in; process_message(req->data, req->in); conn->in = saved_in; return true; } static void consider_message(struct connection *conn) { if (verbose) xprintf("Got message %s len %i from %p\n", sockmsg_string(conn->in->hdr.msg.type), conn->in->hdr.msg.len, conn); conn->is_stalled = false; /* * Currently, Live-Update is not supported if there is active * transactions. In order to reduce the number of retry, delay * any new request to start a transaction if Live-Update is pending * and there are no transactions in-flight. * * If we can't delay the request, then mark the connection as * stalled. This will ignore new requests until Live-Update happened * or it was aborted. */ if (lu_is_pending() && conn->transaction_started == 0 && conn->in->hdr.msg.type == XS_TRANSACTION_START) { trace("Delaying transaction start for connection %p req_id %u\n", conn, conn->in->hdr.msg.req_id); if (delay_request(conn, conn->in, process_delayed_message, conn, false) != 0) { trace("Stalling connection %p\n", conn); conn->is_stalled = true; } return; } process_message(conn, conn->in); assert(conn->in == NULL); } /* * Errors in reading or allocating here means we get out of sync, so we mark * the connection as ignored. */ static void handle_input(struct connection *conn) { int bytes; struct buffered_data *in; unsigned int err; if (!conn->in) { conn->in = new_buffer(conn); /* In case of no memory just try it again next time. */ if (!conn->in) return; } in = conn->in; in->pend.ref.domid = conn->id; /* Not finished header yet? */ if (in->inhdr) { if (in->used != sizeof(in->hdr)) { bytes = conn->funcs->read(conn, in->hdr.raw + in->used, sizeof(in->hdr) - in->used); if (bytes < 0) { err = XENSTORE_ERROR_RINGIDX; goto bad_client; } in->used += bytes; if (in->used != sizeof(in->hdr)) return; if (in->hdr.msg.len > XENSTORE_PAYLOAD_MAX) { syslog(LOG_ERR, "Client tried to feed us %i", in->hdr.msg.len); err = XENSTORE_ERROR_PROTO; goto bad_client; } } if (in->hdr.msg.len <= DEFAULT_BUFFER_SIZE) in->buffer = in->default_buffer; else in->buffer = talloc_array(in, char, in->hdr.msg.len); /* In case of no memory just try it again next time. 
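		 * Returning before clearing in->inhdr lets the next
		 * invocation retry the allocation.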
*/ if (!in->buffer) return; in->used = 0; in->inhdr = false; } bytes = conn->funcs->read(conn, in->buffer + in->used, in->hdr.msg.len - in->used); if (bytes < 0) { err = XENSTORE_ERROR_RINGIDX; goto bad_client; } in->used += bytes; if (in->used != in->hdr.msg.len) return; consider_message(conn); return; bad_client: ignore_connection(conn, err); } static void handle_output(struct connection *conn) { /* Ignore the connection if an error occured */ if (!write_messages(conn)) ignore_connection(conn, XENSTORE_ERROR_RINGIDX); } struct connection *new_connection(const struct interface_funcs *funcs) { struct connection *new; new = talloc_zero(talloc_autofree_context(), struct connection); if (!new) return NULL; new->fd = -1; new->pollfd_idx = -1; new->funcs = funcs; new->is_ignored = false; new->is_stalled = false; new->transaction_started = 0; INIT_LIST_HEAD(&new->out_list); INIT_LIST_HEAD(&new->ref_list); INIT_LIST_HEAD(&new->watches); INIT_LIST_HEAD(&new->transaction_list); INIT_LIST_HEAD(&new->delayed); list_add_tail(&new->list, &connections); talloc_set_destructor(new, destroy_conn); trace_create(new, "connection"); return new; } struct connection *get_connection_by_id(unsigned int conn_id) { struct connection *conn; list_for_each_entry(conn, &connections, list) if (conn->conn_id == conn_id) return conn; return NULL; } #ifdef NO_SOCKETS static void accept_connection(int sock) { } #else static int writefd(struct connection *conn, const void *data, unsigned int len) { int rc; while ((rc = write(conn->fd, data, len)) < 0) { if (errno == EAGAIN) { rc = 0; break; } if (errno != EINTR) break; } return rc; } static int readfd(struct connection *conn, void *data, unsigned int len) { int rc; while ((rc = read(conn->fd, data, len)) < 0) { if (errno == EAGAIN) { rc = 0; break; } if (errno != EINTR) break; } /* Reading zero length means we're done with this connection. */ if ((rc == 0) && (len != 0)) { errno = EBADF; rc = -1; } return rc; } static bool socket_can_process(struct connection *conn, int mask) { if (conn->pollfd_idx == -1) return false; if (fds[conn->pollfd_idx].revents & ~(POLLIN | POLLOUT)) { talloc_free(conn); return false; } return (fds[conn->pollfd_idx].revents & mask); } static bool socket_can_write(struct connection *conn) { return socket_can_process(conn, POLLOUT); } static bool socket_can_read(struct connection *conn) { return socket_can_process(conn, POLLIN); } const struct interface_funcs socket_funcs = { .write = writefd, .read = readfd, .can_write = socket_can_write, .can_read = socket_can_read, }; static void accept_connection(int sock) { int fd; struct connection *conn; fd = accept(sock, NULL, NULL); if (fd < 0) return; conn = new_connection(&socket_funcs); if (conn) { conn->fd = fd; conn->id = dom0_domid; } else close(fd); } #endif static int tdb_flags = TDB_INTERNAL | TDB_NOLOCK; /* We create initial nodes manually. */ static void manual_node(const char *name, const char *child) { struct node *node; struct xs_permissions perms = { .id = dom0_domid, .perms = XS_PERM_NONE }; node = talloc_zero(NULL, struct node); if (!node) barf_perror("Could not allocate initial node %s", name); node->name = name; node->perms.p = &perms; node->perms.num = 1; node->children = (char *)child; if (child) node->childlen = strlen(child) + 1; if (write_node(NULL, node, false)) barf_perror("Could not create initial node %s", name); talloc_free(node); } static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...) 
{ va_list ap; char *s; int saved_errno = errno; va_start(ap, fmt); s = talloc_vasprintf(NULL, fmt, ap); va_end(ap); if (s) { trace("TDB: %s\n", s); syslog(LOG_ERR, "TDB: %s", s); if (verbose) xprintf("TDB: %s", s); talloc_free(s); } else { trace("talloc failure during logging\n"); syslog(LOG_ERR, "talloc failure during logging\n"); } errno = saved_errno; } void setup_structure(bool live_update) { char *tdbname; tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb()); if (!tdbname) barf_perror("Could not create tdbname"); if (!(tdb_flags & TDB_INTERNAL)) unlink(tdbname); tdb_ctx = tdb_open_ex(tdbname, 7919, tdb_flags, O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC, 0640, &tdb_logger, NULL); if (!tdb_ctx) barf_perror("Could not create tdb file %s", tdbname); if (live_update) manual_node("/", NULL); else { manual_node("/", "tool"); manual_node("/tool", "xenstored"); manual_node("/tool/xenstored", NULL); manual_node("@releaseDomain", NULL); manual_node("@introduceDomain", NULL); domain_nbentry_fix(dom0_domid, 5, true); } } static unsigned int hash_from_key_fn(const void *k) { const char *str = k; unsigned int hash = 5381; char c; while ((c = *str++)) hash = ((hash << 5) + hash) + (unsigned int)c; return hash; } static int keys_equal_fn(const void *key1, const void *key2) { return 0 == strcmp(key1, key2); } int remember_string(struct hashtable *hash, const char *str) { char *k = talloc_strdup(NULL, str); if (!k) return 0; return hashtable_insert(hash, k, (void *)1); } /** * A node has a children field that names the children of the node, separated * by NULs. We check whether there are entries in there that are duplicated * (and if so, delete the second one), and whether there are any that do not * have a corresponding child node (and if so, delete them). Each valid child * is then recursively checked. * * No deleting is performed if the recovery flag is cleared (i.e. -R was * passed on the command line). * * As we go, we record each node in the given reachable hashtable. These * entries will be used later in clean_store. */ struct check_store_data { struct hashtable *reachable; struct hashtable *domains; }; static int check_store_step(const void *ctx, struct connection *conn, struct node *node, void *arg) { struct check_store_data *data = arg; if (hashtable_search(data->reachable, (void *)node->name)) { log("check_store: '%s' is duplicated!", node->name); return recovery ? WALK_TREE_RM_CHILDENTRY : WALK_TREE_SKIP_CHILDREN; } if (!remember_string(data->reachable, node->name)) return WALK_TREE_ERROR_STOP; domain_check_acc_add(node, data->domains); return WALK_TREE_OK; } static int check_store_enoent(const void *ctx, struct connection *conn, struct node *parent, char *name, void *arg) { log("check_store: node '%s' not found", name); return recovery ? WALK_TREE_RM_CHILDENTRY : WALK_TREE_OK; } /** * Helper to clean_store below. */ static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val, void *private) { struct hashtable *reachable = private; char *slash; char * name = talloc_strndup(NULL, key.dptr, key.dsize); if (!name) { log("clean_store: ENOMEM"); return 1; } if (name[0] != '/') { slash = strchr(name, '/'); if (slash) *slash = 0; } if (!hashtable_search(reachable, name)) { log("clean_store: '%s' is orphaned!", name); if (recovery) { do_tdb_delete(NULL, &key, NULL); } } talloc_free(name); return 0; } /** * Given the list of reachable nodes, iterate over the whole store, and * remove any that were not reached. 
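 * Per-domain accounting collected during the walk is verified at the same
 * time.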
*/ static void clean_store(struct check_store_data *data) { tdb_traverse(tdb_ctx, &clean_store_, data->reachable); domain_check_acc(data->domains); } int check_store_path(const char *name, struct check_store_data *data) { struct node *node; node = read_node(NULL, NULL, name); if (!node) { log("check_store: error %d reading special node '%s'", errno, name); return errno; } return check_store_step(NULL, NULL, node, data); } void check_store(void) { struct walk_funcs walkfuncs = { .enter = check_store_step, .enoent = check_store_enoent, }; struct check_store_data data; /* Don't free values (they are all void *1) */ data.reachable = create_hashtable(NULL, 16, hash_from_key_fn, keys_equal_fn, HASHTABLE_FREE_KEY); if (!data.reachable) { log("check_store: ENOMEM"); return; } data.domains = domain_check_acc_init(); if (!data.domains) { log("check_store: ENOMEM"); goto out_hash; } log("Checking store ..."); if (walk_node_tree(NULL, NULL, "/", &walkfuncs, &data)) { if (errno == ENOMEM) log("check_store: ENOMEM"); } else if (!check_store_path("@introduceDomain", &data) && !check_store_path("@releaseDomain", &data) && !check_transactions(data.reachable)) clean_store(&data); log("Checking store complete."); hashtable_destroy(data.domains); out_hash: hashtable_destroy(data.reachable); } /* Something is horribly wrong: check the store. */ void corrupt(struct connection *conn, const char *fmt, ...) { va_list arglist; char *str; int saved_errno = errno; va_start(arglist, fmt); str = talloc_vasprintf(NULL, fmt, arglist); va_end(arglist); log("corruption detected by connection %i: err %s: %s", conn ? (int)conn->id : -1, strerror(saved_errno), str ?: "ENOMEM"); talloc_free(str); check_store(); errno = saved_errno; } #ifndef NO_SOCKETS static void destroy_fds(void) { if (sock >= 0) close(sock); } static void init_sockets(void) { struct sockaddr_un addr; const char *soc_str = xs_daemon_socket(); if (!soc_str) barf_perror("Failed to obtain xs domain socket"); /* Create sockets for them to listen to. */ atexit(destroy_fds); sock = socket(PF_UNIX, SOCK_STREAM, 0); if (sock < 0) barf_perror("Could not create socket"); /* FIXME: Be more sophisticated, don't mug running daemon. 
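	 * The unlink() below removes the socket left behind by any previous
	 * instance.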
static void init_sockets(void)
{
	struct sockaddr_un addr;
	const char *soc_str = xs_daemon_socket();

	if (!soc_str)
		barf_perror("Failed to obtain xs domain socket");

	/* Create sockets for them to listen to. */
	atexit(destroy_fds);
	sock = socket(PF_UNIX, SOCK_STREAM, 0);
	if (sock < 0)
		barf_perror("Could not create socket");

	/* FIXME: Be more sophisticated, don't mug running daemon. */
	unlink(soc_str);

	addr.sun_family = AF_UNIX;

	if (strlen(soc_str) >= sizeof(addr.sun_path))
		barf_perror("socket string '%s' too long", soc_str);
	strcpy(addr.sun_path, soc_str);
	if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
		barf_perror("Could not bind socket to %s", soc_str);

	if (chmod(soc_str, 0600) != 0)
		barf_perror("Could not chmod sockets");

	if (listen(sock, 1) != 0)
		barf_perror("Could not listen on sockets");
}
#endif

static void usage(void)
{
	fprintf(stderr,
"Usage:\n"
"\n"
"  xenstored <options>\n"
"\n"
"where options may include:\n"
"\n"
"  -D, --no-domain-init    to state that xenstored should not initialise dom0,\n"
"  -F, --pid-file <file>   giving a file for the daemon's pid to be written,\n"
"  -H, --help              to output this message,\n"
"  -N, --no-fork           to request that the daemon does not fork,\n"
"  -P, --output-pid        to request that the pid of the daemon is output,\n"
"  -T, --trace-file <file> giving the file for logging, and\n"
"      --trace-control=+<switch> activate a specific <switch>\n"
"      --trace-control=-<switch> deactivate a specific <switch>\n"
"  -E, --entry-nb <nb>     limit the number of entries per domain,\n"
"  -S, --entry-size <size> limit the size of an entry per domain, and\n"
"  -W, --watch-nb <nb>     limit the number of watches per domain,\n"
"  -t, --transaction <nb>  limit the number of transactions allowed per domain,\n"
"  -A, --perm-nb <nb>      limit the number of permissions per node,\n"
"  -M, --path-max <chars>  limit the allowed Xenstore node path length,\n"
"  -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n"
"                          quotas are:\n"
"                          transaction-nodes: number of accessed node per\n"
"                                             transaction\n"
"                          memory: total used memory per domain for nodes,\n"
"                                  transactions, watches and requests, above\n"
"                                  which Xenstore will stop talking to domain\n"
"                          outstanding: number of outstanding requests\n"
"  -q, --quota-soft <what>=<nb> set a soft quota <what> to the value <nb>,\n"
"                          causing a warning to be issued via syslog() if the\n"
"                          limit is violated, allowed quotas are:\n"
"                          memory: see above\n"
"  -w, --timeout <what>=<seconds>  set the timeout in seconds for <what>,\n"
"                          allowed timeout candidates are:\n"
"                          watch-event: time a watch-event is kept pending\n"
"  -R, --no-recovery       to request that no recovery should be attempted when\n"
"                          the store is corrupted (debug only),\n"
"  -I, --internal-db [on|off] store database in memory, not on disk, default is\n"
"                          memory, with \"--internal-db off\" it is on disk\n"
"  -K, --keep-orphans      don't delete nodes owned by a domain when the\n"
"                          domain is deleted (this is a security risk!)\n"
"  -V, --verbose           to request verbose execution.\n");
}

static struct option options[] = {
	{ "no-domain-init", 0, NULL, 'D' },
	{ "entry-nb", 1, NULL, 'E' },
	{ "pid-file", 1, NULL, 'F' },
	{ "event", 1, NULL, 'e' },
	{ "master-domid", 1, NULL, 'm' },
	{ "help", 0, NULL, 'H' },
	{ "no-fork", 0, NULL, 'N' },
	{ "priv-domid", 1, NULL, 'p' },
	{ "output-pid", 0, NULL, 'P' },
	{ "entry-size", 1, NULL, 'S' },
	{ "trace-file", 1, NULL, 'T' },
	{ "trace-control", 1, NULL, 1 },
	{ "transaction", 1, NULL, 't' },
	{ "perm-nb", 1, NULL, 'A' },
	{ "path-max", 1, NULL, 'M' },
	{ "quota", 1, NULL, 'Q' },
	{ "quota-soft", 1, NULL, 'q' },
	{ "timeout", 1, NULL, 'w' },
	{ "no-recovery", 0, NULL, 'R' },
	{ "internal-db", 2, NULL, 'I' },
	{ "keep-orphans", 0, NULL, 'K' },
	{ "verbose", 0, NULL, 'V' },
	{ "watch-nb", 1, NULL, 'W' },
#ifndef NO_LIVE_UPDATE
	{ "live-update", 0, NULL, 'U' },
#endif
	{ NULL, 0, NULL, 0 }
};

int dom0_domid = 0;
int dom0_event = 0;
int priv_domid = 0;
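
/* Parse a numeric option argument; only values 0 .. INT_MAX are accepted. */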
static int get_optval_int(const char *arg)
{
	char *end;
	long val;

	val = strtol(arg, &end, 10);
	if (!*arg || *end || val < 0 || val > INT_MAX)
		barf("invalid parameter value \"%s\"\n", arg);

	return val;
}

static bool what_matches(const char *arg, const char *what)
{
	unsigned int what_len = strlen(what);

	return !strncmp(arg, what, what_len) && arg[what_len] == '=';
}

static void set_timeout(const char *arg)
{
	const char *eq = strchr(arg, '=');
	int val;

	if (!eq)
		barf("timeouts must be specified via <what>=<seconds>\n");
	val = get_optval_int(eq + 1);
	if (what_matches(arg, "watch-event"))
		timeout_watch_event_msec = val * 1000;
	else
		barf("unknown timeout \"%s\"\n", arg);
}

static void set_quota(const char *arg, bool soft)
{
	const char *eq = strchr(arg, '=');
	int val;

	if (!eq)
		barf("quotas must be specified via <what>=<nb>\n");
	val = get_optval_int(eq + 1);
	if (what_matches(arg, "outstanding") && !soft)
		quota_req_outstanding = val;
	else if (what_matches(arg, "transaction-nodes") && !soft)
		quota_trans_nodes = val;
	else if (what_matches(arg, "memory")) {
		if (soft)
			quota_memory_per_domain_soft = val;
		else
			quota_memory_per_domain_hard = val;
	} else
		barf("unknown quota \"%s\"\n", arg);
}

/* Sorted by bit values of TRACE_* flags. Flag is (1u << index). */
const char *const trace_switches[] = {
	"obj", "io", "wrl",
	NULL
};

int set_trace_switch(const char *arg)
{
	bool remove = (arg[0] == '-');
	unsigned int idx;

	switch (arg[0]) {
	case '-':
		remove = true;
		break;
	case '+':
		remove = false;
		break;
	default:
		return EINVAL;
	}

	arg++;

	for (idx = 0; trace_switches[idx]; idx++) {
		if (!strcmp(arg, trace_switches[idx])) {
			if (remove)
				trace_flags &= ~(1u << idx);
			else
				trace_flags |= 1u << idx;
			return 0;
		}
	}

	return EINVAL;
}
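
/*
 * Entry point: parse the command line, daemonize (unless -N), set up the
 * socket, log pipe and domain communication, verify the store and then run
 * the poll() based main loop dispatching I/O for all connections.
 */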
int main(int argc, char *argv[])
{
	int opt;
	int sock_pollfd_idx = -1;
	bool dofork = true;
	bool outputpid = false;
	bool no_domain_init = false;
	bool live_update = false;
	const char *pidfile = NULL;
	int timeout;

	orig_argc = argc;
	orig_argv = argv;

	while ((opt = getopt_long(argc, argv,
				  "DE:F:HI::KNPS:t:A:M:Q:q:T:RVW:w:U",
				  options, NULL)) != -1) {
		switch (opt) {
		case 'D':
			no_domain_init = true;
			break;
		case 'E':
			quota_nb_entry_per_domain = strtol(optarg, NULL, 10);
			break;
		case 'F':
			pidfile = optarg;
			break;
		case 'H':
			usage();
			return 0;
		case 'N':
			dofork = false;
			break;
		case 'P':
			outputpid = true;
			break;
		case 'R':
			recovery = false;
			break;
		case 'S':
			quota_max_entry_size = strtol(optarg, NULL, 10);
			break;
		case 't':
			quota_max_transaction = strtol(optarg, NULL, 10);
			break;
		case 'T':
			tracefile = optarg;
			break;
		case 1:
			if (set_trace_switch(optarg))
				barf("Illegal trace switch \"%s\"\n", optarg);
			break;
		case 'I':
			if (optarg && !strcmp(optarg, "off"))
				tdb_flags = 0;
			break;
		case 'K':
			keep_orphans = true;
			break;
		case 'V':
			verbose = true;
			break;
		case 'W':
			quota_nb_watch_per_domain = strtol(optarg, NULL, 10);
			break;
		case 'A':
			quota_nb_perms_per_node = strtol(optarg, NULL, 10);
			break;
		case 'M':
			quota_max_path_len = strtol(optarg, NULL, 10);
			quota_max_path_len = min(XENSTORE_REL_PATH_MAX,
						 quota_max_path_len);
			break;
		case 'Q':
			set_quota(optarg, false);
			break;
		case 'q':
			set_quota(optarg, true);
			break;
		case 'w':
			set_timeout(optarg);
			break;
		case 'e':
			dom0_event = strtol(optarg, NULL, 10);
			break;
		case 'm':
			dom0_domid = strtol(optarg, NULL, 10);
			break;
		case 'p':
			priv_domid = strtol(optarg, NULL, 10);
			break;
#ifndef NO_LIVE_UPDATE
		case 'U':
			live_update = true;
			break;
#endif
		}
	}
	if (optind != argc)
		barf("%s: No arguments desired", argv[0]);

	reopen_log();

	/* make sure xenstored directories exist */
	/* Errors ignored here, will be reported when we open files */
	mkdir(xs_daemon_rundir(), 0755);
	mkdir(xs_daemon_rootdir(), 0755);

	if (dofork) {
		openlog("xenstored", 0, LOG_DAEMON);
		if (!live_update)
			daemonize();
	}
	if (pidfile)
		write_pidfile(pidfile);

	/* Talloc leak reports go to stderr, which is closed if we fork. */
	if (!dofork)
		talloc_enable_leak_report_full();

	/* Don't kill us with SIGPIPE. */
	signal(SIGPIPE, SIG_IGN);

	talloc_enable_null_tracking();

#ifndef NO_SOCKETS
	if (!live_update)
		init_sockets();
#endif

	init_pipe(reopen_log_pipe);

	/* Listen to hypervisor. */
	if (!no_domain_init && !live_update) {
		domain_init(-1);
		dom0_init();
	}

	if (outputpid) {
		printf("%ld\n", (long)getpid());
		fflush(stdout);
	}

	/* redirect to /dev/null now we're ready to accept connections */
	if (dofork && !live_update)
		finish_daemonize();
#ifndef __MINIOS__
	if (dofork)
		xprintf = trace;
#endif

	signal(SIGHUP, trigger_reopen_log);
	if (tracefile)
		tracefile = talloc_strdup(NULL, tracefile);

#ifndef NO_LIVE_UPDATE
	/* Read state in case of live update. */
	if (live_update)
		lu_read_state();
#endif

	check_store();

	/* Get ready to listen to the tools. */
	initialize_fds(&sock_pollfd_idx, &timeout);

#if defined(XEN_SYSTEMD_ENABLED)
	if (!live_update) {
		sd_notify(1, "READY=1");
		fprintf(stderr, SD_NOTICE "xenstored is ready\n");
	}
#endif

	/* Main loop. */
	for (;;) {
		struct connection *conn, *next;

		if (poll(fds, nr_fds, timeout) < 0) {
			if (errno == EINTR)
				continue;
			barf_perror("Poll failed");
		}

		if (reopen_log_pipe0_pollfd_idx != -1) {
			if (fds[reopen_log_pipe0_pollfd_idx].revents & ~POLLIN) {
				close(reopen_log_pipe[0]);
				close(reopen_log_pipe[1]);
				init_pipe(reopen_log_pipe);
			} else if (fds[reopen_log_pipe0_pollfd_idx].revents & POLLIN) {
				char c;

				if (read(reopen_log_pipe[0], &c, 1) != 1)
					barf_perror("read failed");
				reopen_log();
			}
			reopen_log_pipe0_pollfd_idx = -1;
		}

		if (sock_pollfd_idx != -1) {
			if (fds[sock_pollfd_idx].revents & ~POLLIN) {
				barf_perror("sock poll failed");
				break;
			} else if (fds[sock_pollfd_idx].revents & POLLIN) {
				accept_connection(sock);
				sock_pollfd_idx = -1;
			}
		}

		if (xce_pollfd_idx != -1) {
			if (fds[xce_pollfd_idx].revents & ~POLLIN) {
				barf_perror("xce_handle poll failed");
				break;
			} else if (fds[xce_pollfd_idx].revents & POLLIN) {
				handle_event();
				xce_pollfd_idx = -1;
			}
		}

		/*
		 * list_for_each_entry_safe is not suitable here because
		 * handle_input may delete entries besides the current one, but
		 * those may be in the temporary next which would trigger a
		 * use-after-free. list_for_each_entry_safe is only safe for
		 * deleting the current entry.
		 */
		next = list_entry(connections.next, typeof(*conn), list);
		if (&next->list != &connections)
			talloc_increase_ref_count(next);
		while (&next->list != &connections) {
			conn = next;

			next = list_entry(conn->list.next,
					  typeof(*conn), list);
			if (&next->list != &connections)
				talloc_increase_ref_count(next);

			if (conn_can_read(conn))
				handle_input(conn);
			if (talloc_free(conn) == 0)
				continue;

			talloc_increase_ref_count(conn);

			if (conn_can_write(conn))
				handle_output(conn);
			if (talloc_free(conn) == 0)
				continue;

			conn->pollfd_idx = -1;
		}

		if (delayed_requests) {
			list_for_each_entry(conn, &connections, list) {
				struct delayed_request *req, *tmp;

				list_for_each_entry_safe(req, tmp,
							 &conn->delayed, list)
					call_delayed(req);
			}
		}

		initialize_fds(&sock_pollfd_idx, &timeout);
	}
}
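
/*
 * Live update: dump the global state record, i.e. the listening socket and
 * the event channel file descriptor, so they can be taken over by the new
 * instance (see read_state_global() below).
 */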
const char *dump_state_global(FILE *fp)
{
	struct xs_state_record_header head;
	struct xs_state_global glb;

	head.type = XS_STATE_TYPE_GLOBAL;
	head.length = sizeof(glb);
	if (fwrite(&head, sizeof(head), 1, fp) != 1)
		return "Dump global state error";
	glb.socket_fd = sock;
	glb.evtchn_fd = xenevtchn_fd(xce_handle);
	if (fwrite(&glb, sizeof(glb), 1, fp) != 1)
		return "Dump global state error";

	return NULL;
}

static const char *dump_input_buffered_data(FILE *fp,
					    const struct buffered_data *in,
					    unsigned int *total_len)
{
	unsigned int hlen = in->inhdr ? in->used : sizeof(in->hdr);

	*total_len += hlen;
	if (fp && fwrite(&in->hdr, hlen, 1, fp) != 1)
		return "Dump read data error";
	if (!in->inhdr && in->used) {
		*total_len += in->used;
		if (fp && fwrite(in->buffer, in->used, 1, fp) != 1)
			return "Dump read data error";
	}

	return NULL;
}

/* Called twice: first with fp == NULL to get length, then for writing data. */
const char *dump_state_buffered_data(FILE *fp, const struct connection *c,
				     struct xs_state_connection *sc)
{
	unsigned int len = 0, used;
	struct buffered_data *out;
	bool partial = true;
	struct delayed_request *req;
	const char *ret;

	/* Dump any command that was delayed */
	list_for_each_entry(req, &c->delayed, list) {
		/*
		 * We only want to preserve commands that weren't processed at
		 * all. All the other delayed requests (such as do_lu_start())
		 * must be processed before Live-Update.
		 */
		if (req->func != process_delayed_message)
			continue;

		assert(!req->in->inhdr);
		if ((ret = dump_input_buffered_data(fp, req->in, &len)))
			return ret;
	}

	if (c->in && (ret = dump_input_buffered_data(fp, c->in, &len)))
		return ret;

	if (sc) {
		sc->data_in_len = len;
		sc->data_resp_len = 0;
	}

	len = 0;

	list_for_each_entry(out, &c->out_list, list) {
		used = out->used;
		if (out->inhdr) {
			if (!used)
				partial = false;
			if (fp && fwrite(out->hdr.raw + out->used,
					 sizeof(out->hdr) - out->used,
					 1, fp) != 1)
				return "Dump buffered data error";
			len += sizeof(out->hdr) - out->used;
			used = 0;
		}
		if (fp && out->hdr.msg.len &&
		    fwrite(out->buffer + used, out->hdr.msg.len - used,
			   1, fp) != 1)
			return "Dump buffered data error";
		len += out->hdr.msg.len - used;
		if (partial && sc)
			sc->data_resp_len = len;
		partial = false;
	}

	/* Add "OK" for live-update command. */
	if (c == lu_get_connection()) {
		unsigned int rc = lu_write_response(fp);

		if (!rc)
			return "Dump buffered data error";

		len += rc;
	}

	if (sc)
		sc->data_out_len = len;

	return NULL;
}

const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms,
				  unsigned int n_perms)
{
	unsigned int p;

	for (p = 0; p < n_perms; p++) {
		struct xs_state_node_perm sp;

		switch ((int)perms[p].perms & ~XS_PERM_IGNORE) {
		case XS_PERM_READ:
			sp.access = XS_STATE_NODE_PERM_READ;
			break;
		case XS_PERM_WRITE:
			sp.access = XS_STATE_NODE_PERM_WRITE;
			break;
		case XS_PERM_READ | XS_PERM_WRITE:
			sp.access = XS_STATE_NODE_PERM_BOTH;
			break;
		default:
			sp.access = XS_STATE_NODE_PERM_NONE;
			break;
		}
		sp.flags = (perms[p].perms & XS_PERM_IGNORE)
			   ? XS_STATE_NODE_PERM_IGNORE : 0;
		sp.domid = perms[p].id;

		if (fwrite(&sp, sizeof(sp), 1, fp) != 1)
			return "Dump node permission error";
	}

	return NULL;
}

struct dump_node_data {
	FILE *fp;
	const char *err;
};

static int dump_state_node_err(struct dump_node_data *data, const char *err)
{
	data->err = err;
	return WALK_TREE_ERROR_STOP;
}

static int dump_state_node(const void *ctx, struct connection *conn,
			   struct node *node, void *arg)
{
	struct dump_node_data *data = arg;
	FILE *fp = data->fp;
	unsigned int pathlen;
	struct xs_state_record_header head;
	struct xs_state_node sn;
	const char *ret;

	pathlen = strlen(node->name) + 1;

	head.type = XS_STATE_TYPE_NODE;
	head.length = sizeof(sn);

	sn.conn_id = 0;
	sn.ta_id = 0;
	sn.ta_access = 0;
	sn.perm_n = node->perms.num;
	sn.path_len = pathlen;
	sn.data_len = node->datalen;
	head.length += node->perms.num * sizeof(*sn.perms);
	head.length += pathlen;
	head.length += node->datalen;
	head.length = ROUNDUP(head.length, 3);

	if (fwrite(&head, sizeof(head), 1, fp) != 1)
		return dump_state_node_err(data, "Dump node head error");
	if (fwrite(&sn, sizeof(sn), 1, fp) != 1)
		return dump_state_node_err(data, "Dump node state error");
	ret = dump_state_node_perms(fp, node->perms.p, node->perms.num);
	if (ret)
		return dump_state_node_err(data, ret);
	if (fwrite(node->name, pathlen, 1, fp) != 1)
		return dump_state_node_err(data, "Dump node path error");
	if (node->datalen &&
	    fwrite(node->data, node->datalen, 1, fp) != 1)
		return dump_state_node_err(data, "Dump node data error");
	ret = dump_state_align(fp);
	if (ret)
		return dump_state_node_err(data, ret);

	return WALK_TREE_OK;
}

static int dump_state_special_node(FILE *fp, const void *ctx,
				   struct dump_node_data *data,
				   const char *name)
{
	struct node *node;
	int ret;

	node = read_node(NULL, ctx, name);
	if (!node)
		return dump_state_node_err(data, "Dump node read node error");

	ret = dump_state_node(ctx, NULL, node, data);
	talloc_free(node);

	return ret;
}

const char *dump_state_nodes(FILE *fp, const void *ctx)
{
	struct dump_node_data data = {
		.fp = fp,
		.err = "Dump node walk error"
	};
	struct walk_funcs walkfuncs = { .enter = dump_state_node };

	if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data))
		return data.err;

	if (dump_state_special_node(fp, ctx, &data, "@releaseDomain"))
		return data.err;
	if (dump_state_special_node(fp, ctx, &data, "@introduceDomain"))
		return data.err;

	return NULL;
}

void read_state_global(const void *ctx, const void *state)
{
	const struct xs_state_global *glb = state;

	sock = glb->socket_fd;

	domain_init(glb->evtchn_fd);
}
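
/*
 * Queue response data restored from the live-update state on the
 * connection's out_list, re-applying the watch event timeout and the
 * (unchecked) per-domain memory accounting.
 */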
static void add_buffered_data(struct buffered_data *bdata,
			      struct connection *conn, const uint8_t *data,
			      unsigned int len)
{
	bdata->hdr.msg.len = len;
	if (len <= DEFAULT_BUFFER_SIZE)
		bdata->buffer = bdata->default_buffer;
	else
		bdata->buffer = talloc_array(bdata, char, len);
	if (!bdata->buffer)
		barf("error restoring buffered data");

	memcpy(bdata->buffer, data, len);
	if (bdata->hdr.msg.type == XS_WATCH_EVENT && timeout_watch_event_msec &&
	    domain_is_unprivileged(conn)) {
		bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec;
		if (!conn->timeout_msec)
			conn->timeout_msec = bdata->timeout_msec;
	}

	/* Queue for later transmission. */
	list_add_tail(&bdata->list, &conn->out_list);
	bdata->on_out_list = true;
	/*
	 * Watch events are never "outstanding", but the request causing them
	 * are instead kept "outstanding" until all watch events caused by that
	 * request have been delivered.
	 */
	if (bdata->hdr.msg.type != XS_WATCH_EVENT)
		domain_outstanding_inc(conn);
	/*
	 * We are restoring the state after Live-Update and the new quota may
	 * be smaller. So ignore it. The limit will be applied for any resource
	 * after the state has been fully restored.
	 */
	domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr));
}

void read_state_buffered_data(const void *ctx, struct connection *conn,
			      const struct xs_state_connection *sc)
{
	struct buffered_data *bdata;
	const uint8_t *data;
	unsigned int len;
	bool partial = sc->data_resp_len;

	for (data = sc->data; data < sc->data + sc->data_in_len; data += len) {
		bdata = new_buffer(conn);
		if (!bdata)
			barf("error restoring read data");

		/*
		 * We don't know yet if there is more than one message
		 * to process. So the len is the size of the leftover data.
		 */
		len = sc->data_in_len - (data - sc->data);
		if (len < sizeof(bdata->hdr)) {
			bdata->inhdr = true;
			memcpy(&bdata->hdr, data, len);
			bdata->used = len;
		} else {
			bdata->inhdr = false;
			memcpy(&bdata->hdr, data, sizeof(bdata->hdr));
			if (bdata->hdr.msg.len <= DEFAULT_BUFFER_SIZE)
				bdata->buffer = bdata->default_buffer;
			else
				bdata->buffer = talloc_array(bdata, char,
							     bdata->hdr.msg.len);
			if (!bdata->buffer)
				barf("Error allocating in buffer");
			bdata->used = min_t(unsigned int,
					    len - sizeof(bdata->hdr),
					    bdata->hdr.msg.len);
			memcpy(bdata->buffer, data + sizeof(bdata->hdr),
			       bdata->used);
			/* Update len to match the size of the message. */
			len = bdata->used + sizeof(bdata->hdr);
		}

		/*
		 * If the message is not complete, then it means this was
		 * the current processed message. All the other messages
		 * will be queued to be handled after restoring.
		 */
		if (bdata->inhdr || bdata->used != bdata->hdr.msg.len) {
			assert(conn->in == NULL);
			conn->in = bdata;
		} else if (delay_request(conn, bdata, process_delayed_message,
					 conn, true))
			barf("Unable to delay the request");
	}

	for (data = sc->data + sc->data_in_len;
	     data < sc->data + sc->data_in_len + sc->data_out_len;
	     data += len) {
		bdata = new_buffer(conn);
		if (!bdata)
			barf("error restoring buffered data");

		if (partial) {
			bdata->inhdr = false;
			/* Make trace look nice. */
			bdata->hdr.msg.type = XS_INVALID;
			len = sc->data_resp_len;
			add_buffered_data(bdata, conn, data, len);

			partial = false;
			continue;
		}

		memcpy(&bdata->hdr, data, sizeof(bdata->hdr));
		data += sizeof(bdata->hdr);
		len = bdata->hdr.msg.len;
		add_buffered_data(bdata, conn, data, len);
	}
}
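
/*
 * Restore a single node from the live-update state: rebuild its permission
 * array, link it into its parent's children (unless it is a special "@" node)
 * and write it to the database, accounting it to its owning domain.
 */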
void read_state_node(const void *ctx, const void *state)
{
	const struct xs_state_node *sn = state;
	struct node *node, *parent;
	TDB_DATA key;
	char *name, *parentname;
	unsigned int i;
	struct connection conn = { .id = priv_domid };

	name = (char *)(sn->perms + sn->perm_n);
	node = talloc(ctx, struct node);
	if (!node)
		barf("allocation error restoring node");

	node->acc.memory = 0;
	node->name = name;
	node->generation = ++generation;
	node->datalen = sn->data_len;
	node->data = name + sn->path_len;
	node->childlen = 0;
	node->children = NULL;
	node->perms.num = sn->perm_n;
	node->perms.p = talloc_array(node, struct xs_permissions,
				     node->perms.num);
	if (!node->perms.p)
		barf("allocation error restoring node");
	for (i = 0; i < node->perms.num; i++) {
		switch (sn->perms[i].access) {
		case 'r':
			node->perms.p[i].perms = XS_PERM_READ;
			break;
		case 'w':
			node->perms.p[i].perms = XS_PERM_WRITE;
			break;
		case 'b':
			node->perms.p[i].perms = XS_PERM_READ | XS_PERM_WRITE;
			break;
		default:
			node->perms.p[i].perms = XS_PERM_NONE;
			break;
		}
		if (sn->perms[i].flags & XS_STATE_NODE_PERM_IGNORE)
			node->perms.p[i].perms |= XS_PERM_IGNORE;
		node->perms.p[i].id = sn->perms[i].domid;
	}

	if (!strstarts(name, "@")) {
		parentname = get_parent(node, name);
		if (!parentname)
			barf("allocation error restoring node");
		parent = read_node(NULL, node, parentname);
		if (!parent)
			barf("read parent error restoring node");

		if (add_child(node, parent, name))
			barf("allocation error restoring node");

		set_tdb_key(parentname, &key);
		if (write_node_raw(NULL, &key, parent, true))
			barf("write parent error restoring node");
	}

	set_tdb_key(name, &key);
	if (write_node_raw(NULL, &key, node, true))
		barf("write node error restoring node");

	if (domain_nbentry_inc(&conn, get_node_owner(node)))
		barf("node accounting error restoring node");

	talloc_free(node);
}

/*
 * Local variables:
 *  mode: C
 *  c-file-style: "linux"
 *  indent-tabs-mode: t
 *  c-basic-offset: 8
 *  tab-width: 8
 * End:
 */