/* Domain communications for Xen Store Daemon. Copyright (C) 2005 Rusty Russell IBM Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; If not, see . */ #include #include #include #include #include #include #include #include #include "utils.h" #include "talloc.h" #include "xenstored_core.h" #include "xenstored_domain.h" #include "xenstored_transaction.h" #include "xenstored_watch.h" #include "xenstored_control.h" #include #include #include static xc_interface **xc_handle; xengnttab_handle **xgt_handle; static evtchn_port_t virq_port; xenevtchn_handle *xce_handle = NULL; struct domain { /* The id of this domain */ unsigned int domid; /* Event channel port */ evtchn_port_t port; /* Domain path in store. */ char *path; /* Shared page. */ struct xenstore_domain_interface *interface; /* The connection associated with this. */ struct connection *conn; /* Generation count at domain introduction time. */ uint64_t generation; /* Have we noticed that this domain is shutdown? */ bool shutdown; /* Has domain been officially introduced? */ bool introduced; /* number of entry from this domain in the store */ int nbentry; /* Amount of memory allocated for this domain. */ int memory; bool soft_quota_reported; bool hard_quota_reported; time_t mem_last_msg; #define MEM_WARN_MINTIME_SEC 10 /* number of watch for this domain */ int nbwatch; /* Number of outstanding requests. */ int nboutstanding; /* write rate limit */ wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */ struct wrl_timestampt wrl_timestamp; bool wrl_delay_logged; }; struct changed_domain { /* List of all changed domains. */ struct list_head list; /* Identifier of the changed domain. */ unsigned int domid; /* Amount by which this domain's nbentry field has changed. */ int nbentry; }; static struct hashtable *domhash; static bool check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) { return ((prod - cons) <= XENSTORE_RING_SIZE); } static void *get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) *len = XENSTORE_RING_SIZE - (prod - cons); return buf + MASK_XENSTORE_IDX(prod); } static const void *get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, const char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); if ((prod - cons) < *len) *len = prod - cons; return buf + MASK_XENSTORE_IDX(cons); } static int writechn(struct connection *conn, const void *data, unsigned int len) { uint32_t avail; void *dest; struct xenstore_domain_interface *intf = conn->domain->interface; XENSTORE_RING_IDX cons, prod; /* Must read indexes once, and before anything else, and verified. */ cons = intf->rsp_cons; prod = intf->rsp_prod; xen_mb(); if (!check_indexes(cons, prod)) { errno = EIO; return -1; } dest = get_output_chunk(cons, prod, intf->rsp, &avail); if (avail < len) len = avail; memcpy(dest, data, len); xen_mb(); intf->rsp_prod += len; xenevtchn_notify(xce_handle, conn->domain->port); return len; } static int readchn(struct connection *conn, void *data, unsigned int len) { uint32_t avail; const void *src; struct xenstore_domain_interface *intf = conn->domain->interface; XENSTORE_RING_IDX cons, prod; /* Must read indexes once, and before anything else, and verified. */ cons = intf->req_cons; prod = intf->req_prod; xen_mb(); if (!check_indexes(cons, prod)) { errno = EIO; return -1; } src = get_input_chunk(cons, prod, intf->req, &avail); if (avail < len) len = avail; memcpy(data, src, len); xen_mb(); intf->req_cons += len; xenevtchn_notify(xce_handle, conn->domain->port); return len; } static bool domain_can_write(struct connection *conn) { struct xenstore_domain_interface *intf = conn->domain->interface; return ((intf->rsp_prod - intf->rsp_cons) != XENSTORE_RING_SIZE); } static bool domain_can_read(struct connection *conn) { struct xenstore_domain_interface *intf = conn->domain->interface; if (domain_is_unprivileged(conn)) { if (conn->domain->wrl_credit < 0) return false; if (conn->domain->nboutstanding >= quota_req_outstanding) return false; if (conn->domain->memory >= quota_memory_per_domain_hard && quota_memory_per_domain_hard) return false; } return (intf->req_cons != intf->req_prod); } static const struct interface_funcs domain_funcs = { .write = writechn, .read = readchn, .can_write = domain_can_write, .can_read = domain_can_read, }; static void *map_interface(domid_t domid) { return xengnttab_map_grant_ref(*xgt_handle, domid, GNTTAB_RESERVED_XENSTORE, PROT_READ|PROT_WRITE); } static void unmap_interface(void *interface) { xengnttab_unmap(*xgt_handle, interface, 1); } static int domain_tree_remove_sub(const void *ctx, struct connection *conn, struct node *node, void *arg) { struct domain *domain = arg; TDB_DATA key; int ret = WALK_TREE_OK; if (node->perms.p[0].id != domain->domid) return WALK_TREE_OK; if (keep_orphans) { set_tdb_key(node->name, &key); domain->nbentry--; node->perms.p[0].id = priv_domid; node->acc.memory = 0; domain_nbentry_inc(NULL, priv_domid); if (write_node_raw(NULL, &key, node, true)) { /* That's unfortunate. We only can try to continue. */ syslog(LOG_ERR, "error when moving orphaned node %s to dom0\n", node->name); } else trace("orphaned node %s moved to dom0\n", node->name); } else { if (rm_node(NULL, ctx, node->name)) { /* That's unfortunate. We only can try to continue. */ syslog(LOG_ERR, "error when deleting orphaned node %s\n", node->name); } else trace("orphaned node %s deleted\n", node->name); /* Skip children in all cases in order to avoid more errors. */ ret = WALK_TREE_SKIP_CHILDREN; } return domain->nbentry > 0 ? ret : WALK_TREE_SUCCESS_STOP; } static void domain_tree_remove(struct domain *domain) { int ret; struct walk_funcs walkfuncs = { .enter = domain_tree_remove_sub }; if (domain->nbentry > 0) { ret = walk_node_tree(domain, NULL, "/", &walkfuncs, domain); if (ret == WALK_TREE_ERROR_STOP) syslog(LOG_ERR, "error when looking for orphaned nodes\n"); } walk_node_tree(domain, NULL, "@releaseDomain", &walkfuncs, domain); walk_node_tree(domain, NULL, "@introduceDomain", &walkfuncs, domain); } static void fire_special_watches(const char *name) { void *ctx = talloc_new(NULL); struct node *node; if (!ctx) return; node = read_node(NULL, ctx, name); if (node) fire_watches(NULL, ctx, name, node, true, NULL); else log("special node %s not found\n", name); talloc_free(ctx); } static int destroy_domain(void *_domain) { struct domain *domain = _domain; domain_tree_remove(domain); hashtable_remove(domhash, &domain->domid); if (!domain->introduced) return 0; if (domain->port) { if (xenevtchn_unbind(xce_handle, domain->port) == -1) eprintf("> Unbinding port %i failed!\n", domain->port); } if (domain->interface) { /* Domain 0 was mapped by dom0_init, so it must be unmapped using munmap() and not the grant unmap call. */ if (domain->domid == dom0_domid) unmap_xenbus(domain->interface); else unmap_interface(domain->interface); } fire_special_watches("@releaseDomain"); wrl_domain_destroy(domain); return 0; } static bool get_domain_info(unsigned int domid, xc_domaininfo_t *dominfo) { return xc_domain_getinfo_single(*xc_handle, domid, dominfo) == 0; } static int check_domain(const void *k, void *v, void *arg) { xc_domaininfo_t dominfo; struct connection *conn; bool dom_valid; struct domain *domain = v; bool *notify = arg; dom_valid = get_domain_info(domain->domid, &dominfo); if (!domain->introduced) { if (!dom_valid) talloc_free(domain); return 0; } if (dom_valid) { if ((dominfo.flags & XEN_DOMINF_shutdown) && !domain->shutdown) { domain->shutdown = true; *notify = true; } if (!(dominfo.flags & XEN_DOMINF_dying)) return 0; } if (domain->conn) { /* domain is a talloc child of domain->conn. */ conn = domain->conn; domain->conn = NULL; talloc_unlink(talloc_autofree_context(), conn); *notify = false; /* destroy_domain() fires the watch */ /* Above unlink might result in 2 domains being freed! */ return 1; } return 0; } void check_domains(void) { bool notify = false; while (hashtable_iterate(domhash, check_domain, ¬ify)) ; if (notify) fire_special_watches("@releaseDomain"); } /* We scan all domains rather than use the information given here. */ void handle_event(void) { evtchn_port_t port; if ((port = xenevtchn_pending(xce_handle)) == -1) barf_perror("Failed to read from event fd"); if (port == virq_port) check_domains(); if (xenevtchn_unmask(xce_handle, port) == -1) barf_perror("Failed to write to event fd"); } static char *talloc_domain_path(const void *context, unsigned int domid) { return talloc_asprintf(context, "/local/domain/%u", domid); } static struct domain *find_domain_struct(unsigned int domid) { return hashtable_search(domhash, &domid); } int domain_get_quota(const void *ctx, struct connection *conn, unsigned int domid) { struct domain *d = find_domain_struct(domid); char *resp; int ta; if (!d) return ENOENT; ta = d->conn ? d->conn->transaction_started : 0; resp = talloc_asprintf(ctx, "Domain %u:\n", domid); if (!resp) return ENOMEM; #define ent(t, e) \ resp = talloc_asprintf_append(resp, "%-16s: %8d\n", #t, e); \ if (!resp) return ENOMEM ent(nodes, d->nbentry); ent(watches, d->nbwatch); ent(transactions, ta); ent(outstanding, d->nboutstanding); ent(memory, d->memory); #undef ent send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); return 0; } static struct domain *alloc_domain(const void *context, unsigned int domid) { struct domain *domain; domain = talloc_zero(context, struct domain); if (!domain) { errno = ENOMEM; return NULL; } domain->domid = domid; domain->generation = generation; domain->introduced = false; if (!hashtable_insert(domhash, &domain->domid, domain)) { talloc_free(domain); errno = ENOMEM; return NULL; } talloc_set_destructor(domain, destroy_domain); return domain; } static struct domain *find_or_alloc_domain(const void *ctx, unsigned int domid) { struct domain *domain; domain = find_domain_struct(domid); return domain ? : alloc_domain(ctx, domid); } static struct domain *find_or_alloc_existing_domain(unsigned int domid) { struct domain *domain; xc_domaininfo_t dominfo; domain = find_domain_struct(domid); if (!domain && get_domain_info(domid, &dominfo)) domain = alloc_domain(NULL, domid); return domain; } static int new_domain(struct domain *domain, int port, bool restore) { int rc; domain->port = 0; domain->shutdown = false; domain->path = talloc_domain_path(domain, domain->domid); if (!domain->path) { errno = ENOMEM; return errno; } wrl_domain_new(domain); if (restore) domain->port = port; else { /* Tell kernel we're interested in this event. */ rc = xenevtchn_bind_interdomain(xce_handle, domain->domid, port); if (rc == -1) return errno; domain->port = rc; } domain->introduced = true; domain->conn = new_connection(&domain_funcs); if (!domain->conn) { errno = ENOMEM; return errno; } domain->conn->domain = domain; domain->conn->id = domain->domid; return 0; } static struct domain *find_domain_by_domid(unsigned int domid) { struct domain *d; d = find_domain_struct(domid); return (d && d->introduced) ? d : NULL; } int acc_fix_domains(struct list_head *head, bool chk_quota, bool update) { struct changed_domain *cd; int cnt; list_for_each_entry(cd, head, list) { cnt = domain_nbentry_fix(cd->domid, cd->nbentry, update); if (!update) { if (chk_quota && cnt >= quota_nb_entry_per_domain) return ENOSPC; if (cnt < 0) return ENOMEM; } } return 0; } static struct changed_domain *acc_find_changed_domain(struct list_head *head, unsigned int domid) { struct changed_domain *cd; list_for_each_entry(cd, head, list) { if (cd->domid == domid) return cd; } return NULL; } static struct changed_domain *acc_get_changed_domain(const void *ctx, struct list_head *head, unsigned int domid) { struct changed_domain *cd; cd = acc_find_changed_domain(head, domid); if (cd) return cd; cd = talloc_zero(ctx, struct changed_domain); if (!cd) return NULL; cd->domid = domid; list_add_tail(&cd->list, head); return cd; } static int acc_add_dom_nbentry(const void *ctx, struct list_head *head, int val, unsigned int domid) { struct changed_domain *cd; cd = acc_get_changed_domain(ctx, head, domid); if (!cd) return 0; errno = 0; cd->nbentry += val; return cd->nbentry; } static void domain_conn_reset(struct domain *domain) { struct connection *conn = domain->conn; conn_delete_all_watches(conn); conn_delete_all_transactions(conn); conn_free_buffered_data(conn); talloc_free(conn->in); domain->interface->req_cons = domain->interface->req_prod = 0; domain->interface->rsp_cons = domain->interface->rsp_prod = 0; } /* * Keep the connection alive but stop processing any new request or sending * reponse. This is to allow sending @releaseDomain watch event at the correct * moment and/or to allow the connection to restart (not yet implemented). * * All watches, transactions, buffers will be freed. */ void ignore_connection(struct connection *conn, unsigned int err) { trace("CONN %p ignored, reason %u\n", conn, err); if (conn->domain && conn->domain->interface) conn->domain->interface->error = err; conn->is_ignored = true; conn_delete_all_watches(conn); conn_delete_all_transactions(conn); conn_free_buffered_data(conn); talloc_free(conn->in); conn->in = NULL; /* if this is a socket connection, drop it now */ if (conn->fd >= 0) talloc_free(conn); } static struct domain *introduce_domain(const void *ctx, unsigned int domid, evtchn_port_t port, bool restore) { struct domain *domain; int rc; struct xenstore_domain_interface *interface; bool is_master_domain = (domid == xenbus_master_domid()); domain = find_or_alloc_domain(ctx, domid); if (!domain) return NULL; if (!domain->introduced) { interface = is_master_domain ? xenbus_map() : map_interface(domid); if (!interface && !restore) return NULL; if (new_domain(domain, port, restore)) { rc = errno; if (interface) { if (is_master_domain) unmap_xenbus(interface); else unmap_interface(interface); } errno = rc; return NULL; } domain->interface = interface; if (is_master_domain) setup_structure(restore); /* Now domain belongs to its connection. */ talloc_steal(domain->conn, domain); if (!restore) { /* Notify the domain that xenstore is available */ interface->connection = XENSTORE_CONNECTED; xenevtchn_notify(xce_handle, domain->port); } if (!is_master_domain && !restore) fire_special_watches("@introduceDomain"); } else { /* Use XS_INTRODUCE for recreating the xenbus event-channel. */ if (domain->port) xenevtchn_unbind(xce_handle, domain->port); rc = xenevtchn_bind_interdomain(xce_handle, domid, port); domain->port = (rc == -1) ? 0 : rc; } return domain; } /* domid, gfn, evtchn, path */ int do_introduce(const void *ctx, struct connection *conn, struct buffered_data *in) { struct domain *domain; char *vec[3]; unsigned int domid; evtchn_port_t port; if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) return EINVAL; domid = atoi(vec[0]); /* Ignore the gfn, we don't need it. */ port = atoi(vec[2]); /* Sanity check args. */ if (port <= 0) return EINVAL; domain = introduce_domain(ctx, domid, port, false); if (!domain) return errno; domain_conn_reset(domain); send_ack(conn, XS_INTRODUCE); return 0; } static struct domain *find_connected_domain(unsigned int domid) { struct domain *domain; domain = find_domain_by_domid(domid); if (!domain) return ERR_PTR(-ENOENT); if (!domain->conn) return ERR_PTR(-EINVAL); return domain; } int do_set_target(const void *ctx, struct connection *conn, struct buffered_data *in) { char *vec[2]; unsigned int domid, tdomid; struct domain *domain, *tdomain; if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) return EINVAL; domid = atoi(vec[0]); tdomid = atoi(vec[1]); domain = find_connected_domain(domid); if (IS_ERR(domain)) return -PTR_ERR(domain); tdomain = find_connected_domain(tdomid); if (IS_ERR(tdomain)) return -PTR_ERR(tdomain); talloc_reference(domain->conn, tdomain->conn); domain->conn->target = tdomain->conn; send_ack(conn, XS_SET_TARGET); return 0; } static struct domain *onearg_domain(struct connection *conn, struct buffered_data *in) { const char *domid_str = onearg(in); unsigned int domid; if (!domid_str) return ERR_PTR(-EINVAL); domid = atoi(domid_str); if (domid == dom0_domid) return ERR_PTR(-EINVAL); return find_connected_domain(domid); } /* domid */ int do_release(const void *ctx, struct connection *conn, struct buffered_data *in) { struct domain *domain; domain = onearg_domain(conn, in); if (IS_ERR(domain)) return -PTR_ERR(domain); /* Avoid triggering watch events when the domain's nodes are deleted. */ conn_delete_all_watches(domain->conn); talloc_free(domain->conn); send_ack(conn, XS_RELEASE); return 0; } int do_resume(const void *ctx, struct connection *conn, struct buffered_data *in) { struct domain *domain; domain = onearg_domain(conn, in); if (IS_ERR(domain)) return -PTR_ERR(domain); domain->shutdown = false; send_ack(conn, XS_RESUME); return 0; } int do_get_domain_path(const void *ctx, struct connection *conn, struct buffered_data *in) { char *path; const char *domid_str = onearg(in); if (!domid_str) return EINVAL; path = talloc_domain_path(ctx, atoi(domid_str)); if (!path) return errno; send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1); return 0; } int do_is_domain_introduced(const void *ctx, struct connection *conn, struct buffered_data *in) { int result; unsigned int domid; const char *domid_str = onearg(in); if (!domid_str) return EINVAL; domid = atoi(domid_str); if (domid == DOMID_SELF) result = 1; else result = (find_domain_by_domid(domid) != NULL); send_reply(conn, XS_IS_DOMAIN_INTRODUCED, result ? "T" : "F", 2); return 0; } /* Allow guest to reset all watches */ int do_reset_watches(const void *ctx, struct connection *conn, struct buffered_data *in) { conn_delete_all_watches(conn); conn_delete_all_transactions(conn); send_ack(conn, XS_RESET_WATCHES); return 0; } static int close_xc_handle(void *_handle) { xc_interface_close(*(xc_interface**)_handle); return 0; } static int close_xgt_handle(void *_handle) { xengnttab_close(*(xengnttab_handle **)_handle); return 0; } /* Returns the implicit path of a connection (only domains have this) */ const char *get_implicit_path(const struct connection *conn) { if (!conn->domain) return "/local/domain/0"; return conn->domain->path; } void dom0_init(void) { evtchn_port_t port; struct domain *dom0; port = xenbus_evtchn(); if (port == -1) barf_perror("Failed to initialize dom0 port"); dom0 = introduce_domain(NULL, xenbus_master_domid(), port, false); if (!dom0) barf_perror("Failed to initialize dom0"); xenevtchn_notify(xce_handle, dom0->port); } static unsigned int domhash_fn(const void *k) { return *(const unsigned int *)k; } static int domeq_fn(const void *key1, const void *key2) { return *(const unsigned int *)key1 == *(const unsigned int *)key2; } void domain_init(int evtfd) { int rc; /* Start with a random rather low domain count for the hashtable. */ domhash = create_hashtable(NULL, 8, domhash_fn, domeq_fn, 0); if (!domhash) barf_perror("Failed to allocate domain hashtable"); xc_handle = talloc(talloc_autofree_context(), xc_interface*); if (!xc_handle) barf_perror("Failed to allocate domain handle"); *xc_handle = xc_interface_open(0,0,0); if (!*xc_handle) barf_perror("Failed to open connection to hypervisor"); talloc_set_destructor(xc_handle, close_xc_handle); xgt_handle = talloc(talloc_autofree_context(), xengnttab_handle*); if (!xgt_handle) barf_perror("Failed to allocate domain gnttab handle"); *xgt_handle = xengnttab_open(NULL, 0); if (*xgt_handle == NULL) barf_perror("Failed to open connection to gnttab"); /* * Allow max number of domains for mappings. We allow one grant per * domain so the theoretical maximum is DOMID_FIRST_RESERVED. */ xengnttab_set_max_grants(*xgt_handle, DOMID_FIRST_RESERVED); talloc_set_destructor(xgt_handle, close_xgt_handle); if (evtfd < 0) xce_handle = xenevtchn_open(NULL, XENEVTCHN_NO_CLOEXEC); else xce_handle = xenevtchn_fdopen(NULL, evtfd, 0); if (xce_handle == NULL) barf_perror("Failed to open evtchn device"); if ((rc = xenevtchn_bind_virq(xce_handle, VIRQ_DOM_EXC)) == -1) barf_perror("Failed to bind to domain exception virq port"); virq_port = rc; } void domain_deinit(void) { if (virq_port) xenevtchn_unbind(xce_handle, virq_port); } /* * Check whether a domain was created before or after a specific generation * count (used for testing whether a node permission is older than a domain). * * Return values: * false: domain has higher generation count (it is younger than a node with * the given count), or domain isn't existing any longer * true: domain is older than the node */ static bool chk_domain_generation(unsigned int domid, uint64_t gen) { struct domain *d; if (!xc_handle && domid == dom0_domid) return true; d = find_domain_struct(domid); return d && d->generation <= gen; } /* * Allocate all missing struct domain referenced by a permission set. * Any permission entries for not existing domains will be marked to be * ignored. */ int domain_alloc_permrefs(struct node_perms *perms) { unsigned int i, domid; struct domain *d; xc_domaininfo_t dominfo; for (i = 0; i < perms->num; i++) { domid = perms->p[i].id; d = find_domain_struct(domid); if (!d) { if (!get_domain_info(domid, &dominfo)) perms->p[i].perms |= XS_PERM_IGNORE; else if (!alloc_domain(NULL, domid)) return ENOMEM; } } return 0; } /* * Remove permissions for no longer existing domains in order to avoid a new * domain with the same domid inheriting the permissions. */ int domain_adjust_node_perms(struct node *node) { unsigned int i; for (i = 1; i < node->perms.num; i++) { if (node->perms.p[i].perms & XS_PERM_IGNORE) continue; if (!chk_domain_generation(node->perms.p[i].id, node->generation)) node->perms.p[i].perms |= XS_PERM_IGNORE; } return 0; } static int domain_nbentry_add(struct connection *conn, unsigned int domid, int add, bool no_dom_alloc) { struct domain *d; struct list_head *head; int ret; if (conn && domid == conn->id && conn->domain) d = conn->domain; else if (no_dom_alloc) { d = find_domain_struct(domid); if (!d) { errno = ENOENT; corrupt(conn, "Missing domain %u\n", domid); return -1; } } else { d = find_or_alloc_existing_domain(domid); if (!d) { errno = ENOMEM; return -1; } } if (conn && conn->transaction) { head = transaction_get_changed_domains(conn->transaction); ret = acc_add_dom_nbentry(conn->transaction, head, add, domid); if (errno) { fail_transaction(conn->transaction); return -1; } /* * In a transaction when a node is being added/removed AND the * same node has been added/removed outside the transaction in * parallel, the resulting number of nodes will be wrong. This * is no problem, as the transaction will fail due to the * resulting conflict. * In the node remove case the resulting number can be even * negative, which should be avoided. */ return max(d->nbentry + ret, 0); } d->nbentry += add; return d->nbentry; } int domain_nbentry_inc(struct connection *conn, unsigned int domid) { return (domain_nbentry_add(conn, domid, 1, false) < 0) ? errno : 0; } int domain_nbentry_dec(struct connection *conn, unsigned int domid) { return (domain_nbentry_add(conn, domid, -1, true) < 0) ? errno : 0; } int domain_nbentry_fix(unsigned int domid, int num, bool update) { int ret; ret = domain_nbentry_add(NULL, domid, update ? num : 0, update); if (ret < 0 || update) return ret; return domid_is_unprivileged(domid) ? ret + num : 0; } int domain_nbentry(struct connection *conn) { return (domain_is_unprivileged(conn)) ? conn->domain->nbentry : 0; } static bool domain_chk_quota(struct domain *domain, int mem) { time_t now; if (!domain || !domid_is_unprivileged(domain->domid) || (domain->conn && domain->conn->is_ignored)) return false; now = time(NULL); if (mem >= quota_memory_per_domain_hard && quota_memory_per_domain_hard) { if (domain->hard_quota_reported) return true; syslog(LOG_ERR, "Domain %u exceeds hard memory quota, Xenstore interface to domain stalled\n", domain->domid); domain->mem_last_msg = now; domain->hard_quota_reported = true; return true; } if (now - domain->mem_last_msg >= MEM_WARN_MINTIME_SEC) { if (domain->hard_quota_reported) { domain->mem_last_msg = now; domain->hard_quota_reported = false; syslog(LOG_INFO, "Domain %u below hard memory quota again\n", domain->domid); } if (mem >= quota_memory_per_domain_soft && quota_memory_per_domain_soft && !domain->soft_quota_reported) { domain->mem_last_msg = now; domain->soft_quota_reported = true; syslog(LOG_WARNING, "Domain %u exceeds soft memory quota\n", domain->domid); } if (mem < quota_memory_per_domain_soft && domain->soft_quota_reported) { domain->mem_last_msg = now; domain->soft_quota_reported = false; syslog(LOG_INFO, "Domain %u below soft memory quota again\n", domain->domid); } } return false; } int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) { struct domain *domain; domain = find_domain_struct(domid); if (domain) { /* * domain_chk_quota() will print warning and also store whether * the soft/hard quota has been hit. So check no_quota_check * *after*. */ if (domain_chk_quota(domain, domain->memory + mem) && !no_quota_check) return ENOMEM; domain->memory += mem; } else { /* * The domain the memory is to be accounted for should always * exist, as accounting is done either for a domain related to * the current connection, or for the domain owning a node * (which is always existing, as the owner of the node is * tested to exist and deleted or replaced by domid 0 if not). * So not finding the related domain MUST be an error in the * data base. */ errno = ENOENT; corrupt(NULL, "Accounting called for non-existing domain %u\n", domid); return ENOENT; } return 0; } void domain_watch_inc(struct connection *conn) { if (!conn || !conn->domain) return; conn->domain->nbwatch++; } void domain_watch_dec(struct connection *conn) { if (!conn || !conn->domain) return; if (conn->domain->nbwatch) conn->domain->nbwatch--; } int domain_watch(struct connection *conn) { return (domain_is_unprivileged(conn)) ? conn->domain->nbwatch : 0; } void domain_outstanding_inc(struct connection *conn) { if (!conn || !conn->domain) return; conn->domain->nboutstanding++; } void domain_outstanding_dec(struct connection *conn) { if (!conn || !conn->domain) return; conn->domain->nboutstanding--; } void domain_outstanding_domid_dec(unsigned int domid) { struct domain *d = find_domain_by_domid(domid); if (d) d->nboutstanding--; } static wrl_creditt wrl_config_writecost = WRL_FACTOR; static wrl_creditt wrl_config_rate = WRL_RATE * WRL_FACTOR; static wrl_creditt wrl_config_dburst = WRL_DBURST * WRL_FACTOR; static wrl_creditt wrl_config_gburst = WRL_GBURST * WRL_FACTOR; static wrl_creditt wrl_config_newdoms_dburst = WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR; long wrl_ntransactions; static long wrl_ndomains; static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */ static time_t wrl_log_last_warning; /* 0: no previous warning */ #define trace_wrl(...) \ do { \ if (trace_flags & TRACE_WRL) \ trace("wrl: " __VA_ARGS__); \ } while (0) void wrl_gettime_now(struct wrl_timestampt *now_wt) { struct timespec now_ts; int r; r = clock_gettime(CLOCK_MONOTONIC, &now_ts); if (r) barf_perror("Could not find time (clock_gettime failed)"); now_wt->sec = now_ts.tv_sec; now_wt->msec = now_ts.tv_nsec / 1000000; } static void wrl_xfer_credit(wrl_creditt *debit, wrl_creditt debit_floor, wrl_creditt *credit, wrl_creditt credit_ceil) /* * Transfers zero or more credit from "debit" to "credit". * Transfers as much as possible while maintaining * debit >= debit_floor and credit <= credit_ceil. * (If that's violated already, does nothing.) * * Sufficient conditions to avoid overflow, either of: * |every argument| <= 0x3fffffff * |every argument| <= 1E9 * |every argument| <= WRL_CREDIT_MAX * (And this condition is preserved.) */ { wrl_creditt xfer = MIN( *debit - debit_floor, credit_ceil - *credit ); if (xfer > 0) { *debit -= xfer; *credit += xfer; } } void wrl_domain_new(struct domain *domain) { domain->wrl_credit = 0; wrl_gettime_now(&domain->wrl_timestamp); wrl_ndomains++; /* Steal up to DBURST from the reserve */ wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst, &domain->wrl_credit, wrl_config_dburst); } void wrl_domain_destroy(struct domain *domain) { wrl_ndomains--; /* * Don't bother recalculating domain's credit - this just * means we don't give the reserve the ending domain's credit * for time elapsed since last update. */ wrl_xfer_credit(&domain->wrl_credit, 0, &wrl_reserve, wrl_config_dburst); } void wrl_credit_update(struct domain *domain, struct wrl_timestampt now) { /* * We want to calculate * credit += (now - timestamp) * RATE / ndoms; * But we want it to saturate, and to avoid floating point. * To avoid rounding errors from constantly adding small * amounts of credit, we only add credit for whole milliseconds. */ long seconds = now.sec - domain->wrl_timestamp.sec; long milliseconds = now.msec - domain->wrl_timestamp.msec; long msec; int64_t denom, num; wrl_creditt surplus; seconds = MIN(seconds, 1000*1000); /* arbitrary, prevents overflow */ msec = seconds * 1000 + milliseconds; if (msec < 0) /* shouldn't happen with CLOCK_MONOTONIC */ msec = 0; /* 32x32 -> 64 cannot overflow */ denom = (int64_t)msec * wrl_config_rate; num = (int64_t)wrl_ndomains * 1000; /* denom / num <= 1E6 * wrl_config_rate, so with reasonable wrl_config_rate, denom / num << 2^64 */ /* at last! */ domain->wrl_credit = MIN( (int64_t)domain->wrl_credit + denom / num, WRL_CREDIT_MAX ); /* (maybe briefly violating the DBURST cap on wrl_credit) */ /* maybe take from the reserve to make us nonnegative */ wrl_xfer_credit(&wrl_reserve, 0, &domain->wrl_credit, 0); /* return any surplus (over DBURST) to the reserve */ surplus = 0; wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst, &surplus, WRL_CREDIT_MAX); wrl_xfer_credit(&surplus, 0, &wrl_reserve, wrl_config_gburst); /* surplus is now implicitly discarded */ domain->wrl_timestamp = now; trace_wrl("dom %4d %6ld msec %9ld credit %9ld reserve %9ld discard\n", domain->domid, msec, (long)domain->wrl_credit, (long)wrl_reserve, (long)surplus); } void wrl_check_timeout(struct domain *domain, struct wrl_timestampt now, int *ptimeout) { uint64_t num, denom; int wakeup; wrl_credit_update(domain, now); if (domain->wrl_credit >= 0) /* not blocked */ return; if (!*ptimeout) /* already decided on immediate wakeup, so no need to calculate our timeout */ return; /* calculate wakeup = now + -credit / (RATE / ndoms); */ /* credit cannot go more -ve than one transaction, * so the first multiplication cannot overflow even 32-bit */ num = (uint64_t)(-domain->wrl_credit * 1000) * wrl_ndomains; denom = wrl_config_rate; wakeup = MIN( num / denom /* uint64_t */, INT_MAX ); if (*ptimeout==-1 || wakeup < *ptimeout) *ptimeout = wakeup; trace_wrl("domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n", domain->domid, (long)domain->wrl_credit, (long)wrl_reserve, wakeup); } #define WRL_LOG(now, ...) \ (syslog(LOG_WARNING, "write rate limit: " __VA_ARGS__)) void wrl_apply_debit_actual(struct domain *domain) { struct wrl_timestampt now; if (!domain || !domid_is_unprivileged(domain->domid)) /* sockets and privileged domain escape the write rate limit */ return; wrl_gettime_now(&now); wrl_credit_update(domain, now); domain->wrl_credit -= wrl_config_writecost; trace_wrl("domain %u credit=%ld (reserve=%ld)\n", domain->domid, (long)domain->wrl_credit, (long)wrl_reserve); if (domain->wrl_credit < 0) { if (!domain->wrl_delay_logged) { domain->wrl_delay_logged = true; WRL_LOG(now, "domain %ld is affected\n", (long)domain->domid); } else if (!wrl_log_last_warning) { WRL_LOG(now, "rate limiting restarts\n"); } wrl_log_last_warning = now.sec; } } void wrl_log_periodic(struct wrl_timestampt now) { if (wrl_log_last_warning && (now.sec - wrl_log_last_warning) > WRL_LOGEVERY) { WRL_LOG(now, "not in force recently\n"); wrl_log_last_warning = 0; } } void wrl_apply_debit_direct(struct connection *conn) { if (!conn) /* some writes are generated internally */ return; if (conn->transaction) /* these are accounted for when the transaction ends */ return; if (!wrl_ntransactions) /* we don't conflict with anyone */ return; wrl_apply_debit_actual(conn->domain); } void wrl_apply_debit_trans_commit(struct connection *conn) { if (wrl_ntransactions <= 1) /* our own transaction appears in the counter */ return; wrl_apply_debit_actual(conn->domain); } const char *dump_state_connections(FILE *fp) { const char *ret = NULL; unsigned int conn_id = 1; struct xs_state_connection sc; struct xs_state_record_header head; struct connection *c; list_for_each_entry(c, &connections, list) { head.type = XS_STATE_TYPE_CONN; head.length = sizeof(sc); sc.conn_id = conn_id++; sc.pad = 0; memset(&sc.spec, 0, sizeof(sc.spec)); if (c->domain) { sc.conn_type = XS_STATE_CONN_TYPE_RING; sc.spec.ring.domid = c->id; sc.spec.ring.tdomid = c->target ? c->target->id : DOMID_INVALID; sc.spec.ring.evtchn = c->domain->port; } else { sc.conn_type = XS_STATE_CONN_TYPE_SOCKET; sc.spec.socket_fd = c->fd; } ret = dump_state_buffered_data(NULL, c, &sc); if (ret) return ret; head.length += sc.data_in_len + sc.data_out_len; head.length = ROUNDUP(head.length, 3); if (fwrite(&head, sizeof(head), 1, fp) != 1) return "Dump connection state error"; if (fwrite(&sc, offsetof(struct xs_state_connection, data), 1, fp) != 1) return "Dump connection state error"; ret = dump_state_buffered_data(fp, c, NULL); if (ret) return ret; ret = dump_state_align(fp); if (ret) return ret; ret = dump_state_watches(fp, c, sc.conn_id); if (ret) return ret; } return ret; } void read_state_connection(const void *ctx, const void *state) { const struct xs_state_connection *sc = state; struct connection *conn; struct domain *domain, *tdomain; if (sc->conn_type == XS_STATE_CONN_TYPE_SOCKET) { #ifdef NO_SOCKETS barf("socket based connection without sockets"); #else conn = new_connection(&socket_funcs); if (!conn) barf("error restoring connection"); conn->fd = sc->spec.socket_fd; #endif } else { domain = introduce_domain(ctx, sc->spec.ring.domid, sc->spec.ring.evtchn, true); if (!domain) barf("domain allocation error"); conn = domain->conn; /* * We may not have been able to restore the domain (for * instance because it revoked the Xenstore grant). We need * to keep it around to send @releaseDomain when it is * dead. So mark it as ignored. */ if (!domain->port || !domain->interface) ignore_connection(conn, XENSTORE_ERROR_COMM); if (sc->spec.ring.tdomid != DOMID_INVALID) { tdomain = find_or_alloc_domain(ctx, sc->spec.ring.tdomid); if (!tdomain) barf("target domain allocation error"); talloc_reference(domain->conn, tdomain->conn); domain->conn->target = tdomain->conn; } } conn->conn_id = sc->conn_id; read_state_buffered_data(ctx, conn, sc); } struct domain_acc { unsigned int domid; int nodes; }; static int domain_check_acc_init_sub(const void *k, void *v, void *arg) { struct hashtable *domains = arg; struct domain *d = v; struct domain_acc *dom; dom = talloc_zero(NULL, struct domain_acc); if (!dom) return -1; dom->domid = d->domid; /* * Set the initial value to the negative one of the current domain. * If everything is correct incrementing the value for each node will * result in dom->nodes being 0 at the end. */ dom->nodes = -d->nbentry; if (!hashtable_insert(domains, &dom->domid, dom)) { talloc_free(dom); return -1; } return 0; } struct hashtable *domain_check_acc_init(void) { struct hashtable *domains; domains = create_hashtable(NULL, 8, domhash_fn, domeq_fn, HASHTABLE_FREE_VALUE); if (!domains) return NULL; if (hashtable_iterate(domhash, domain_check_acc_init_sub, domains)) { hashtable_destroy(domains); return NULL; } return domains; } void domain_check_acc_add(const struct node *node, struct hashtable *domains) { struct domain_acc *dom; unsigned int domid; domid = get_node_owner(node); dom = hashtable_search(domains, &domid); if (!dom) log("Node %s owned by unknown domain %u", node->name, domid); else dom->nodes++; } static int domain_check_acc_cb(const void *k, void *v, void *arg) { struct domain_acc *dom = v; struct domain *d; if (!dom->nodes) return 0; log("Correct accounting data for domain %u: nodes are %d off", dom->domid, dom->nodes); d = find_domain_struct(dom->domid); if (!d) return 0; d->nbentry += dom->nodes; return 0; } void domain_check_acc(struct hashtable *domains) { hashtable_iterate(domains, domain_check_acc_cb, NULL); } /* * Local variables: * mode: C * c-file-style: "linux" * indent-tabs-mode: t * c-basic-offset: 8 * tab-width: 8 * End: */