/******************************************************************************
* common/grant_table.c
*
* Mechanism for granting foreign access to page frames, and receiving
* page-ownership transfers.
*
* Copyright (c) 2005-2006 Christopher Clark
* Copyright (c) 2004 K A Fraser
* Copyright (c) 2005 Andrew Warfield
* Modifications by Geoffrey Lefebvre are (c) Intel Research Cambridge
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/err.h>
#include <xen/iocap.h>
#include <xen/lib.h>
#include <xen/param.h>
#include <xen/sched.h>
#include <xen/mm.h>
#include <xen/event.h>
#include <xen/trace.h>
#include <xen/grant_table.h>
#include <xen/guest_access.h>
#include <xen/domain_page.h>
#include <xen/iommu.h>
#include <xen/paging.h>
#include <xen/keyhandler.h>
#include <xen/radix-tree.h>
#include <xen/vmap.h>
#include <xen/nospec.h>
#include <xsm/xsm.h>
#include <asm/flushtlb.h>
#include <asm/guest_atomics.h>
#ifdef CONFIG_PV_SHIM
#include <asm/guest.h>
#endif
/* Per-domain grant information. */
struct grant_table {
/*
* Lock protecting updates to grant table state (version, active
* entry list, etc.)
*/
percpu_rwlock_t lock;
/* Lock protecting the maptrack limit */
spinlock_t maptrack_lock;
unsigned int max_version;
/*
* Defaults to v1. May be changed with GNTTABOP_set_version. All other
* values are invalid.
*/
unsigned int gt_version;
/* Resource limits of the domain. */
unsigned int max_grant_frames;
unsigned int max_maptrack_frames;
/* Table size. Number of frames shared with guest */
unsigned int nr_grant_frames;
/* Number of grant status frames shared with guest (for version 2) */
unsigned int nr_status_frames;
/*
* Number of available maptrack entries. For cleanup purposes it is
* important to realize that this field and @maptrack further down will
* only ever be accessed by the local domain. Thus it is okay to clean
* up early, and to shrink the limit for the purpose of tracking cleanup
* progress.
*/
unsigned int maptrack_limit;
/* Shared grant table (see include/public/grant_table.h). */
union {
void **shared_raw;
struct grant_entry_v1 **shared_v1;
union grant_entry_v2 **shared_v2;
};
/* State grant table (see include/public/grant_table.h). */
grant_status_t **status;
/* Active grant table. */
struct active_grant_entry **active;
/* Handle-indexed tracking table of mappings. */
struct grant_mapping **maptrack;
/*
* MFN-indexed tracking tree of mappings, if needed. Note that this is
* protected by @lock, not @maptrack_lock.
*/
struct radix_tree_root maptrack_tree;
/* Domain to which this struct grant_table belongs. */
struct domain *domain;
};
unsigned int __read_mostly opt_max_grant_frames = 64;
static unsigned int __read_mostly opt_max_maptrack_frames = 1024;
#ifdef CONFIG_HYPFS
#define GRANT_CUSTOM_VAL_SZ 12
static char __read_mostly opt_max_grant_frames_val[GRANT_CUSTOM_VAL_SZ];
static char __read_mostly opt_max_maptrack_frames_val[GRANT_CUSTOM_VAL_SZ];
static void update_gnttab_par(unsigned int val, struct param_hypfs *par,
char *parval)
{
snprintf(parval, GRANT_CUSTOM_VAL_SZ, "%u", val);
custom_runtime_set_var_sz(par, parval, GRANT_CUSTOM_VAL_SZ);
}
static void __init cf_check gnttab_max_frames_init(struct param_hypfs *par)
{
update_gnttab_par(opt_max_grant_frames, par, opt_max_grant_frames_val);
}
static void __init cf_check max_maptrack_frames_init(struct param_hypfs *par)
{
update_gnttab_par(opt_max_maptrack_frames, par,
opt_max_maptrack_frames_val);
}
#else
#define update_gnttab_par(v, unused1, unused2) update_gnttab_par(v)
#define parse_gnttab_limit(a, v, unused1, unused2) parse_gnttab_limit(a, v)
static void update_gnttab_par(unsigned int val, struct param_hypfs *par,
char *parval)
{
}
#endif
static int parse_gnttab_limit(const char *arg, unsigned int *valp,
struct param_hypfs *par, char *parval)
{
const char *e;
unsigned long val;
val = simple_strtoul(arg, &e, 0);
if ( *e )
return -EINVAL;
if ( val > INT_MAX )
return -ERANGE;
*valp = val;
update_gnttab_par(val, par, parval);
return 0;
}
static int cf_check parse_gnttab_max_frames(const char *arg);
custom_runtime_param("gnttab_max_frames", parse_gnttab_max_frames,
gnttab_max_frames_init);
static int cf_check parse_gnttab_max_frames(const char *arg)
{
return parse_gnttab_limit(arg, &opt_max_grant_frames,
param_2_parfs(parse_gnttab_max_frames),
opt_max_grant_frames_val);
}
static int cf_check parse_gnttab_max_maptrack_frames(const char *arg);
custom_runtime_param("gnttab_max_maptrack_frames",
parse_gnttab_max_maptrack_frames,
max_maptrack_frames_init);
static int cf_check parse_gnttab_max_maptrack_frames(const char *arg)
{
return parse_gnttab_limit(arg, &opt_max_maptrack_frames,
param_2_parfs(parse_gnttab_max_maptrack_frames),
opt_max_maptrack_frames_val);
}
#ifndef GNTTAB_MAX_VERSION
#define GNTTAB_MAX_VERSION 2
#endif
unsigned int __read_mostly opt_gnttab_max_version = GNTTAB_MAX_VERSION;
static bool __read_mostly opt_transitive_grants = true;
#ifdef CONFIG_PV
static bool __ro_after_init opt_grant_transfer = true;
#else
#define opt_grant_transfer false
#endif
static int __init cf_check parse_gnttab(const char *s)
{
const char *ss, *e;
int val, rc = 0;
do {
ss = strchr(s, ',');
if ( !ss )
ss = strchr(s, '\0');
if ( !strncmp(s, "max-ver:", 8) ||
!strncmp(s, "max_ver:", 8) ) /* Alias for original XSA-226 patch */
{
long ver = simple_strtol(s + 8, &e, 10);
if ( e == ss && ver >= 1 && ver <= 2 )
opt_gnttab_max_version = ver;
else
rc = -EINVAL;
}
else if ( (val = parse_boolean("transitive", s, ss)) >= 0 )
opt_transitive_grants = val;
#ifndef opt_grant_transfer
else if ( (val = parse_boolean("transfer", s, ss)) >= 0 )
opt_grant_transfer = val;
#endif
else
rc = -EINVAL;
s = ss + 1;
} while ( *ss );
return rc;
}
custom_param("gnttab", parse_gnttab);
/*
* Note that the three values below are effectively part of the ABI, even if
* we don't need to make them a formal part of it: A guest suspended for
* migration in the middle of a continuation would fail to work if resumed on
* a hypervisor using different values.
*/
#define GNTTABOP_CONTINUATION_ARG_SHIFT 12
#define GNTTABOP_CMD_MASK               ((1 << GNTTABOP_CONTINUATION_ARG_SHIFT) - 1)
#define GNTTABOP_ARG_MASK               (~GNTTABOP_CMD_MASK)
/* Number of grant table frames. Caller must hold d's grant table lock. */
static inline unsigned int nr_grant_frames(const struct grant_table *gt)
{
    return gt->nr_grant_frames;
}
/* Number of status grant table frames. Caller must hold d's gr. table lock.*/
static inline unsigned int nr_status_frames(const struct grant_table *gt)
{
return gt->nr_status_frames;
}
#define MAPTRACK_PER_PAGE (PAGE_SIZE / sizeof(struct grant_mapping))
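/*
 * Note: the handle is clamped with array_index_nospec() against
 * maptrack_limit, so a speculatively out-of-bounds handle cannot be used to
 * index past the tracking table.
 */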
#define maptrack_entry(t, e) \
((t)->maptrack[array_index_nospec(e, (t)->maptrack_limit) / \
MAPTRACK_PER_PAGE][(e) % MAPTRACK_PER_PAGE])
static inline unsigned int
nr_maptrack_frames(struct grant_table *t)
{
return t->maptrack_limit / MAPTRACK_PER_PAGE;
}
#define MAPTRACK_TAIL (~0u)
#define SHGNT_PER_PAGE_V1 (PAGE_SIZE / sizeof(grant_entry_v1_t))
#define shared_entry_v1(t, e) \
((t)->shared_v1[(e)/SHGNT_PER_PAGE_V1][(e)%SHGNT_PER_PAGE_V1])
#define SHGNT_PER_PAGE_V2 (PAGE_SIZE / sizeof(grant_entry_v2_t))
#define shared_entry_v2(t, e) \
((t)->shared_v2[(e)/SHGNT_PER_PAGE_V2][(e)%SHGNT_PER_PAGE_V2])
#define STGNT_PER_PAGE (PAGE_SIZE / sizeof(grant_status_t))
#define status_entry(t, e) \
((t)->status[(e)/STGNT_PER_PAGE][(e)%STGNT_PER_PAGE])
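/*
 * For illustration, with 4kB pages: a v1 frame holds 512 8-byte entries, a
 * v2 frame holds 256 16-byte entries, and a status frame holds 2048 2-byte
 * status words (entry sizes per the public grant_table.h ABI).
 */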
static grant_entry_header_t *
shared_entry_header(struct grant_table *t, grant_ref_t ref)
{
switch ( t->gt_version )
{
case 1:
/* Returned values should be independent of speculative execution */
block_speculation();
return (grant_entry_header_t*)&shared_entry_v1(t, ref);
case 2:
/* Returned values should be independent of speculative execution */
block_speculation();
return &shared_entry_v2(t, ref).hdr;
}
ASSERT_UNREACHABLE();
block_speculation();
return NULL;
}
/* Active grant entry - used for shadowing GTF_permit_access grants. */
struct active_grant_entry {
/*
* 4x byte-wide reference counts, for {host,device}{read,write} mappings,
 * implemented as a single 32-bit value (presumably to optimise checking for any
* reference).
*/
uint32_t pin;
/* Width of the individual counter fields. */
#define GNTPIN_cntr_width 8
#define GNTPIN_cntr_mask ((1U << GNTPIN_cntr_width) - 1)
/* Count of writable host-CPU mappings. */
#define GNTPIN_hstw_shift 0
#define GNTPIN_hstw_inc (1U << GNTPIN_hstw_shift)
#define GNTPIN_hstw_mask (GNTPIN_cntr_mask << GNTPIN_hstw_shift)
/* Count of read-only host-CPU mappings. */
#define GNTPIN_hstr_shift (GNTPIN_hstw_shift + GNTPIN_cntr_width)
#define GNTPIN_hstr_inc (1U << GNTPIN_hstr_shift)
#define GNTPIN_hstr_mask (GNTPIN_cntr_mask << GNTPIN_hstr_shift)
/* Count of writable device-bus mappings. */
#define GNTPIN_devw_shift (GNTPIN_hstr_shift + GNTPIN_cntr_width)
#define GNTPIN_devw_inc (1U << GNTPIN_devw_shift)
#define GNTPIN_devw_mask (GNTPIN_cntr_mask << GNTPIN_devw_shift)
/* Count of read-only device-bus mappings. */
#define GNTPIN_devr_shift (GNTPIN_devw_shift + GNTPIN_cntr_width)
#define GNTPIN_devr_inc (1U << GNTPIN_devr_shift)
#define GNTPIN_devr_mask (GNTPIN_cntr_mask << GNTPIN_devr_shift)
/* Convert a combination of GNTPIN_*_inc to an overflow checking mask. */
#define GNTPIN_incr2oflow_mask(x) ({ \
ASSERT(!((x) & ~(GNTPIN_hstw_inc | GNTPIN_hstr_inc | \
GNTPIN_devw_inc | GNTPIN_devr_inc))); \
(x) << (GNTPIN_cntr_width - 1); \
})
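    /*
     * Illustration (not from the code): pin == 0x00000102 decodes as two
     * writable host mappings (hstw counter == 2) and one read-only host
     * mapping (hstr counter == 1), with no device-bus mappings.
     */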
domid_t domid; /* Domain being granted access. */
domid_t src_domid; /* Original domain granting access. */
unsigned int start:15; /* For sub-page grants, the start offset
in the page. */
bool is_sub_page:1; /* True if this is a sub-page grant. */
unsigned int length:16; /* For sub-page grants, the length of the
grant. */
grant_ref_t trans_gref;
mfn_t mfn; /* Machine frame being granted. */
#ifndef NDEBUG
gfn_t gfn; /* Guest's idea of the frame being granted. */
#endif
spinlock_t lock; /* lock to protect access of this entry.
see docs/misc/grant-tables.txt for
locking protocol */
};
#define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry))
#define _active_entry(t, e) \
((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn)
{
#ifndef NDEBUG
act->gfn = gfn;
#endif
}
static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
static inline void grant_read_lock(struct grant_table *gt)
{
    percpu_read_lock(grant_rwlock, &gt->lock);
}
static inline void grant_read_unlock(struct grant_table *gt)
{
    percpu_read_unlock(grant_rwlock, &gt->lock);
}
static inline void grant_write_lock(struct grant_table *gt)
{
    percpu_write_lock(grant_rwlock, &gt->lock);
}
static inline void grant_write_unlock(struct grant_table *gt)
{
    percpu_write_unlock(grant_rwlock, &gt->lock);
}
static inline void gnttab_flush_tlb(const struct domain *d)
{
if ( !paging_mode_external(d) )
arch_flush_tlb_mask(d->dirty_cpumask);
}
static inline unsigned int
num_act_frames_from_sha_frames(const unsigned int num)
{
/*
* How many frames are needed for the active grant table,
* given the size of the shared grant table?
*/
unsigned int sha_per_page = PAGE_SIZE / sizeof(grant_entry_v1_t);
return DIV_ROUND_UP(num * sha_per_page, ACGNT_PER_PAGE);
}
#define max_nr_active_grant_frames(gt) \
num_act_frames_from_sha_frames((gt)->max_grant_frames)
static inline unsigned int
nr_active_grant_frames(struct grant_table *gt)
{
return num_act_frames_from_sha_frames(nr_grant_frames(gt));
}
static inline struct active_grant_entry *
active_entry_acquire(struct grant_table *t, grant_ref_t e)
{
struct active_grant_entry *act;
/*
* The grant table for the active entry should be locked but the
* percpu rwlock cannot be checked for read lock without race conditions
* or high overhead so we cannot use an ASSERT
*
* ASSERT(rw_is_locked(&t->lock));
*/
act = &_active_entry(t, e);
spin_lock(&act->lock);
return act;
}
static inline void active_entry_release(struct active_grant_entry *act)
{
spin_unlock(&act->lock);
}
#define GRANT_STATUS_PER_PAGE (PAGE_SIZE / sizeof(grant_status_t))
#define GRANT_PER_PAGE (PAGE_SIZE / sizeof(grant_entry_v2_t))
static inline unsigned int grant_to_status_frames(unsigned int grant_frames)
{
return DIV_ROUND_UP(grant_frames * GRANT_PER_PAGE, GRANT_STATUS_PER_PAGE);
}
static inline unsigned int status_to_grant_frames(unsigned int status_frames)
{
return DIV_ROUND_UP(status_frames * GRANT_STATUS_PER_PAGE, GRANT_PER_PAGE);
}
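/*
 * Worked example (4kB pages): GRANT_PER_PAGE is 256 and
 * GRANT_STATUS_PER_PAGE is 2048, so up to eight v2 grant frames are covered
 * by a single status frame.
 */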
/* Check if the page has been paged out, or needs unsharing.
If rc == GNTST_okay, *page contains the page struct with a ref taken.
Caller must do put_page(*page).
If any error, *page = NULL, *mfn = INVALID_MFN, no ref taken. */
static int get_paged_frame(unsigned long gfn, mfn_t *mfn,
struct page_info **page, bool readonly,
struct domain *rd)
{
p2m_type_t p2mt;
int rc;
rc = check_get_page_from_gfn(rd, _gfn(gfn), readonly, &p2mt, page);
switch ( rc )
{
case 0:
break;
case -EAGAIN:
return GNTST_eagain;
default:
ASSERT_UNREACHABLE();
/* Fallthrough */
case -EINVAL:
return GNTST_bad_page;
}
if ( p2m_is_foreign(p2mt) )
{
put_page(*page);
*page = NULL;
return GNTST_bad_page;
}
*mfn = page_to_mfn(*page);
return GNTST_okay;
}
#define INVALID_MAPTRACK_HANDLE UINT_MAX
static inline grant_handle_t
_get_maptrack_handle(struct grant_table *t, struct vcpu *v)
{
unsigned int head, next;
spin_lock(&v->maptrack_freelist_lock);
/* No maptrack pages allocated for this VCPU yet? */
head = v->maptrack_head;
if ( unlikely(head == MAPTRACK_TAIL) )
{
spin_unlock(&v->maptrack_freelist_lock);
return INVALID_MAPTRACK_HANDLE;
}
/*
* Always keep one entry in the free list to make it easier to
* add free entries to the tail.
*/
next = maptrack_entry(t, head).ref;
if ( unlikely(next == MAPTRACK_TAIL) )
head = INVALID_MAPTRACK_HANDLE;
else
v->maptrack_head = next;
spin_unlock(&v->maptrack_freelist_lock);
return head;
}
/*
* Try to "steal" a free maptrack entry from another VCPU.
*
* A stolen entry is transferred to the thief, so the number of
* entries for each VCPU should tend to the usage pattern.
*
* To avoid having to atomically count the number of free entries on
 * each VCPU and to avoid two VCPUs repeatedly stealing entries from
* each other, the initial victim VCPU is selected randomly.
*/
static grant_handle_t steal_maptrack_handle(struct grant_table *t,
const struct vcpu *curr)
{
const struct domain *currd = curr->domain;
unsigned int first, i;
/* Find an initial victim. */
first = i = get_random() % currd->max_vcpus;
do {
if ( currd->vcpu[i] )
{
grant_handle_t handle;
handle = _get_maptrack_handle(t, currd->vcpu[i]);
if ( handle != INVALID_MAPTRACK_HANDLE )
{
maptrack_entry(t, handle).vcpu = curr->vcpu_id;
return handle;
}
}
i++;
if ( i == currd->max_vcpus )
i = 0;
} while ( i != first );
/* No free handles on any VCPU. */
return INVALID_MAPTRACK_HANDLE;
}
static inline void
put_maptrack_handle(
struct grant_table *t, grant_handle_t handle)
{
struct domain *currd = current->domain;
struct vcpu *v;
unsigned int tail;
/* 1. Set entry to be a tail. */
maptrack_entry(t, handle).ref = MAPTRACK_TAIL;
/* 2. Add entry to the tail of the list on the original VCPU. */
v = currd->vcpu[maptrack_entry(t, handle).vcpu];
spin_lock(&v->maptrack_freelist_lock);
tail = v->maptrack_tail;
v->maptrack_tail = handle;
/* 3. Update the old tail entry to point to the new entry. */
maptrack_entry(t, tail).ref = handle;
spin_unlock(&v->maptrack_freelist_lock);
}
static inline grant_handle_t
get_maptrack_handle(
struct grant_table *lgt)
{
struct vcpu *curr = current;
unsigned int i;
grant_handle_t handle;
struct grant_mapping *new_mt = NULL;
handle = _get_maptrack_handle(lgt, curr);
if ( likely(handle != INVALID_MAPTRACK_HANDLE) )
return handle;
spin_lock(&lgt->maptrack_lock);
/*
* If we've run out of handles and still have frame headroom, try
* allocating a new maptrack frame. If there is no headroom, or we're
* out of memory, try stealing an entry from another VCPU (in case the
* guest isn't mapping across its VCPUs evenly).
*/
if ( nr_maptrack_frames(lgt) < lgt->max_maptrack_frames )
new_mt = alloc_xenheap_page();
if ( !new_mt )
{
spin_unlock(&lgt->maptrack_lock);
/*
* Uninitialized free list? Steal an extra entry for the tail
* sentinel.
*/
if ( curr->maptrack_tail == MAPTRACK_TAIL )
{
handle = steal_maptrack_handle(lgt, curr);
if ( handle == INVALID_MAPTRACK_HANDLE )
return handle;
spin_lock(&curr->maptrack_freelist_lock);
maptrack_entry(lgt, handle).ref = MAPTRACK_TAIL;
curr->maptrack_tail = handle;
if ( curr->maptrack_head == MAPTRACK_TAIL )
curr->maptrack_head = handle;
spin_unlock(&curr->maptrack_freelist_lock);
}
return steal_maptrack_handle(lgt, curr);
}
clear_page(new_mt);
/*
* Use the first new entry and add the remaining entries to the
* head of the free list.
*/
handle = lgt->maptrack_limit;
for ( i = 0; i < MAPTRACK_PER_PAGE; i++ )
{
BUILD_BUG_ON(sizeof(new_mt->ref) < sizeof(handle));
new_mt[i].ref = handle + i + 1;
new_mt[i].vcpu = curr->vcpu_id;
}
/* Set tail directly if this is the first page for the local vCPU. */
if ( curr->maptrack_tail == MAPTRACK_TAIL )
curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1;
lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt;
smp_wmb();
lgt->maptrack_limit += MAPTRACK_PER_PAGE;
spin_unlock(&lgt->maptrack_lock);
spin_lock(&curr->maptrack_freelist_lock);
new_mt[i - 1].ref = curr->maptrack_head;
curr->maptrack_head = handle + 1;
spin_unlock(&curr->maptrack_freelist_lock);
return handle;
}
/* Number of grant table entries. Caller must hold d's grant table lock. */
static unsigned int nr_grant_entries(struct grant_table *gt)
{
switch ( gt->gt_version )
{
#define f2e(nr, ver) (((nr) << PAGE_SHIFT) / sizeof(grant_entry_v##ver##_t))
case 1:
BUILD_BUG_ON(f2e(INITIAL_NR_GRANT_FRAMES, 1) <
GNTTAB_NR_RESERVED_ENTRIES);
/* Make sure we return a value independently of speculative execution */
block_speculation();
return f2e(nr_grant_frames(gt), 1);
case 2:
BUILD_BUG_ON(f2e(INITIAL_NR_GRANT_FRAMES, 2) <
GNTTAB_NR_RESERVED_ENTRIES);
/* Make sure we return a value independently of speculative execution */
block_speculation();
return f2e(nr_grant_frames(gt), 2);
#undef f2e
}
ASSERT_UNREACHABLE();
block_speculation();
return 0;
}
static int _set_status_v1(const grant_entry_header_t *shah,
struct domain *rd,
struct active_grant_entry *act,
int readonly,
int mapflag,
domid_t ldomid)
{
int rc = GNTST_okay;
uint32_t *raw_shah = (uint32_t *)shah;
union grant_combo scombo;
uint16_t mask = GTF_type_mask;
/*
* We bound the number of times we retry CMPXCHG on memory locations that
* we share with a guest OS. The reason is that the guest can modify that
* location at a higher rate than we can read-modify-CMPXCHG, so the guest
* could cause us to livelock. There are a few cases where it is valid for
* the guest to race our updates (e.g., to change the GTF_readonly flag),
* so we allow a few retries before failing.
*/
int retries = 0;
/* if this is a grant mapping operation we should ensure GTF_sub_page
is not set */
if ( mapflag )
mask |= GTF_sub_page;
scombo.raw = ACCESS_ONCE(*raw_shah);
/*
* This loop attempts to set the access (reading/writing) flags
* in the grant table entry. It tries a cmpxchg on the field
* up to five times, and then fails under the assumption that
* the guest is misbehaving.
*/
for ( ; ; )
{
union grant_combo prev, new;
/* If not already pinned, check the grant domid and type. */
if ( !act->pin && (((scombo.flags & mask) != GTF_permit_access) ||
(scombo.domid != ldomid)) )
PIN_FAIL(done, GNTST_general_error,
"Bad flags (%x) or dom (%d); expected d%d\n",
scombo.flags, scombo.domid, ldomid);
new = scombo;
new.flags |= GTF_reading;
if ( !readonly )
{
new.flags |= GTF_writing;
if ( unlikely(scombo.flags & GTF_readonly) )
PIN_FAIL(done, GNTST_general_error,
"Attempt to write-pin a r/o grant entry\n");
}
prev.raw = guest_cmpxchg(rd, raw_shah, scombo.raw, new.raw);
if ( likely(prev.raw == scombo.raw) )
break;
if ( retries++ == 4 )
PIN_FAIL(done, GNTST_general_error,
"Shared grant entry is unstable\n");
scombo = prev;
}
done:
return rc;
}
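/*
 * In the v2 ABI the GTF_reading/GTF_writing bits live in the separate status
 * array rather than in the shared flags word, so _set_status_v2() sets them
 * with plain stores instead of the cmpxchg loop used for v1, and then
 * re-reads the shared flags after a full barrier to catch a guest revoking
 * or narrowing the grant concurrently.
 */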
static int _set_status_v2(const grant_entry_header_t *shah,
grant_status_t *status,
struct domain *rd,
struct active_grant_entry *act,
int readonly,
int mapflag,
domid_t ldomid)
{
int rc = GNTST_okay;
uint32_t *raw_shah = (uint32_t *)shah;
union grant_combo scombo;
uint16_t mask = GTF_type_mask;
scombo.raw = ACCESS_ONCE(*raw_shah);
/* if this is a grant mapping operation we should ensure GTF_sub_page
is not set */
if ( mapflag )
mask |= GTF_sub_page;
/* If not already pinned, check the grant domid and type. */
if ( !act->pin &&
((((scombo.flags & mask) != GTF_permit_access) &&
(mapflag || ((scombo.flags & mask) != GTF_transitive))) ||
(scombo.domid != ldomid)) )
PIN_FAIL(done, GNTST_general_error,
"Bad flags (%x) or dom (%d); expected d%d, flags %x\n",
scombo.flags, scombo.domid, ldomid, mask);
if ( readonly )
{
*status |= GTF_reading;
}
else
{
if ( unlikely(scombo.flags & GTF_readonly) )
PIN_FAIL(done, GNTST_general_error,
"Attempt to write-pin a r/o grant entry\n");
*status |= GTF_reading | GTF_writing;
}
/* Make sure guest sees status update before checking if flags are
still valid */
smp_mb();
scombo.raw = ACCESS_ONCE(*raw_shah);
if ( !act->pin )
{
if ( (((scombo.flags & mask) != GTF_permit_access) &&
(mapflag || ((scombo.flags & mask) != GTF_transitive))) ||
(scombo.domid != ldomid) ||
(!readonly && (scombo.flags & GTF_readonly)) )
{
gnttab_clear_flags(rd, GTF_writing | GTF_reading, status);
PIN_FAIL(done, GNTST_general_error,
"Unstable flags (%x) or dom (%d); expected d%d (r/w: %d)\n",
scombo.flags, scombo.domid, ldomid, !readonly);
}
}
else
{
if ( unlikely(scombo.flags & GTF_readonly) )
{
gnttab_clear_flags(rd, GTF_writing, status);
PIN_FAIL(done, GNTST_general_error,
"Unstable grant readonly flag\n");
}
}
done:
return rc;
}
static int _set_status(const grant_entry_header_t *shah,
grant_status_t *status,
struct domain *rd,
unsigned int rgt_version,
struct active_grant_entry *act,
int readonly,
int mapflag,
domid_t ldomid)
{
if ( evaluate_nospec(rgt_version == 1) )
return _set_status_v1(shah, rd, act, readonly, mapflag, ldomid);
else
return _set_status_v2(shah, status, rd, act, readonly, mapflag, ldomid);
}
/*
* The status for a grant may indicate that we're taking more access than
* the pin requires. Reduce the status to match the pin. Called with the
* domain's grant table lock held at least in read mode and with the active
* entry lock held (iow act->pin can't change behind our backs).
*/
static void reduce_status_for_pin(struct domain *rd,
const struct active_grant_entry *act,
uint16_t *status, bool readonly)
{
unsigned int clear_flags = act->pin ? 0 : GTF_reading;
if ( !readonly && !(act->pin & (GNTPIN_hstw_mask | GNTPIN_devw_mask)) )
clear_flags |= GTF_writing;
if ( clear_flags )
gnttab_clear_flags(rd, clear_flags, status);
}
static struct active_grant_entry *grant_map_exists(const struct domain *ld,
struct grant_table *rgt,
mfn_t mfn,
grant_ref_t *cur_ref)
{
grant_ref_t ref, max_iter;
/*
* The remote grant table should be locked but the percpu rwlock
* cannot be checked for read lock without race conditions or high
* overhead so we cannot use an ASSERT
*
* ASSERT(rw_is_locked(&rgt->lock));
*/
max_iter = min(*cur_ref + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
nr_grant_entries(rgt));
for ( ref = *cur_ref; ref < max_iter; ref++ )
{
struct active_grant_entry *act = active_entry_acquire(rgt, ref);
if ( act->pin && act->domid == ld->domain_id &&
mfn_eq(act->mfn, mfn) )
return act;
active_entry_release(act);
}
if ( ref < nr_grant_entries(rgt) )
{
*cur_ref = ref;
return NULL;
}
return ERR_PTR(-EINVAL);
}
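/*
 * Per-MFN mapping counts kept in @maptrack_tree (keyed by MFN) for domains
 * needing IOMMU mappings: the rd/wr fields count read-only and writable
 * grant mappings of a frame, so map/unmap can decide when an IOMMU entry
 * has to be created, downgraded to read-only, or removed.
 */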
union maptrack_node {
struct {
/* Radix tree slot pointers use two of the bits. */
#ifdef __BIG_ENDIAN_BITFIELD
unsigned long _0 : 2;
#endif
unsigned long rd : BITS_PER_LONG / 2 - 1;
unsigned long wr : BITS_PER_LONG / 2 - 1;
#ifndef __BIG_ENDIAN_BITFIELD
unsigned long _0 : 2;
#endif
} cnt;
unsigned long raw;
};
static void
map_grant_ref(
struct gnttab_map_grant_ref *op)
{
struct domain *ld, *rd, *owner = NULL;
struct grant_table *lgt, *rgt;
grant_ref_t ref;
grant_handle_t handle;
mfn_t mfn;
struct page_info *pg = NULL;
int rc = GNTST_okay;
unsigned int cache_flags, refcnt = 0, typecnt = 0, pin_incr = 0;
bool host_map_created = false;
struct active_grant_entry *act = NULL;
struct grant_mapping *mt;
grant_entry_header_t *shah;
uint16_t *status;
ld = current->domain;
if ( op->flags & GNTMAP_device_map )
pin_incr += (op->flags & GNTMAP_readonly) ? GNTPIN_devr_inc
: GNTPIN_devw_inc;
if ( op->flags & GNTMAP_host_map )
pin_incr += (op->flags & GNTMAP_readonly) ? GNTPIN_hstr_inc
: GNTPIN_hstw_inc;
if ( unlikely(!pin_incr) )
{
gdprintk(XENLOG_INFO, "Bad flags in grant map op: %x\n", op->flags);
op->status = GNTST_bad_gntref;
return;
}
if ( unlikely(paging_mode_external(ld) &&
(op->flags & (GNTMAP_device_map|GNTMAP_application_map|
GNTMAP_contains_pte))) )
{
gdprintk(XENLOG_INFO, "No device mapping in HVM domain\n");
op->status = GNTST_general_error;
return;
}
if ( unlikely((rd = rcu_lock_domain_by_id(op->dom)) == NULL) )
{
gdprintk(XENLOG_INFO, "Could not find domain %d\n", op->dom);
op->status = GNTST_bad_domain;
return;
}
rc = xsm_grant_mapref(XSM_HOOK, ld, rd, op->flags);
if ( rc )
{
rcu_unlock_domain(rd);
op->status = GNTST_permission_denied;
return;
}
lgt = ld->grant_table;
handle = get_maptrack_handle(lgt);
if ( unlikely(handle == INVALID_MAPTRACK_HANDLE) )
{
rcu_unlock_domain(rd);
gdprintk(XENLOG_INFO, "Failed to obtain maptrack handle\n");
op->status = GNTST_no_space;
return;
}
rgt = rd->grant_table;
grant_read_lock(rgt);
/* Bounds check on the grant ref */
ref = op->ref;
if ( unlikely(ref >= nr_grant_entries(rgt)))
PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref %#x for d%d\n",
ref, rgt->domain->domain_id);
/* This call also ensures the above check cannot be passed speculatively */
shah = shared_entry_header(rgt, ref);
act = active_entry_acquire(rgt, ref);
/* If already pinned, check the active domid and avoid refcnt overflow. */
if ( act->pin &&
((act->domid != ld->domain_id) ||
(act->pin & GNTPIN_incr2oflow_mask(pin_incr)) ||
(act->is_sub_page)) )
PIN_FAIL(act_release_out, GNTST_general_error,
"Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n",
act->domid, ld->domain_id, act->pin, act->is_sub_page);
/* Make sure we do not access memory speculatively */
status = evaluate_nospec(rgt->gt_version == 1) ? &shah->flags
: &status_entry(rgt, ref);
if ( !act->pin ||
(!(op->flags & GNTMAP_readonly) &&
!(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask))) )
{
if ( (rc = _set_status(shah, status, rd, rgt->gt_version, act,
op->flags & GNTMAP_readonly, 1,
ld->domain_id)) != GNTST_okay )
goto act_release_out;
if ( !act->pin )
{
unsigned long gfn = evaluate_nospec(rgt->gt_version == 1) ?
shared_entry_v1(rgt, ref).frame :
shared_entry_v2(rgt, ref).full_page.frame;
rc = get_paged_frame(gfn, &mfn, &pg,
op->flags & GNTMAP_readonly, rd);
if ( rc != GNTST_okay )
goto unlock_out_clear;
act_set_gfn(act, _gfn(gfn));
act->domid = ld->domain_id;
act->mfn = mfn;
act->start = 0;
act->length = PAGE_SIZE;
act->is_sub_page = false;
act->src_domid = rd->domain_id;
act->trans_gref = ref;
}
}
act->pin += pin_incr;
mfn = act->mfn;
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
active_entry_release(act);
grant_read_unlock(rgt);
/* pg may be set, with a refcount included, from get_paged_frame(). */
if ( !pg )
{
pg = mfn_valid(mfn) ? mfn_to_page(mfn) : NULL;
if ( pg )
owner = page_get_owner_and_reference(pg);
}
else
owner = page_get_owner(pg);
if ( owner )
refcnt++;
if ( !pg || (owner == dom_io) )
{
/* Only needed the reference to confirm dom_io ownership. */
if ( pg )
{
put_page(pg);
refcnt--;
}
if ( paging_mode_external(ld) )
{
gdprintk(XENLOG_WARNING, "HVM guests can't grant map iomem\n");
rc = GNTST_general_error;
goto undo_out;
}
if ( !iomem_access_permitted(rd, mfn_x(mfn), mfn_x(mfn)) )
{
gdprintk(XENLOG_WARNING,
"Iomem mapping not permitted %#"PRI_mfn" (domain %d)\n",
mfn_x(mfn), rd->domain_id);
rc = GNTST_general_error;
goto undo_out;
}
if ( op->flags & GNTMAP_host_map )
{
rc = create_grant_host_mapping(op->host_addr, mfn, op->flags,
cache_flags);
if ( rc != GNTST_okay )
goto undo_out;
host_map_created = true;
}
}
else if ( owner == rd || (dom_cow && owner == dom_cow) )
{
if ( (op->flags & GNTMAP_device_map) && !(op->flags & GNTMAP_readonly) )
{
if ( (owner == dom_cow) ||
!get_page_type(pg, PGT_writable_page) )
goto could_not_pin;
typecnt++;
}
if ( op->flags & GNTMAP_host_map )
{
/*
* Only need to grab another reference if device_map claimed
* the other one.
*/
if ( op->flags & GNTMAP_device_map )
{
if ( !get_page(pg, rd) )
goto could_not_pin;
refcnt++;
}
if ( gnttab_host_mapping_get_page_type(op->flags & GNTMAP_readonly,
ld, rd) )
{
if ( (owner == dom_cow) ||
!get_page_type(pg, PGT_writable_page) )
goto could_not_pin;
typecnt++;
}
rc = create_grant_host_mapping(op->host_addr, mfn, op->flags, 0);
if ( rc != GNTST_okay )
goto undo_out;
host_map_created = true;
}
}
else
{
could_not_pin:
if ( !rd->is_dying )
gdprintk(XENLOG_WARNING, "Could not pin grant frame %#"PRI_mfn"\n",
mfn_x(mfn));
rc = GNTST_general_error;
goto undo_out;
}
/*
* This is deliberately not checking the page's owner: get_paged_frame()
* explicitly rejects foreign pages, and all success paths above yield
* either owner == rd or owner == dom_io (the dom_cow case is irrelevant
* as mem-sharing and IOMMU use are incompatible). The dom_io case would
* need checking separately if we compared against owner here.
*/
if ( ld != rd && gnttab_need_iommu_mapping(ld) )
{
union maptrack_node node = {
.cnt.rd = !!(op->flags & GNTMAP_readonly),
.cnt.wr = !(op->flags & GNTMAP_readonly),
};
int err;
void **slot = NULL;
unsigned int kind;
grant_write_lock(lgt);
err = radix_tree_insert(&lgt->maptrack_tree, mfn_x(mfn),
radix_tree_ulong_to_ptr(node.raw));
if ( err == -EEXIST )
{
slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(mfn));
if ( likely(slot) )
{
node.raw = radix_tree_ptr_to_ulong(*slot);
err = -EBUSY;
/* Update node only when refcount doesn't overflow. */
if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd
: ++node.cnt.wr )
{
radix_tree_replace_slot(slot,
radix_tree_ulong_to_ptr(node.raw));
err = 0;
}
}
else
ASSERT_UNREACHABLE();
}
/*
* We're not translated, so we know that dfns and mfns are
* the same things, so the IOMMU entry is always 1-to-1.
*/
if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 )
kind = IOMMUF_readable | IOMMUF_writable;
else if ( (op->flags & GNTMAP_readonly) &&
node.cnt.rd == 1 && !node.cnt.wr )
kind = IOMMUF_readable;
else
kind = 0;
if ( err ||
(kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind)) )
{
if ( !err )
{
if ( slot )
{
op->flags & GNTMAP_readonly ? node.cnt.rd--
: node.cnt.wr--;
radix_tree_replace_slot(slot,
radix_tree_ulong_to_ptr(node.raw));
}
else
radix_tree_delete(&lgt->maptrack_tree, mfn_x(mfn));
}
rc = GNTST_general_error;
}
grant_write_unlock(lgt);
if ( rc != GNTST_okay )
goto undo_out;
}
TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom);
/*
* All maptrack entry users check mt->flags first before using the
* other fields so just ensure the flags field is stored last.
*/
mt = &maptrack_entry(lgt, handle);
mt->domid = op->dom;
mt->ref = op->ref;
smp_wmb();
write_atomic(&mt->flags, op->flags);
op->dev_bus_addr = mfn_to_maddr(mfn);
op->handle = handle;
op->status = GNTST_okay;
rcu_unlock_domain(rd);
return;
undo_out:
if ( host_map_created )
{
replace_grant_host_mapping(op->host_addr, mfn, 0, op->flags);
gnttab_flush_tlb(ld);
}
while ( typecnt-- )
put_page_type(pg);
while ( refcnt-- )
put_page(pg);
grant_read_lock(rgt);
act = active_entry_acquire(rgt, op->ref);
act->pin -= pin_incr;
unlock_out_clear:
reduce_status_for_pin(rd, act, status, op->flags & GNTMAP_readonly);
act_release_out:
active_entry_release(act);
unlock_out:
grant_read_unlock(rgt);
op->status = rc;
put_maptrack_handle(lgt, handle);
rcu_unlock_domain(rd);
}
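/*
 * Process up to @count map requests. A positive return value is the number
 * of entries already handled when preemption was needed; the GNTTABOP
 * dispatcher uses it to create a hypercall continuation for the remainder.
 */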
static long
gnttab_map_grant_ref(
XEN_GUEST_HANDLE_PARAM(gnttab_map_grant_ref_t) uop, unsigned int count)
{
int i;
struct gnttab_map_grant_ref op;
for ( i = 0; i < count; i++ )
{
if ( i && hypercall_preempt_check() )
return i;
if ( unlikely(__copy_from_guest_offset(&op, uop, i, 1)) )
return -EFAULT;
map_grant_ref(&op);
if ( unlikely(__copy_to_guest_offset(uop, i, &op, 1)) )
return -EFAULT;
}
return 0;
}
static void
unmap_common(
struct gnttab_unmap_common *op)
{
domid_t dom;
struct domain *ld, *rd;
struct grant_table *lgt, *rgt;
grant_ref_t ref;
struct active_grant_entry *act;
s16 rc = 0;
struct grant_mapping *map;
unsigned int flags;
bool put_handle = false;
ld = current->domain;
lgt = ld->grant_table;
if ( unlikely(op->handle >= lgt->maptrack_limit) )
{
gdprintk(XENLOG_INFO, "Bad d%d handle %#x\n",
lgt->domain->domain_id, op->handle);
op->status = GNTST_bad_handle;
return;
}
smp_rmb();
map = &maptrack_entry(lgt, op->handle);
if ( unlikely(!read_atomic(&map->flags)) )
{
gdprintk(XENLOG_INFO, "Zero flags for d%d handle %#x\n",
lgt->domain->domain_id, op->handle);
op->status = GNTST_bad_handle;
return;
}
dom = map->domid;
if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
{
/* This can happen when a grant is implicitly unmapped. */
gdprintk(XENLOG_INFO, "Could not find domain %d\n", dom);
domain_crash(ld); /* naughty... */
return;
}
rc = xsm_grant_unmapref(XSM_HOOK, ld, rd);
if ( rc )
{
rcu_unlock_domain(rd);
op->status = GNTST_permission_denied;
return;
}
TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom);
rgt = rd->grant_table;
grant_read_lock(rgt);
op->rd = rd;
op->ref = map->ref;
ref = map->ref;
/*
* We can't assume there was no racing unmap for this maptrack entry,
* and hence we can't assume map->ref is valid for rd. While the checks
* below (with the active entry lock held) will reject any such racing
* requests, we still need to make sure we don't attempt to acquire an
* invalid lock.
*/
smp_rmb();
if ( unlikely(ref >= nr_grant_entries(rgt)) )
{
gdprintk(XENLOG_WARNING, "Unstable d%d handle %#x\n",
rgt->domain->domain_id, op->handle);
rc = GNTST_bad_handle;
flags = 0;
goto unlock_out;
}
/* Make sure the above bound check cannot be bypassed speculatively */
block_speculation();
act = active_entry_acquire(rgt, ref);
/*
* Note that we (ab)use the active entry lock here to protect against
* multiple unmaps of the same mapping here. We don't want to hold lgt's
* lock, and we only hold rgt's lock for reading (but the latter wouldn't
* be the right one anyway). Hence the easiest is to rely on a lock we
* hold anyway; see docs/misc/grant-tables.txt's "Locking" section.
*/
flags = read_atomic(&map->flags);
smp_rmb();
if ( unlikely(!flags) || unlikely(map->domid != dom) ||
unlikely(map->ref != ref) )
{
gdprintk(XENLOG_WARNING, "Unstable handle %#x\n", op->handle);
rc = GNTST_bad_handle;
goto act_release_out;
}
op->mfn = act->mfn;
if ( op->dev_bus_addr && (flags & GNTMAP_device_map) &&
unlikely(op->dev_bus_addr != mfn_to_maddr(act->mfn)) )
PIN_FAIL(act_release_out, GNTST_bad_dev_addr,
"Bus address doesn't match gntref (%"PRIx64" != %"PRIpaddr")\n",
op->dev_bus_addr, mfn_to_maddr(act->mfn));
if ( op->host_addr && (flags & GNTMAP_host_map) )
{
if ( (rc = replace_grant_host_mapping(op->host_addr,
op->mfn, op->new_addr,
flags)) < 0 )
goto act_release_out;
map->flags &= ~GNTMAP_host_map;
op->done |= GNTMAP_host_map | (flags & GNTMAP_readonly);
}
if ( op->dev_bus_addr && (flags & GNTMAP_device_map) )
{
map->flags &= ~GNTMAP_device_map;
op->done |= GNTMAP_device_map | (flags & GNTMAP_readonly);
}
if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) )
{
map->flags = 0;
put_handle = true;
}
act_release_out:
active_entry_release(act);
unlock_out:
grant_read_unlock(rgt);
if ( put_handle )
put_maptrack_handle(lgt, op->handle);
/*
* map_grant_ref() will only increment the refcount (and update the
* IOMMU) once per mapping. So we only want to decrement it once the
* maptrack handle has been put, alongside the further IOMMU update.
*
* For the second and third check, see the respective comment in
* map_grant_ref().
*/
if ( put_handle && ld != rd && gnttab_need_iommu_mapping(ld) )
{
void **slot;
union maptrack_node node;
int err = 0;
grant_write_lock(lgt);
slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(op->mfn));
node.raw = likely(slot) ? radix_tree_ptr_to_ulong(*slot) : 0;
/* Refcount must not underflow. */
if ( !(flags & GNTMAP_readonly ? node.cnt.rd--
: node.cnt.wr--) )
BUG();
if ( !node.raw )
err = iommu_legacy_unmap(ld, _dfn(mfn_x(op->mfn)), 1);
else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr )
err = iommu_legacy_map(ld, _dfn(mfn_x(op->mfn)), op->mfn, 1,
IOMMUF_readable);
if ( err )
;
else if ( !node.raw )
radix_tree_delete(&lgt->maptrack_tree, mfn_x(op->mfn));
else
radix_tree_replace_slot(slot,
radix_tree_ulong_to_ptr(node.raw));
grant_write_unlock(lgt);
if ( err )
rc = GNTST_general_error;
}
/* If just unmapped a writable mapping, mark as dirtied */
if ( rc == GNTST_okay && !(flags & GNTMAP_readonly) )
gnttab_mark_dirty(rd, op->mfn);
op->status = rc;
rcu_unlock_domain(rd);
}
static void
unmap_common_complete(struct gnttab_unmap_common *op)
{
struct domain *ld, *rd = op->rd;
struct grant_table *rgt;
struct active_grant_entry *act;
grant_entry_header_t *sha;
struct page_info *pg;
uint16_t *status;
if ( evaluate_nospec(!op->done) )
{
/* unmap_common() didn't do anything - nothing to complete. */
return;
}
ld = current->domain;
rcu_lock_domain(rd);
rgt = rd->grant_table;
grant_read_lock(rgt);
act = active_entry_acquire(rgt, op->ref);
sha = shared_entry_header(rgt, op->ref);
if ( evaluate_nospec(rgt->gt_version == 1) )
status = &sha->flags;
else
status = &status_entry(rgt, op->ref);
pg = !is_iomem_page(act->mfn) ? mfn_to_page(op->mfn) : NULL;
if ( op->done & GNTMAP_device_map )
{
if ( pg )
{
if ( op->done & GNTMAP_readonly )
put_page(pg);
else
put_page_and_type(pg);
}
ASSERT(act->pin & (GNTPIN_devw_mask | GNTPIN_devr_mask));
if ( op->done & GNTMAP_readonly )
act->pin -= GNTPIN_devr_inc;
else
act->pin -= GNTPIN_devw_inc;
}
if ( op->done & GNTMAP_host_map )
{
if ( pg )
{
if ( gnttab_host_mapping_get_page_type(op->done & GNTMAP_readonly,
ld, rd) )
put_page_type(pg);
put_page(pg);
}
ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask));
if ( op->done & GNTMAP_readonly )
act->pin -= GNTPIN_hstr_inc;
else
act->pin -= GNTPIN_hstw_inc;
}
reduce_status_for_pin(rd, act, status, op->done & GNTMAP_readonly);
active_entry_release(act);
grant_read_unlock(rgt);
rcu_unlock_domain(rd);
}
static void
unmap_grant_ref(
struct gnttab_unmap_grant_ref *op,
struct gnttab_unmap_common *common)
{
common->host_addr = op->host_addr;
common->dev_bus_addr = op->dev_bus_addr;
common->handle = op->handle;
    /* Initialise these in case common contains old state */
common->done = 0;
common->new_addr = 0;
common->rd = NULL;
common->mfn = INVALID_MFN;
unmap_common(common);
op->status = common->status;
}
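/*
 * Unmapping is split into two phases: unmap_common() undoes the maptrack,
 * page-table and IOMMU state for each request; unmap_common_complete() then
 * drops the page references, but only after a single TLB flush covering the
 * whole batch, so no stale translation can still reach a frame whose
 * references are being released.
 */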
static long
gnttab_unmap_grant_ref(
XEN_GUEST_HANDLE_PARAM(gnttab_unmap_grant_ref_t) uop, unsigned int count)
{
int i, c, partial_done, done = 0;
struct gnttab_unmap_grant_ref op;
struct gnttab_unmap_common common[GNTTAB_UNMAP_BATCH_SIZE];
while ( count != 0 )
{
c = min(count, (unsigned int)GNTTAB_UNMAP_BATCH_SIZE);
partial_done = 0;
for ( i = 0; i < c; i++ )
{
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
goto fault;
unmap_grant_ref(&op, &common[i]);
++partial_done;
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
goto fault;
guest_handle_add_offset(uop, 1);
}
gnttab_flush_tlb(current->domain);
for ( i = 0; i < partial_done; i++ )
unmap_common_complete(&common[i]);
count -= c;
done += c;
if ( count && hypercall_preempt_check() )
return done;
}
return 0;
fault:
gnttab_flush_tlb(current->domain);
for ( i = 0; i < partial_done; i++ )
unmap_common_complete(&common[i]);
return -EFAULT;
}
static void
unmap_and_replace(
struct gnttab_unmap_and_replace *op,
struct gnttab_unmap_common *common)
{
common->host_addr = op->host_addr;
common->new_addr = op->new_addr;
common->handle = op->handle;
    /* Initialise these in case common contains old state */
common->done = 0;
common->dev_bus_addr = 0;
common->rd = NULL;
common->mfn = INVALID_MFN;
unmap_common(common);
op->status = common->status;
}
static long
gnttab_unmap_and_replace(
XEN_GUEST_HANDLE_PARAM(gnttab_unmap_and_replace_t) uop, unsigned int count)
{
int i, c, partial_done, done = 0;
struct gnttab_unmap_and_replace op;
struct gnttab_unmap_common common[GNTTAB_UNMAP_BATCH_SIZE];
while ( count != 0 )
{
c = min(count, (unsigned int)GNTTAB_UNMAP_BATCH_SIZE);
partial_done = 0;
for ( i = 0; i < c; i++ )
{
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
goto fault;
unmap_and_replace(&op, &common[i]);
++partial_done;
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
goto fault;
guest_handle_add_offset(uop, 1);
}
gnttab_flush_tlb(current->domain);
for ( i = 0; i < partial_done; i++ )
unmap_common_complete(&common[i]);
count -= c;
done += c;
if ( count && hypercall_preempt_check() )
return done;
}
return 0;
fault:
gnttab_flush_tlb(current->domain);
for ( i = 0; i < partial_done; i++ )
unmap_common_complete(&common[i]);
return -EFAULT;
}
static int
gnttab_populate_status_frames(struct domain *d, struct grant_table *gt,
unsigned int req_nr_frames)
{
unsigned int i;
unsigned int req_status_frames;
req_status_frames = grant_to_status_frames(req_nr_frames);
    /* Make sure prior version checks are architecturally visible */
block_speculation();
for ( i = nr_status_frames(gt); i < req_status_frames; i++ )
{
if ( (gt->status[i] = alloc_xenheap_page()) == NULL )
goto status_alloc_failed;
clear_page(gt->status[i]);
}
/* Share the new status frames with the recipient domain */
for ( i = nr_status_frames(gt); i < req_status_frames; i++ )
share_xen_page_with_guest(virt_to_page(gt->status[i]), d, SHARE_rw);
gt->nr_status_frames = req_status_frames;
return 0;
status_alloc_failed:
for ( i = nr_status_frames(gt); i < req_status_frames; i++ )
{
free_xenheap_page(gt->status[i]);
gt->status[i] = NULL;
}
return -ENOMEM;
}
static int
gnttab_unpopulate_status_frames(struct domain *d, struct grant_table *gt)
{
unsigned int i;
    /* Make sure prior version checks are architecturally visible */
block_speculation();
for ( i = 0; i < nr_status_frames(gt); i++ )
{
struct page_info *pg = virt_to_page(gt->status[i]);
gfn_t gfn = gnttab_get_frame_gfn(gt, true, i);
/*
* For translated domains, recovering from failure after partial
* changes were made is more complicated than it seems worth
* implementing at this time. Hence respective error paths below
* crash the domain in such a case.
*/
if ( paging_mode_translate(d) )
{
int rc = gfn_eq(gfn, INVALID_GFN)
? 0
: gnttab_set_frame_gfn(gt, true, i, INVALID_GFN,
page_to_mfn(pg));
if ( rc )
{
gprintk(XENLOG_ERR,
"Could not remove status frame %u (GFN %#lx) from P2M\n",
i, gfn_x(gfn));
domain_crash(d);
return rc;
}
}
BUG_ON(page_get_owner(pg) != d);
put_page_alloc_ref(pg);
if ( pg->count_info & ~PGC_xen_heap )
{
if ( paging_mode_translate(d) )
{
gprintk(XENLOG_ERR,
"Wrong page state %#lx of status frame %u (GFN %#lx)\n",
pg->count_info, i, gfn_x(gfn));
domain_crash(d);
}
else
{
if ( get_page(pg, d) )
set_bit(_PGC_allocated, &pg->count_info);
while ( i-- )
share_xen_page_with_guest(virt_to_page(gt->status[i]),
d, SHARE_rw);
}
return -EBUSY;
}
page_set_owner(pg, NULL);
}
for ( i = 0; i < nr_status_frames(gt); i++ )
{
free_xenheap_page(gt->status[i]);
gt->status[i] = NULL;
}
gt->nr_status_frames = 0;
return 0;
}
/*
* Grow the grant table. The caller must hold the grant table's
* write lock before calling this function.
*/
static int
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
{
struct grant_table *gt = d->grant_table;
unsigned int i, j;
if ( req_nr_frames < INITIAL_NR_GRANT_FRAMES )
req_nr_frames = INITIAL_NR_GRANT_FRAMES;
ASSERT(req_nr_frames <= gt->max_grant_frames);
if ( req_nr_frames > INITIAL_NR_GRANT_FRAMES )
gdprintk(XENLOG_INFO,
"Expanding d%d grant table from %u to %u frames\n",
d->domain_id, nr_grant_frames(gt), req_nr_frames);
/* Active */
for ( i = nr_active_grant_frames(gt);
i < num_act_frames_from_sha_frames(req_nr_frames); i++ )
{
if ( (gt->active[i] = alloc_xenheap_page()) == NULL )
goto active_alloc_failed;
clear_page(gt->active[i]);
for ( j = 0; j < ACGNT_PER_PAGE; j++ )
            spin_lock_init(&gt->active[i][j].lock);
}
/* Shared */
for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ )
{
if ( (gt->shared_raw[i] = alloc_xenheap_page()) == NULL )
goto shared_alloc_failed;
clear_page(gt->shared_raw[i]);
}
/* Status pages - version 2 */
if ( evaluate_nospec(gt->gt_version > 1) )
{
if ( gnttab_populate_status_frames(d, gt, req_nr_frames) )
goto shared_alloc_failed;
}
/* Share the new shared frames with the recipient domain */
for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ )
share_xen_page_with_guest(virt_to_page(gt->shared_raw[i]), d, SHARE_rw);
gt->nr_grant_frames = req_nr_frames;
return 0;
shared_alloc_failed:
for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ )
{
free_xenheap_page(gt->shared_raw[i]);
gt->shared_raw[i] = NULL;
}
active_alloc_failed:
for ( i = nr_active_grant_frames(gt);
i < num_act_frames_from_sha_frames(req_nr_frames); i++ )
{
free_xenheap_page(gt->active[i]);
gt->active[i] = NULL;
}
gdprintk(XENLOG_INFO, "Allocation failure when expanding d%d grant table\n",
d->domain_id);
return -ENOMEM;
}
int grant_table_init(struct domain *d, int max_grant_frames,
int max_maptrack_frames, unsigned int options)
{
struct grant_table *gt;
unsigned int max_grant_version = options & XEN_DOMCTL_GRANT_version_mask;
int ret = -ENOMEM;
if ( !max_grant_version )
{
dprintk(XENLOG_INFO, "%pd: invalid grant table version 0 requested\n",
d);
return -EINVAL;
}
if ( max_grant_version > opt_gnttab_max_version )
{
dprintk(XENLOG_INFO,
"%pd: requested grant version (%u) greater than supported (%u)\n",
d, max_grant_version, opt_gnttab_max_version);
return -EINVAL;
}
/* Apply defaults if no value was specified */
if ( max_grant_frames < 0 )
max_grant_frames = opt_max_grant_frames;
if ( max_maptrack_frames < 0 )
max_maptrack_frames = opt_max_maptrack_frames;
if ( max_grant_frames < INITIAL_NR_GRANT_FRAMES )
{
dprintk(XENLOG_INFO, "Bad grant table size: %u frames\n", max_grant_frames);
return -EINVAL;
}
if ( (gt = xzalloc(struct grant_table)) == NULL )
return -ENOMEM;
/* Simple stuff. */
    percpu_rwlock_resource_init(&gt->lock, grant_rwlock);
    spin_lock_init(&gt->maptrack_lock);
gt->gt_version = 1;
gt->max_grant_frames = max_grant_frames;
gt->max_maptrack_frames = max_maptrack_frames;
gt->max_version = max_grant_version;
/* Install the structure early to simplify the error path. */
gt->domain = d;
d->grant_table = gt;
/* Active grant table. */
gt->active = xzalloc_array(struct active_grant_entry *,
max_nr_active_grant_frames(gt));
if ( gt->active == NULL )
goto out;
/* Tracking of mapped foreign frames table */
if ( gt->max_maptrack_frames )
{
gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack));
if ( gt->maptrack == NULL )
goto out;
        radix_tree_init(&gt->maptrack_tree);
}
/* Shared grant table. */
gt->shared_raw = xzalloc_array(void *, gt->max_grant_frames);
if ( gt->shared_raw == NULL )
goto out;
/* Status pages for grant table - for version 2 */
gt->status = xzalloc_array(grant_status_t *,
grant_to_status_frames(gt->max_grant_frames));
if ( gt->status == NULL )
goto out;
grant_write_lock(gt);
/* gnttab_grow_table() allocates a min number of frames, so 0 is okay. */
ret = gnttab_grow_table(d, 0);
grant_write_unlock(gt);
out:
if ( ret )
grant_table_destroy(d);
return ret;
}
static long
gnttab_setup_table(
XEN_GUEST_HANDLE_PARAM(gnttab_setup_table_t) uop, unsigned int count,
unsigned int limit_max)
{
struct vcpu *curr = current;
struct gnttab_setup_table op;
struct domain *d = NULL;
struct grant_table *gt;
unsigned int i;
if ( count != 1 )
return -EINVAL;
if ( unlikely(copy_from_guest(&op, uop, 1)) )
return -EFAULT;
if ( !guest_handle_okay(op.frame_list, op.nr_frames) )
return -EFAULT;
d = rcu_lock_domain_by_any_id(op.dom);
if ( d == NULL )
{
op.status = GNTST_bad_domain;
goto out;
}
if ( xsm_grant_setup(XSM_TARGET, curr->domain, d) )
{
op.status = GNTST_permission_denied;
goto out;
}
gt = d->grant_table;
grant_write_lock(gt);
if ( unlikely(op.nr_frames > gt->max_grant_frames) )
{
gdprintk(XENLOG_INFO, "d%d is limited to %u grant-table frames\n",
d->domain_id, gt->max_grant_frames);
op.status = GNTST_general_error;
goto unlock;
}
if ( unlikely(limit_max < op.nr_frames) )
{
gdprintk(XENLOG_WARNING, "nr_frames for d%d is too large (%u,%u)\n",
d->domain_id, op.nr_frames, limit_max);
op.status = GNTST_general_error;
goto unlock;
}
if ( (op.nr_frames > nr_grant_frames(gt) ||
((gt->gt_version > 1) &&
(grant_to_status_frames(op.nr_frames) > nr_status_frames(gt)))) &&
gnttab_grow_table(d, op.nr_frames) )
{
gdprintk(XENLOG_INFO,
"Expand grant table of d%d to %u failed. Current: %u Max: %u\n",
d->domain_id, op.nr_frames, nr_grant_frames(gt),
gt->max_grant_frames);
op.status = GNTST_general_error;
goto unlock;
}
op.status = GNTST_okay;
for ( i = 0; i < op.nr_frames; i++ )
{
xen_pfn_t gmfn = gfn_x(gnttab_shared_gfn(d, gt, i));
/* Grant tables cannot be shared */
BUG_ON(SHARED_M2P(gmfn));
if ( __copy_to_guest_offset(op.frame_list, i, &gmfn, 1) )
{
op.status = GNTST_bad_virt_addr;
break;
}
}
unlock:
grant_write_unlock(gt);
out:
if ( d )
rcu_unlock_domain(d);
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
return -EFAULT;
return 0;
}
static long
gnttab_query_size(
XEN_GUEST_HANDLE_PARAM(gnttab_query_size_t) uop, unsigned int count)
{
struct gnttab_query_size op;
struct domain *d;
struct grant_table *gt;
if ( count != 1 )
return -EINVAL;
if ( unlikely(copy_from_guest(&op, uop, 1)) )
return -EFAULT;
d = rcu_lock_domain_by_any_id(op.dom);
if ( d == NULL )
{
op.status = GNTST_bad_domain;
goto out;
}
if ( xsm_grant_query_size(XSM_TARGET, current->domain, d) )
{
op.status = GNTST_permission_denied;
goto out;
}
gt = d->grant_table;
grant_read_lock(gt);
op.nr_frames = nr_grant_frames(gt);
op.max_nr_frames = gt->max_grant_frames;
op.status = GNTST_okay;
grant_read_unlock(gt);
out:
if ( d )
rcu_unlock_domain(d);
if ( unlikely(__copy_to_guest(uop, &op, 1)) )
return -EFAULT;
return 0;
}
/*
* Check that the given grant reference (rd,ref) allows 'ld' to transfer
* ownership of a page frame. If so, lock down the grant entry.
*/
static int
gnttab_prepare_for_transfer(
struct domain *rd, struct domain *ld, grant_ref_t ref)
{
struct grant_table *rgt = rd->grant_table;
uint32_t *raw_shah;
union grant_combo scombo;
int retries = 0;
grant_read_lock(rgt);
if ( unlikely(ref >= nr_grant_entries(rgt)) )
{
gdprintk(XENLOG_INFO,
"Bad grant reference %#x for transfer to d%d\n",
ref, rd->domain_id);
goto fail;
}
/* This call also ensures the above check cannot be passed speculatively */
raw_shah = (uint32_t *)shared_entry_header(rgt, ref);
scombo.raw = ACCESS_ONCE(*raw_shah);
for ( ; ; )
{
union grant_combo prev, new;
if ( unlikely(scombo.flags != GTF_accept_transfer) ||
unlikely(scombo.domid != ld->domain_id) )
{
gdprintk(XENLOG_INFO,
"Bad flags (%x) or dom (%d); expected d%d\n",
scombo.flags, scombo.domid, ld->domain_id);
goto fail;
}
new = scombo;
new.flags |= GTF_transfer_committed;
prev.raw = guest_cmpxchg(rd, raw_shah, scombo.raw, new.raw);
if ( likely(prev.raw == scombo.raw) )
break;
if ( retries++ == 4 )
{
gdprintk(XENLOG_WARNING, "Shared grant entry is unstable\n");
goto fail;
}
scombo = prev;
}
grant_read_unlock(rgt);
return 1;
fail:
grant_read_unlock(rgt);
return 0;
}
static long
gnttab_transfer(
XEN_GUEST_HANDLE_PARAM(gnttab_transfer_t) uop, unsigned int count)
{
struct domain *d = current->domain;
struct domain *e;
struct page_info *page;
int i;
struct gnttab_transfer gop;
mfn_t mfn;
unsigned int max_bitsize;
struct active_grant_entry *act;
if ( !opt_grant_transfer )
return -EOPNOTSUPP;
for ( i = 0; i < count; i++ )
{
bool_t okay;
int rc;
if ( i && hypercall_preempt_check() )
return i;
/* Read from caller address space. */
if ( unlikely(__copy_from_guest(&gop, uop, 1)) )
{
gdprintk(XENLOG_INFO, "error reading req %d/%u\n",
i, count);
return -EFAULT;
}
#ifdef CONFIG_X86
{
p2m_type_t p2mt;
mfn = get_gfn_unshare(d, gop.mfn, &p2mt);
if ( p2m_is_shared(p2mt) || !p2m_is_valid(p2mt) )
mfn = INVALID_MFN;
}
#else
mfn = gfn_to_mfn(d, _gfn(gop.mfn));
#endif
/* Check the passed page frame for basic validity. */
if ( unlikely(!mfn_valid(mfn)) )
{
#ifdef CONFIG_X86
put_gfn(d, gop.mfn);
#endif
gdprintk(XENLOG_INFO, "out-of-range %lx\n", (unsigned long)gop.mfn);
gop.status = GNTST_bad_page;
goto copyback;
}
page = mfn_to_page(mfn);
if ( (rc = steal_page(d, page, 0)) < 0 )
{
#ifdef CONFIG_X86
put_gfn(d, gop.mfn);
#endif
gop.status = rc == -EINVAL ? GNTST_bad_page : GNTST_general_error;
goto copyback;
}
rc = guest_physmap_remove_page(d, _gfn(gop.mfn), mfn, 0);
gnttab_flush_tlb(d);
if ( rc )
{
gdprintk(XENLOG_INFO,
"can't remove GFN %"PRI_xen_pfn" (MFN %#"PRI_mfn")\n",
gop.mfn, mfn_x(mfn));
gop.status = GNTST_general_error;
goto put_gfn_and_copyback;
}
/* Find the target domain. */
if ( unlikely((e = rcu_lock_domain_by_id(gop.domid)) == NULL) )
{
gdprintk(XENLOG_INFO, "can't find d%d\n", gop.domid);
gop.status = GNTST_bad_domain;
goto put_gfn_and_copyback;
}
if ( xsm_grant_transfer(XSM_HOOK, d, e) )
{
gop.status = GNTST_permission_denied;
unlock_and_copyback:
rcu_unlock_domain(e);
put_gfn_and_copyback:
#ifdef CONFIG_X86
put_gfn(d, gop.mfn);
#endif
/* The count_info has already been cleaned */
free_domheap_page(page);
goto copyback;
}
max_bitsize = domain_clamp_alloc_bitsize(
e, e->grant_table->gt_version > 1 || paging_mode_translate(e)
? BITS_PER_LONG + PAGE_SHIFT : 32 + PAGE_SHIFT);
if ( max_bitsize < BITS_PER_LONG + PAGE_SHIFT &&
(mfn_x(mfn) >> (max_bitsize - PAGE_SHIFT)) )
{
struct page_info *new_page;
new_page = alloc_domheap_page(e, MEMF_no_owner |
MEMF_bits(max_bitsize));
if ( new_page == NULL )
{
gop.status = GNTST_address_too_big;
goto unlock_and_copyback;
}
copy_domain_page(page_to_mfn(new_page), mfn);
/* The count_info has already been cleared */
free_domheap_page(page);
page = new_page;
mfn = page_to_mfn(page);
}
spin_lock(&e->page_alloc_lock);
/*
* Check that 'e' will accept the page and has reservation
* headroom. Also, a domain mustn't have PGC_allocated
* pages when it is dying.
*/
if ( unlikely(e->is_dying) ||
unlikely(domain_tot_pages(e) >= e->max_pages) ||
unlikely(!(e->tot_pages + 1)) )
{
spin_unlock(&e->page_alloc_lock);
if ( e->is_dying )
gdprintk(XENLOG_INFO, "Transferee d%d is dying\n",
e->domain_id);
else
gdprintk(XENLOG_INFO,
"Transferee %pd has no headroom (tot %u, max %u, ex %u)\n",
e, domain_tot_pages(e), e->max_pages, e->extra_pages);
gop.status = GNTST_general_error;
goto unlock_and_copyback;
}
/* Okay, add the page to 'e'. */
if ( unlikely(domain_adjust_tot_pages(e, 1) == 1) )
get_knownalive_domain(e);
/*
* We must drop the lock to avoid a possible deadlock in
* gnttab_prepare_for_transfer. We have reserved a page in e so can
 * safely drop the lock and re-acquire it later to add the page to the
* pagelist.
*/
spin_unlock(&e->page_alloc_lock);
okay = gnttab_prepare_for_transfer(e, d, gop.ref);
/*
* Make sure the reference bound check in gnttab_prepare_for_transfer
* is respected and speculative execution is blocked accordingly
*/
if ( unlikely(!evaluate_nospec(okay)) ||
unlikely(assign_pages(page, 1, e, MEMF_no_refcount)) )
{
bool drop_dom_ref;
/*
* Need to grab this again to safely free our "reserved"
* page in the page total
*/
spin_lock(&e->page_alloc_lock);
drop_dom_ref = !domain_adjust_tot_pages(e, -1);
spin_unlock(&e->page_alloc_lock);
if ( okay /* i.e. e->is_dying due to the surrounding if() */ )
gdprintk(XENLOG_INFO, "Transferee d%d is now dying\n",
e->domain_id);
if ( drop_dom_ref )
put_domain(e);
gop.status = GNTST_general_error;
goto unlock_and_copyback;
}
#ifdef CONFIG_X86
put_gfn(d, gop.mfn);
#endif
TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
/* Tell the guest about its new page frame. */
grant_read_lock(e->grant_table);
act = active_entry_acquire(e->grant_table, gop.ref);
if ( evaluate_nospec(e->grant_table->gt_version == 1) )
{
grant_entry_v1_t *sha = &shared_entry_v1(e->grant_table, gop.ref);
rc = guest_physmap_add_page(e, _gfn(sha->frame), mfn, 0);
if ( !paging_mode_translate(e) )
sha->frame = mfn_x(mfn);
}
else
{
grant_entry_v2_t *sha = &shared_entry_v2(e->grant_table, gop.ref);
rc = guest_physmap_add_page(e, _gfn(sha->full_page.frame), mfn, 0);
if ( !paging_mode_translate(e) )
sha->full_page.frame = mfn_x(mfn);
}
smp_wmb();
shared_entry_header(e->grant_table, gop.ref)->flags |=
GTF_transfer_completed;
active_entry_release(act);
grant_read_unlock(e->grant_table);
rcu_unlock_domain(e);
gop.status = rc ? GNTST_general_error : GNTST_okay;
copyback:
if ( unlikely(__copy_field_to_guest(uop, &gop, status)) )
{
gdprintk(XENLOG_INFO, "error writing resp %d/%u\n", i, count);
return -EFAULT;
}
guest_handle_add_offset(uop, 1);
}
return 0;
}
/*
* Undo acquire_grant_for_copy(). This has no effect on page type and
* reference counts.
*/
static void
release_grant_for_copy(
struct domain *rd, grant_ref_t gref, bool readonly)
{
struct grant_table *rgt = rd->grant_table;
grant_entry_header_t *sha;
struct active_grant_entry *act;
mfn_t mfn;
uint16_t *status;
grant_ref_t trans_gref;
struct domain *td;
grant_read_lock(rgt);
act = active_entry_acquire(rgt, gref);
sha = shared_entry_header(rgt, gref);
mfn = act->mfn;
if ( evaluate_nospec(rgt->gt_version == 1) )
{
status = &sha->flags;
td = rd;
trans_gref = gref;
}
else
{
status = &status_entry(rgt, gref);
td = (act->src_domid == rd->domain_id)
? rd : knownalive_domain_from_domid(act->src_domid);
trans_gref = act->trans_gref;
}
if ( readonly )
{
act->pin -= GNTPIN_hstr_inc;
}
else
{
gnttab_mark_dirty(rd, mfn);
act->pin -= GNTPIN_hstw_inc;
}
reduce_status_for_pin(rd, act, status, readonly);
active_entry_release(act);
grant_read_unlock(rgt);
if ( td != rd )
{
/*
* Recursive call, but it is bounded (acquire permits only a single
* level of transitivity), so it's okay.
*/
release_grant_for_copy(td, trans_gref, readonly);
rcu_unlock_domain(td);
}
}
/*
* Grab a machine frame number from a grant entry and update the flags
* and pin count as appropriate. If rc == GNTST_okay, note that this *does*
* take one ref count on the target page, stored in *page.
* If there is any error, *page = NULL, no ref taken.
*/
static int
acquire_grant_for_copy(
struct domain *rd, grant_ref_t gref, domid_t ldom, bool readonly,
mfn_t *mfn, struct page_info **page, uint16_t *page_off,
uint16_t *length, bool allow_transitive)
{
struct grant_table *rgt = rd->grant_table;
grant_entry_v2_t *sha2;
grant_entry_header_t *shah;
struct active_grant_entry *act;
grant_status_t *status;
uint32_t old_pin;
domid_t trans_domid;
grant_ref_t trans_gref;
struct domain *td;
mfn_t grant_mfn;
uint16_t trans_page_off;
uint16_t trans_length;
bool is_sub_page;
s16 rc = GNTST_okay;
unsigned int pin_incr = readonly ? GNTPIN_hstr_inc : GNTPIN_hstw_inc;
*page = NULL;
grant_read_lock(rgt);
if ( unlikely(gref >= nr_grant_entries(rgt)) )
PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
"Bad grant reference %#x\n", gref);
/* This call also ensures the above check cannot be passed speculatively */
shah = shared_entry_header(rgt, gref);
act = active_entry_acquire(rgt, gref);
/* If already pinned, check the active domid and avoid refcnt overflow. */
if ( act->pin &&
((act->domid != ldom) ||
(act->pin & GNTPIN_incr2oflow_mask(pin_incr))) )
PIN_FAIL(unlock_out, GNTST_general_error,
"Bad domain (%d != %d), or risk of counter overflow %08x\n",
act->domid, ldom, act->pin);
if ( evaluate_nospec(rgt->gt_version == 1) )
{
sha2 = NULL;
status = &shah->flags;
}
else
{
sha2 = &shared_entry_v2(rgt, gref);
status = &status_entry(rgt, gref);
}
old_pin = act->pin;
if ( sha2 && (shah->flags & GTF_type_mask) == GTF_transitive )
{
if ( (!old_pin || (!readonly &&
!(old_pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)))) &&
(rc = _set_status_v2(shah, status, rd, act, readonly, 0,
ldom)) != GNTST_okay )
goto unlock_out;
if ( !allow_transitive )
PIN_FAIL(unlock_out_clear, GNTST_general_error,
"transitive grant when transitivity not allowed\n");
trans_domid = sha2->transitive.trans_domid;
trans_gref = sha2->transitive.gref;
barrier(); /* Stop the compiler from re-loading
trans_domid from shared memory */
if ( trans_domid == rd->domain_id )
PIN_FAIL(unlock_out_clear, GNTST_general_error,
"transitive grants cannot be self-referential\n");
/*
* We allow the trans_domid == ldom case, which corresponds to a
* grant being issued by one domain, sent to another one, and then
* transitively granted back to the original domain. Allowing it
* is easy, and means that you don't need to go out of your way to
* avoid it in the guest.
*/
/* We need to leave the rrd locked during the grant copy. */
td = rcu_lock_domain_by_id(trans_domid);
if ( td == NULL )
PIN_FAIL(unlock_out_clear, GNTST_general_error,
"transitive grant referenced bad domain %d\n",
trans_domid);
/*
* acquire_grant_for_copy() will take the lock on the remote table,
* so we have to drop the lock here and reacquire.
*/
active_entry_release(act);
grant_read_unlock(rgt);
rc = acquire_grant_for_copy(td, trans_gref, rd->domain_id,
readonly, &grant_mfn, page,
&trans_page_off, &trans_length,
false);
grant_read_lock(rgt);
act = active_entry_acquire(rgt, gref);
if ( rc != GNTST_okay )
{
rcu_unlock_domain(td);
reduce_status_for_pin(rd, act, status, readonly);
active_entry_release(act);
grant_read_unlock(rgt);
return rc;
}
/*
* We dropped the lock, so we have to check that the grant didn't
* change, and that nobody else tried to pin/unpin it. If anything
* changed, just give up and tell the caller to retry.
*/
if ( rgt->gt_version != 2 ||
act->pin != old_pin ||
(old_pin && (act->domid != ldom ||
!mfn_eq(act->mfn, grant_mfn) ||
act->start != trans_page_off ||
act->length != trans_length ||
act->src_domid != td->domain_id ||
act->trans_gref != trans_gref ||
!act->is_sub_page)) )
{
/*
* Like above for acquire_grant_for_copy() we need to drop and then
* re-acquire the locks here to prevent lock order inversion issues.
* Unlike for acquire_grant_for_copy() we don't need to re-check
* anything, as release_grant_for_copy() doesn't depend on the grant
* table entry: It only updates internal state and the status flags.
*/
active_entry_release(act);
grant_read_unlock(rgt);
release_grant_for_copy(td, trans_gref, readonly);
rcu_unlock_domain(td);
grant_read_lock(rgt);
act = active_entry_acquire(rgt, gref);
reduce_status_for_pin(rd, act, status, readonly);
active_entry_release(act);
grant_read_unlock(rgt);
put_page(*page);
*page = NULL;
return ERESTART;
}
if ( !old_pin )
{
act->domid = ldom;
act->start = trans_page_off;
act->length = trans_length;
act->src_domid = td->domain_id;
act->trans_gref = trans_gref;
act->mfn = grant_mfn;
act_set_gfn(act, INVALID_GFN);
/*
* The actual remote grant may or may not be a sub-page,
* but we always treat it as one because that blocks mappings of
* transitive grants.
*/
act->is_sub_page = true;
}
}
else if ( !old_pin ||
(!readonly && !(old_pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask))) )
{
if ( (rc = _set_status(shah, status, rd, rgt->gt_version, act,
readonly, 0, ldom)) != GNTST_okay )
goto unlock_out;
td = rd;
trans_gref = gref;
if ( !sha2 )
{
unsigned long gfn = shared_entry_v1(rgt, gref).frame;
rc = get_paged_frame(gfn, &grant_mfn, page, readonly, rd);
if ( rc != GNTST_okay )
goto unlock_out_clear;
act_set_gfn(act, _gfn(gfn));
is_sub_page = false;
trans_page_off = 0;
trans_length = PAGE_SIZE;
}
else if ( !(sha2->hdr.flags & GTF_sub_page) )
{
rc = get_paged_frame(sha2->full_page.frame, &grant_mfn, page,
readonly, rd);
if ( rc != GNTST_okay )
goto unlock_out_clear;
act_set_gfn(act, _gfn(sha2->full_page.frame));
is_sub_page = false;
trans_page_off = 0;
trans_length = PAGE_SIZE;
}
else
{
rc = get_paged_frame(sha2->sub_page.frame, &grant_mfn, page,
readonly, rd);
if ( rc != GNTST_okay )
goto unlock_out_clear;
act_set_gfn(act, _gfn(sha2->sub_page.frame));
is_sub_page = true;
trans_page_off = sha2->sub_page.page_off;
trans_length = sha2->sub_page.length;
}
if ( !act->pin )
{
act->domid = ldom;
act->is_sub_page = is_sub_page;
act->start = trans_page_off;
act->length = trans_length;
act->src_domid = td->domain_id;
act->trans_gref = trans_gref;
act->mfn = grant_mfn;
}
}
else
{
ASSERT(mfn_valid(act->mfn));
*page = mfn_to_page(act->mfn);
td = page_get_owner_and_reference(*page);
/*
* act->pin being non-zero should guarantee the page to have a
* non-zero refcount and hence a valid owner (matching the one on
* record), with one exception: If the owning domain is dying we
* had better not make implications from pin count (map_grant_ref()
* updates pin counts before obtaining page references, for
* example).
*/
if ( td != rd || rd->is_dying )
{
if ( td )
put_page(*page);
*page = NULL;
rc = GNTST_bad_domain;
goto unlock_out_clear;
}
}
act->pin += pin_incr;
*page_off = act->start;
*length = act->length;
*mfn = act->mfn;
active_entry_release(act);
grant_read_unlock(rgt);
return rc;
unlock_out_clear:
reduce_status_for_pin(rd, act, status, readonly);
unlock_out:
active_entry_release(act);
gt_unlock_out:
grant_read_unlock(rgt);
return rc;
}
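/*
 * State cached for one side (source or destination) of a copy operation.
 * The domain reference, grant pin, page type and mapping are kept across
 * consecutive ops in a batch and are only re-acquired when the guest
 * supplied pointer changes (see gnttab_copy_one()).
 */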
struct gnttab_copy_buf {
/* Guest provided. */
struct gnttab_copy_ptr ptr;
uint16_t len;
/* Mapped etc. */
struct domain *domain;
mfn_t mfn;
struct page_info *page;
void *virt;
bool_t read_only;
bool_t have_grant;
bool_t have_type;
};
static int gnttab_copy_lock_domain(domid_t domid, bool is_gref,
struct gnttab_copy_buf *buf)
{
/* Only DOMID_SELF may reference via frame. */
if ( domid != DOMID_SELF && !is_gref )
return GNTST_permission_denied;
buf->domain = rcu_lock_domain_by_any_id(domid);
if ( !buf->domain )
return GNTST_bad_domain;
buf->ptr.domid = domid;
return GNTST_okay;
}
static void gnttab_copy_unlock_domains(struct gnttab_copy_buf *src,
struct gnttab_copy_buf *dest)
{
if ( src->domain )
{
rcu_unlock_domain(src->domain);
src->domain = NULL;
}
if ( dest->domain )
{
rcu_unlock_domain(dest->domain);
dest->domain = NULL;
}
}
static int gnttab_copy_lock_domains(const struct gnttab_copy *op,
struct gnttab_copy_buf *src,
struct gnttab_copy_buf *dest)
{
int rc;
rc = gnttab_copy_lock_domain(op->source.domid,
op->flags & GNTCOPY_source_gref, src);
if ( rc < 0 )
goto error;
rc = gnttab_copy_lock_domain(op->dest.domid,
op->flags & GNTCOPY_dest_gref, dest);
if ( rc < 0 )
goto error;
rc = xsm_grant_copy(XSM_HOOK, src->domain, dest->domain);
if ( rc < 0 )
{
rc = GNTST_permission_denied;
goto error;
}
return 0;
error:
gnttab_copy_unlock_domains(src, dest);
return rc;
}
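/*
 * Undo gnttab_copy_claim_buf(): unmap the buffer and drop the grant pin,
 * page type and page reference taken for it, as applicable.
 */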
static void gnttab_copy_release_buf(struct gnttab_copy_buf *buf)
{
if ( buf->virt )
{
unmap_domain_page(buf->virt);
buf->virt = NULL;
}
if ( buf->have_grant )
{
release_grant_for_copy(buf->domain, buf->ptr.u.ref, buf->read_only);
buf->have_grant = 0;
}
if ( buf->have_type )
{
put_page_type(buf->page);
buf->have_type = 0;
}
if ( buf->page )
{
put_page(buf->page);
buf->page = NULL;
}
}
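/*
 * Prepare one side of a copy: pin the frame via its grant reference or look
 * it up by GMFN, get a writable page type if it is the destination, and map
 * it into hypervisor address space.
 */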
static int gnttab_copy_claim_buf(const struct gnttab_copy *op,
const struct gnttab_copy_ptr *ptr,
struct gnttab_copy_buf *buf,
unsigned int gref_flag)
{
int rc;
buf->read_only = gref_flag == GNTCOPY_source_gref;
if ( op->flags & gref_flag )
{
rc = acquire_grant_for_copy(buf->domain, ptr->u.ref,
current->domain->domain_id,
buf->read_only,
&buf->mfn, &buf->page,
&buf->ptr.offset, &buf->len,
opt_transitive_grants);
if ( rc != GNTST_okay )
goto out;
buf->ptr.u.ref = ptr->u.ref;
buf->have_grant = 1;
}
else
{
rc = get_paged_frame(ptr->u.gmfn, &buf->mfn, &buf->page,
buf->read_only, buf->domain);
if ( rc != GNTST_okay )
PIN_FAIL(out, rc,
"frame %"PRI_xen_pfn" invalid\n", ptr->u.gmfn);
buf->ptr.u.gmfn = ptr->u.gmfn;
buf->ptr.offset = 0;
buf->len = PAGE_SIZE;
}
if ( !buf->read_only )
{
if ( !get_page_type(buf->page, PGT_writable_page) )
{
if ( !buf->domain->is_dying )
gdprintk(XENLOG_WARNING,
"Could not get writable frame %#"PRI_mfn"\n",
mfn_x(buf->mfn));
rc = GNTST_general_error;
goto out;
}
buf->have_type = 1;
}
buf->virt = map_domain_page(buf->mfn);
rc = GNTST_okay;
out:
return rc;
}
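/*
 * Check whether a previously claimed buffer still matches the guest
 * supplied pointer and hence can be re-used for this op.
 */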
static bool_t gnttab_copy_buf_valid(const struct gnttab_copy_ptr *p,
const struct gnttab_copy_buf *b,
bool_t has_gref)
{
if ( !b->virt )
return 0;
if ( has_gref )
return b->have_grant && p->u.ref == b->ptr.u.ref;
return p->u.gmfn == b->ptr.u.gmfn;
}
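/* Range-check the op against both buffers and perform the actual copy. */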
static int gnttab_copy_buf(const struct gnttab_copy *op,
struct gnttab_copy_buf *dest,
const struct gnttab_copy_buf *src)
{
int rc;
if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
((op->dest.offset + op->len) > PAGE_SIZE) )
PIN_FAIL(out, GNTST_bad_copy_arg, "copy beyond page area\n");
if ( op->source.offset < src->ptr.offset ||
op->source.offset + op->len > src->ptr.offset + src->len )
PIN_FAIL(out, GNTST_general_error,
"copy source out of bounds: %d < %d || %d > %d\n",
op->source.offset, src->ptr.offset,
op->len, src->len);
if ( op->dest.offset < dest->ptr.offset ||
op->dest.offset + op->len > dest->ptr.offset + dest->len )
PIN_FAIL(out, GNTST_general_error,
"copy dest out of bounds: %d < %d || %d > %d\n",
op->dest.offset, dest->ptr.offset,
op->len, dest->len);
/* Make sure the above checks are not bypassed speculatively */
block_speculation();
memcpy(dest->virt + op->dest.offset, src->virt + op->source.offset,
op->len);
gnttab_mark_dirty(dest->domain, dest->mfn);
rc = GNTST_okay;
out:
return rc;
}
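/*
 * Handle a single copy op, re-using the locked domains and claimed buffers
 * of the previous op where possible. A positive return value asks the
 * caller to retry the op (see acquire_grant_for_copy()).
 */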
static int gnttab_copy_one(const struct gnttab_copy *op,
struct gnttab_copy_buf *dest,
struct gnttab_copy_buf *src)
{
int rc;
if ( !src->domain || op->source.domid != src->ptr.domid ||
!dest->domain || op->dest.domid != dest->ptr.domid )
{
gnttab_copy_release_buf(src);
gnttab_copy_release_buf(dest);
gnttab_copy_unlock_domains(src, dest);
rc = gnttab_copy_lock_domains(op, src, dest);
if ( rc < 0 )
goto out;
}
/* Different source? */
if ( !gnttab_copy_buf_valid(&op->source, src,
op->flags & GNTCOPY_source_gref) )
{
gnttab_copy_release_buf(src);
rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
if ( rc )
goto out;
}
/* Different dest? */
if ( !gnttab_copy_buf_valid(&op->dest, dest,
op->flags & GNTCOPY_dest_gref) )
{
gnttab_copy_release_buf(dest);
rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
if ( rc )
goto out;
}
rc = gnttab_copy_buf(op, dest, src);
out:
return rc;
}
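/*
 * Illustrative guest-side usage (not part of the hypervisor; shown only to
 * document the interface being implemented here). A guest copies one of its
 * own pages into a page granted to it by another domain by filling a
 * gnttab_copy_t and issuing GNTTABOP_copy; "local_gmfn", "remote_gref" and
 * "remote_domid" are placeholders, and HYPERVISOR_grant_table_op() is the
 * guest's hypercall wrapper:
 *
 *     gnttab_copy_t op = {
 *         .source.u.gmfn = local_gmfn,  .source.domid = DOMID_SELF,
 *         .dest.u.ref    = remote_gref, .dest.domid   = remote_domid,
 *         .source.offset = 0, .dest.offset = 0, .len = PAGE_SIZE,
 *         .flags = GNTCOPY_dest_gref,
 *     };
 *     HYPERVISOR_grant_table_op(GNTTABOP_copy, &op, 1);
 *     // success iff op.status == GNTST_okay
 */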
/*
 * gnttab_copy(), unlike the various other helpers of
 * do_grant_table_op(), returns (besides possible error indicators)
 * "count - i" rather than "i" to ensure that even if no progress
 * was made at all (perhaps due to gnttab_copy_one() returning a
 * positive value) a non-zero value is handed back (zero needs
 * to be avoided, as that means "success, all done").
 */
static long gnttab_copy(
XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) uop, unsigned int count)
{
unsigned int i;
struct gnttab_copy op;
struct gnttab_copy_buf src = {};
struct gnttab_copy_buf dest = {};
long rc = 0;
for ( i = 0; i < count; i++ )
{
if ( i && hypercall_preempt_check() )
{
rc = count - i;
break;
}
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
{
rc = -EFAULT;
break;
}
rc = gnttab_copy_one(&op, &dest, &src);
if ( rc > 0 )
{
rc = count - i;
break;
}
if ( rc != GNTST_okay )
{
gnttab_copy_release_buf(&src);
gnttab_copy_release_buf(&dest);
}
op.status = rc;
rc = 0;
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
{
rc = -EFAULT;
break;
}
guest_handle_add_offset(uop, 1);
}
gnttab_copy_release_buf(&src);
gnttab_copy_release_buf(&dest);
gnttab_copy_unlock_domains(&src, &dest);
return rc;
}
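/*
 * GNTTABOP_set_version: switch the calling domain's grant table between
 * v1 and v2. Only the first GNTTAB_NR_RESERVED_ENTRIES entries may still be
 * in use at this point; they are converted and preserved across the switch.
 */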
static long
gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
{
gnttab_set_version_t op;
struct domain *currd = current->domain;
struct grant_table *gt = currd->grant_table;
grant_entry_v1_t reserved_entries[GNTTAB_NR_RESERVED_ENTRIES];
int res;
unsigned int i, nr_ents;
if ( copy_from_guest(&op, uop, 1) )
return -EFAULT;
res = -EINVAL;
if ( op.version != 1 && op.version != 2 )
goto out;
res = -ENOSYS;
if ( op.version == 2 && gt->max_version == 1 )
goto out; /* Behave as before set_version was introduced. */
res = 0;
if ( gt->gt_version == op.version )
goto out;
grant_write_lock(gt);
/*
* Make sure that the grant table isn't currently in use when we
* change the version number, except for the first 8 entries which
* are allowed to be in use (xenstore/xenconsole keeps them mapped).
* (You need to change the version number for e.g. kexec.)
*/
nr_ents = nr_grant_entries(gt);
for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_ents; i++ )
{
if ( read_atomic(&_active_entry(gt, i).pin) != 0 )
{
gdprintk(XENLOG_WARNING,
"tried to change grant table version from %u to %u, but some grant entries still in use\n",
gt->gt_version, op.version);
res = -EBUSY;
goto out_unlock;
}
}
switch ( gt->gt_version )
{
case 1:
/* XXX: We could maybe shrink the active grant table here. */
res = gnttab_populate_status_frames(currd, gt, nr_grant_frames(gt));
if ( res < 0 )
goto out_unlock;
break;
case 2:
for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
{
switch ( shared_entry_v2(gt, i).hdr.flags & GTF_type_mask )
{
case GTF_permit_access:
if ( !(shared_entry_v2(gt, i).full_page.frame >> 32) )
break;
/* fall through */
case GTF_transitive:
gdprintk(XENLOG_WARNING,
"tried to change grant table version to 1 with non-representable entries\n");
res = -ERANGE;
goto out_unlock;
}
}
break;
}
/* Preserve the first 8 entries (toolstack reserved grants). */
switch ( gt->gt_version )
{
case 1:
memcpy(reserved_entries, &shared_entry_v1(gt, 0),
sizeof(reserved_entries));
break;
case 2:
for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
{
unsigned int flags = shared_entry_v2(gt, i).hdr.flags;
switch ( flags & GTF_type_mask )
{
case GTF_permit_access:
reserved_entries[i].flags = flags | status_entry(gt, i);
reserved_entries[i].domid = shared_entry_v2(gt, i).hdr.domid;
reserved_entries[i].frame = shared_entry_v2(gt, i).full_page.frame;
break;
default:
gdprintk(XENLOG_INFO,
"bad flags %#x in grant %#x when switching version\n",
flags, i);
/* fall through */
case GTF_invalid:
memset(&reserved_entries[i], 0, sizeof(reserved_entries[i]));
break;
}
}
break;
}
if ( op.version < 2 && gt->gt_version == 2 &&
(res = gnttab_unpopulate_status_frames(currd, gt)) != 0 )
goto out_unlock;
/* Make sure there's no crud left over from the old version. */
for ( i = 0; i < nr_grant_frames(gt); i++ )
clear_page(gt->shared_raw[i]);
/* Restore the first 8 entries (toolstack reserved grants). */
if ( gt->gt_version )
{
switch ( op.version )
{
case 1:
memcpy(&shared_entry_v1(gt, 0), reserved_entries, sizeof(reserved_entries));
break;
case 2:
for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
{
status_entry(gt, i) =
reserved_entries[i].flags & (GTF_reading | GTF_writing);
shared_entry_v2(gt, i).hdr.flags =
reserved_entries[i].flags & ~(GTF_reading | GTF_writing);
shared_entry_v2(gt, i).hdr.domid =
reserved_entries[i].domid;
shared_entry_v2(gt, i).full_page.frame =
reserved_entries[i].frame;
}
break;
}
}
gt->gt_version = op.version;
out_unlock:
grant_write_unlock(gt);
out:
op.version = gt->gt_version;
if ( __copy_to_guest(uop, &op, 1) )
res = -EFAULT;
return res;
}
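/*
 * GNTTABOP_get_status_frames: report the GFNs of the (v2) grant status
 * frames of domain op.dom.
 */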
static long
gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
unsigned int count)
{
gnttab_get_status_frames_t op;
struct domain *d;
struct grant_table *gt;
uint64_t gmfn;
int i;
int rc;
if ( count != 1 )
return -EINVAL;
if ( unlikely(copy_from_guest(&op, uop, 1) != 0) )
{
gdprintk(XENLOG_INFO,
"Fault while reading gnttab_get_status_frames_t\n");
return -EFAULT;
}
if ( !guest_handle_okay(op.frame_list, op.nr_frames) )
return -EFAULT;
d = rcu_lock_domain_by_any_id(op.dom);
if ( d == NULL )
{
op.status = GNTST_bad_domain;
goto out1;
}
rc = xsm_grant_setup(XSM_TARGET, current->domain, d);
if ( rc )
{
op.status = GNTST_permission_denied;
goto out2;
}
gt = d->grant_table;
op.status = GNTST_okay;
grant_read_lock(gt);
if ( unlikely(op.nr_frames > nr_status_frames(gt)) )
{
gdprintk(XENLOG_INFO, "Requested addresses of d%d for %u grant "
"status frames, but has only %u\n",
d->domain_id, op.nr_frames, nr_status_frames(gt));
op.status = GNTST_general_error;
goto unlock;
}
for ( i = 0; i < op.nr_frames; i++ )
{
gmfn = gfn_x(gnttab_status_gfn(d, gt, i));
if ( __copy_to_guest_offset(op.frame_list, i, &gmfn, 1) )
{
op.status = GNTST_bad_virt_addr;
break;
}
}
unlock:
grant_read_unlock(gt);
out2:
rcu_unlock_domain(d);
out1:
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
return -EFAULT;
return 0;
}
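/* GNTTABOP_get_version: report the grant table version in use by op.dom. */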
static long
gnttab_get_version(XEN_GUEST_HANDLE_PARAM(gnttab_get_version_t) uop)
{
gnttab_get_version_t op;
struct domain *d;
int rc;
if ( copy_from_guest(&op, uop, 1) )
return -EFAULT;
d = rcu_lock_domain_by_any_id(op.dom);
if ( d == NULL )
return -ESRCH;
rc = xsm_grant_query_size(XSM_TARGET, current->domain, d);
if ( rc )
{
rcu_unlock_domain(d);
return rc;
}
op.version = d->grant_table->gt_version;
rcu_unlock_domain(d);
if ( __copy_field_to_guest(uop, &op, version) )
return -EFAULT;
return 0;
}
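/*
 * Swap the shared (and, for v2, status) entries of two grant references of
 * the calling domain which are not currently pinned.
 */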
static s16
swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
{
struct domain *d = rcu_lock_current_domain();
struct grant_table *gt = d->grant_table;
struct active_grant_entry *act_a = NULL;
struct active_grant_entry *act_b = NULL;
s16 rc = GNTST_okay;
grant_write_lock(gt);
/* Bounds check on the grant refs */
if ( unlikely(ref_a >= nr_grant_entries(d->grant_table)))
PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-a %#x\n", ref_a);
if ( unlikely(ref_b >= nr_grant_entries(d->grant_table)))
PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b %#x\n", ref_b);
/* Make sure the above checks are not bypassed speculatively */
block_speculation();
/* Swapping the same ref is a no-op. */
if ( ref_a == ref_b )
goto out;
act_a = active_entry_acquire(gt, ref_a);
if ( act_a->pin )
PIN_FAIL(out, GNTST_eagain, "ref a %#x busy\n", ref_a);
act_b = active_entry_acquire(gt, ref_b);
if ( act_b->pin )
PIN_FAIL(out, GNTST_eagain, "ref b %#x busy\n", ref_b);
if ( evaluate_nospec(gt->gt_version == 1) )
{
grant_entry_v1_t shared;
shared = shared_entry_v1(gt, ref_a);
shared_entry_v1(gt, ref_a) = shared_entry_v1(gt, ref_b);
shared_entry_v1(gt, ref_b) = shared;
}
else
{
grant_entry_v2_t shared;
grant_status_t status;
shared = shared_entry_v2(gt, ref_a);
status = status_entry(gt, ref_a);
shared_entry_v2(gt, ref_a) = shared_entry_v2(gt, ref_b);
status_entry(gt, ref_a) = status_entry(gt, ref_b);
shared_entry_v2(gt, ref_b) = shared;
status_entry(gt, ref_b) = status;
}
out:
if ( act_b != NULL )
active_entry_release(act_b);
if ( act_a != NULL )
active_entry_release(act_a);
grant_write_unlock(gt);
rcu_unlock_domain(d);
return rc;
}
static long
gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
unsigned int count)
{
int i;
gnttab_swap_grant_ref_t op;
for ( i = 0; i < count; i++ )
{
if ( i && hypercall_preempt_check() )
return i;
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
return -EFAULT;
op.status = swap_grant_ref(op.ref_a, op.ref_b);
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
return -EFAULT;
guest_handle_add_offset(uop, 1);
}
return 0;
}
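/*
 * Clean and/or invalidate the data cache for a single frame, identified by
 * machine address. For frames owned by another domain the caller must hold
 * an active grant to them. A positive return value indicates the operation
 * was preempted and should be retried (progress is recorded in *cur_ref).
 */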
static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
{
struct domain *d, *owner;
struct page_info *page;
mfn_t mfn;
struct active_grant_entry *act = NULL;
void *v;
int ret;
if ( (cflush->offset >= PAGE_SIZE) ||
(cflush->length > PAGE_SIZE) ||
(cflush->offset + cflush->length > PAGE_SIZE) ||
(cflush->op & ~(GNTTAB_CACHE_INVAL | GNTTAB_CACHE_CLEAN)) )
return -EINVAL;
if ( cflush->length == 0 || cflush->op == 0 )
return !*cur_ref ? 0 : -EILSEQ;
/* currently unimplemented */
if ( cflush->op & GNTTAB_CACHE_SOURCE_GREF )
return -EOPNOTSUPP;
d = rcu_lock_current_domain();
mfn = maddr_to_mfn(cflush->a.dev_bus_addr);
if ( !mfn_valid(mfn) )
{
rcu_unlock_domain(d);
return -EINVAL;
}
page = mfn_to_page(mfn);
owner = page_get_owner_and_reference(page);
if ( !owner || !owner->grant_table )
{
rcu_unlock_domain(d);
return -EPERM;
}
if ( d != owner )
{
grant_read_lock(owner->grant_table);
act = grant_map_exists(d, owner->grant_table, mfn, cur_ref);
if ( IS_ERR_OR_NULL(act) )
{
grant_read_unlock(owner->grant_table);
rcu_unlock_domain(d);
put_page(page);
return act ? PTR_ERR(act) : 1;
}
}
v = map_domain_page(mfn);
v += cflush->offset;
if ( (cflush->op & GNTTAB_CACHE_INVAL) && (cflush->op & GNTTAB_CACHE_CLEAN) )
ret = clean_and_invalidate_dcache_va_range(v, cflush->length);
else if ( cflush->op & GNTTAB_CACHE_INVAL )
ret = invalidate_dcache_va_range(v, cflush->length);
else if ( cflush->op & GNTTAB_CACHE_CLEAN )
ret = clean_dcache_va_range(v, cflush->length);
else
ret = 0;
if ( d != owner )
{
active_entry_release(act);
grant_read_unlock(owner->grant_table);
}
unmap_domain_page(v);
put_page(page);
rcu_unlock_domain(d);
return ret;
}
static long
gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
grant_ref_t *cur_ref,
unsigned int count)
{
unsigned int i;
gnttab_cache_flush_t op;
for ( i = 0; i < count; i++ )
{
if ( i && hypercall_preempt_check() )
return i;
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
return -EFAULT;
for ( ; ; )
{
int ret = _cache_flush(&op, cur_ref);
if ( ret < 0 )
return ret;
if ( ret == 0 )
break;
if ( hypercall_preempt_check() )
return i;
}
*cur_ref = 0;
guest_handle_add_offset(uop, 1);
}
*cur_ref = 0;
return 0;
}
long do_grant_table_op(
unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) uop, unsigned int count)
{
long rc;
unsigned int opaque_in = cmd & GNTTABOP_ARG_MASK, opaque_out = 0;
#ifdef CONFIG_PV_SHIM
if ( unlikely(pv_shim) )
return pv_shim_grant_table_op(cmd, uop, count);
#endif
if ( (int)count < 0 )
return -EINVAL;
if ( (cmd &= GNTTABOP_CMD_MASK) != GNTTABOP_cache_flush && opaque_in )
return -EINVAL;
rc = -EFAULT;
switch ( cmd )
{
case GNTTABOP_map_grant_ref:
{
XEN_GUEST_HANDLE_PARAM(gnttab_map_grant_ref_t) map =
guest_handle_cast(uop, gnttab_map_grant_ref_t);
if ( unlikely(!guest_handle_okay(map, count)) )
goto out;
rc = gnttab_map_grant_ref(map, count);
if ( rc > 0 )
{
guest_handle_add_offset(map, rc);
uop = guest_handle_cast(map, void);
}
break;
}
case GNTTABOP_unmap_grant_ref:
{
XEN_GUEST_HANDLE_PARAM(gnttab_unmap_grant_ref_t) unmap =
guest_handle_cast(uop, gnttab_unmap_grant_ref_t);
if ( unlikely(!guest_handle_okay(unmap, count)) )
goto out;
rc = gnttab_unmap_grant_ref(unmap, count);
if ( rc > 0 )
{
guest_handle_add_offset(unmap, rc);
uop = guest_handle_cast(unmap, void);
}
break;
}
case GNTTABOP_unmap_and_replace:
{
XEN_GUEST_HANDLE_PARAM(gnttab_unmap_and_replace_t) unmap =
guest_handle_cast(uop, gnttab_unmap_and_replace_t);
if ( unlikely(!guest_handle_okay(unmap, count)) )
goto out;
rc = gnttab_unmap_and_replace(unmap, count);
if ( rc > 0 )
{
guest_handle_add_offset(unmap, rc);
uop = guest_handle_cast(unmap, void);
}
break;
}
case GNTTABOP_setup_table:
rc = gnttab_setup_table(
guest_handle_cast(uop, gnttab_setup_table_t), count, UINT_MAX);
ASSERT(rc <= 0);
break;
case GNTTABOP_transfer:
{
XEN_GUEST_HANDLE_PARAM(gnttab_transfer_t) transfer =
guest_handle_cast(uop, gnttab_transfer_t);
if ( unlikely(!guest_handle_okay(transfer, count)) )
goto out;
rc = gnttab_transfer(transfer, count);
if ( rc > 0 )
{
guest_handle_add_offset(transfer, rc);
uop = guest_handle_cast(transfer, void);
}
break;
}
case GNTTABOP_copy:
{
XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) copy =
guest_handle_cast(uop, gnttab_copy_t);
if ( unlikely(!guest_handle_okay(copy, count)) )
goto out;
rc = gnttab_copy(copy, count);
if ( rc > 0 )
{
guest_handle_add_offset(copy, count - rc);
uop = guest_handle_cast(copy, void);
}
break;
}
case GNTTABOP_query_size:
rc = gnttab_query_size(
guest_handle_cast(uop, gnttab_query_size_t), count);
ASSERT(rc <= 0);
break;
case GNTTABOP_set_version:
rc = gnttab_set_version(guest_handle_cast(uop, gnttab_set_version_t));
break;
case GNTTABOP_get_status_frames:
rc = gnttab_get_status_frames(
guest_handle_cast(uop, gnttab_get_status_frames_t), count);
break;
case GNTTABOP_get_version:
rc = gnttab_get_version(guest_handle_cast(uop, gnttab_get_version_t));
break;
case GNTTABOP_swap_grant_ref:
{
XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) swap =
guest_handle_cast(uop, gnttab_swap_grant_ref_t);
if ( unlikely(!guest_handle_okay(swap, count)) )
goto out;
rc = gnttab_swap_grant_ref(swap, count);
if ( rc > 0 )
{
guest_handle_add_offset(swap, rc);
uop = guest_handle_cast(swap, void);
}
break;
}
case GNTTABOP_cache_flush:
{
XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) cflush =
guest_handle_cast(uop, gnttab_cache_flush_t);
if ( unlikely(!guest_handle_okay(cflush, count)) )
goto out;
rc = gnttab_cache_flush(cflush, &opaque_in, count);
if ( rc >= 0 )
{
guest_handle_add_offset(cflush, rc);
uop = guest_handle_cast(cflush, void);
opaque_out = opaque_in;
}
break;
}
default:
rc = -ENOSYS;
break;
}
out:
if ( rc > 0 || (opaque_out != 0 && rc == 0) )
{
/* Adjust rc, see gnttab_copy() for why this is needed. */
if ( cmd == GNTTABOP_copy )
rc = count - rc;
ASSERT(rc < count);
ASSERT((opaque_out & GNTTABOP_CMD_MASK) == 0);
rc = hypercall_create_continuation(__HYPERVISOR_grant_table_op, "ihi",
opaque_out | cmd, uop, count - rc);
}
return rc;
}
#ifdef CONFIG_COMPAT
#include "compat/grant_table.c"
#endif
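/*
 * Drop all grant mappings still tracked for a dying domain, releasing the
 * pins, page references and maptrack state they hold. May return -ERESTART
 * to allow preemption, in which case the caller is expected to re-invoke it.
 */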
int gnttab_release_mappings(struct domain *d)
{
struct grant_table *gt = d->grant_table, *rgt;
struct grant_mapping *map;
grant_ref_t ref;
grant_handle_t handle;
struct domain *rd;
struct active_grant_entry *act;
grant_entry_header_t *sha;
uint16_t *status;
struct page_info *pg;
BUG_ON(!d->is_dying);
if ( !gt || !gt->maptrack )
return 0;
for ( handle = gt->maptrack_limit; handle; )
{
mfn_t mfn;
/*
* Deal with full pages such that their freeing (in the body of the
* if()) remains simple.
*/
if ( handle < gt->maptrack_limit && !(handle % MAPTRACK_PER_PAGE) )
{
/*
* Changing maptrack_limit alters nr_maptrack_frames()'es return
* value. Free the then excess trailing page right here, rather
* than leaving it to grant_table_destroy() (and in turn requiring
* to leave gt->maptrack_limit unaltered).
*/
gt->maptrack_limit = handle;
FREE_XENHEAP_PAGE(gt->maptrack[nr_maptrack_frames(gt)]);
if ( hypercall_preempt_check() )
return -ERESTART;
}
--handle;
map = &maptrack_entry(gt, handle);
if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) )
continue;
ref = map->ref;
gdprintk(XENLOG_INFO, "Grant release %#x ref %#x flags %#x d%d\n",
handle, ref, map->flags, map->domid);
rd = rcu_lock_domain_by_id(map->domid);
if ( rd == NULL )
{
/* Nothing to clear up... */
map->flags = 0;
continue;
}
rgt = rd->grant_table;
grant_read_lock(rgt);
act = active_entry_acquire(rgt, ref);
sha = shared_entry_header(rgt, ref);
if ( rgt->gt_version == 1 )
status = &sha->flags;
else
status = &status_entry(rgt, ref);
pg = !is_iomem_page(act->mfn) ? mfn_to_page(act->mfn) : NULL;
if ( map->flags & GNTMAP_readonly )
{
if ( map->flags & GNTMAP_device_map )
{
BUG_ON(!(act->pin & GNTPIN_devr_mask));
act->pin -= GNTPIN_devr_inc;
if ( pg )
put_page(pg);
}
if ( map->flags & GNTMAP_host_map )
{
BUG_ON(!(act->pin & GNTPIN_hstr_mask));
act->pin -= GNTPIN_hstr_inc;
if ( pg && gnttab_release_host_mappings(d) )
put_page(pg);
}
}
else
{
if ( map->flags & GNTMAP_device_map )
{
BUG_ON(!(act->pin & GNTPIN_devw_mask));
act->pin -= GNTPIN_devw_inc;
if ( pg )
put_page_and_type(pg);
}
if ( map->flags & GNTMAP_host_map )
{
BUG_ON(!(act->pin & GNTPIN_hstw_mask));
act->pin -= GNTPIN_hstw_inc;
if ( pg && gnttab_release_host_mappings(d) )
{
if ( gnttab_host_mapping_get_page_type(false, d, rd) )
put_page_type(pg);
put_page(pg);
}
}
}
reduce_status_for_pin(rd, act, status, map->flags & GNTMAP_readonly);
mfn = act->mfn;
active_entry_release(act);
grant_read_unlock(rgt);
rcu_unlock_domain(rd);
map->flags = 0;
/*
* This is excessive in that a single such call would suffice per
* mapped MFN (or none at all, if no entry was ever inserted). But it
* should be the common case for an MFN to be mapped just once, and
* this way we don't need to further maintain the counters. We also
* don't want to leave cleaning up of the tree as a whole to the end
* of the function, as this could take quite some time.
*/
radix_tree_delete(&gt->maptrack_tree, mfn_x(mfn));
}
gt->maptrack_limit = 0;
FREE_XENHEAP_PAGE(gt->maptrack[0]);
radix_tree_destroy(&gt->maptrack_tree, NULL);
return 0;
}
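/* Log up to WARN_GRANT_MAX grant entries of d which are still pinned. */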
void grant_table_warn_active_grants(struct domain *d)
{
struct grant_table *gt = d->grant_table;
struct active_grant_entry *act;
grant_ref_t ref;
unsigned int nr_active = 0, nr_ents;
#define WARN_GRANT_MAX 10
grant_read_lock(gt);
nr_ents = nr_grant_entries(gt);
for ( ref = 0; ref != nr_ents; ref++ )
{
act = active_entry_acquire(gt, ref);
if ( !act->pin )
{
active_entry_release(act);
continue;
}
nr_active++;
if ( nr_active <= WARN_GRANT_MAX )
printk(XENLOG_G_DEBUG "d%d has active grant %x ("
#ifndef NDEBUG
"GFN %lx, "
#endif
"MFN: %#"PRI_mfn")\n",
d->domain_id, ref,
#ifndef NDEBUG
gfn_x(act->gfn),
#endif
mfn_x(act->mfn));
active_entry_release(act);
}
if ( nr_active > WARN_GRANT_MAX )
printk(XENLOG_G_DEBUG "d%d has too many (%d) active grants to report\n",
d->domain_id, nr_active);
grant_read_unlock(gt);
#undef WARN_GRANT_MAX
}
void
grant_table_destroy(
struct domain *d)
{
struct grant_table *t = d->grant_table;
int i;
if ( t == NULL )
return;
for ( i = 0; i < nr_grant_frames(t); i++ )
free_xenheap_page(t->shared_raw[i]);
xfree(t->shared_raw);
ASSERT(!t->maptrack_limit);
vfree(t->maptrack);
for ( i = 0; i < nr_active_grant_frames(t); i++ )
free_xenheap_page(t->active[i]);
xfree(t->active);
for ( i = 0; i < nr_status_frames(t); i++ )
free_xenheap_page(t->status[i]);
xfree(t->status);
xfree(t);
d->grant_table = NULL;
}
void grant_table_init_vcpu(struct vcpu *v)
{
spin_lock_init(&v->maptrack_freelist_lock);
v->maptrack_head = MAPTRACK_TAIL;
v->maptrack_tail = MAPTRACK_TAIL;
}
#ifdef CONFIG_MEM_SHARING
int mem_sharing_gref_to_gfn(struct grant_table *gt, grant_ref_t ref,
gfn_t *gfn, uint16_t *status)
{
int rc = 0;
uint16_t flags = 0;
grant_read_lock(gt);
if ( gt->gt_version < 1 )
rc = -EINVAL;
else if ( ref >= nr_grant_entries(gt) )
rc = -ENOENT;
else if ( evaluate_nospec(gt->gt_version == 1) )
{
const grant_entry_v1_t *sha1 = &shared_entry_v1(gt, ref);
flags = sha1->flags;
*gfn = _gfn(sha1->frame);
}
else
{
const grant_entry_v2_t *sha2 = &shared_entry_v2(gt, ref);
flags = sha2->hdr.flags;
if ( flags & GTF_sub_page )
*gfn = _gfn(sha2->sub_page.frame);
else
*gfn = _gfn(sha2->full_page.frame);
}
if ( !rc && (flags & GTF_type_mask) != GTF_permit_access )
rc = -ENXIO;
else if ( !rc && status )
{
if ( evaluate_nospec(gt->gt_version == 1) )
*status = flags;
else
*status = status_entry(gt, ref);
}
grant_read_unlock(gt);
return rc;
}
#endif
/* caller must hold write lock */
static int gnttab_get_status_frame_mfn(struct domain *d,
unsigned int idx, mfn_t *mfn)
{
const struct grant_table *gt = d->grant_table;
ASSERT(gt->gt_version == 2);
/* Make sure we have version equal to 2 even under speculation */
block_speculation();
if ( idx >= nr_status_frames(gt) )
{
unsigned int nr_status;
unsigned int nr_grant;
nr_status = idx + 1; /* sufficient frames to make idx valid */
if ( nr_status == 0 ) /* overflow? */
return -EINVAL;
nr_grant = status_to_grant_frames(nr_status);
if ( grant_to_status_frames(nr_grant) != nr_status ) /* overflow? */
return -EINVAL;
if ( nr_grant <= gt->max_grant_frames )
gnttab_grow_table(d, nr_grant);
/* check whether gnttab_grow_table() succeeded */
if ( idx >= nr_status_frames(gt) )
return -EINVAL;
}
/* Make sure idx is bounded wrt nr_status_frames */
*mfn = _mfn(virt_to_mfn(
gt->status[array_index_nospec(idx, nr_status_frames(gt))]));
return 0;
}
/* caller must hold write lock */
static int gnttab_get_shared_frame_mfn(struct domain *d,
unsigned int idx, mfn_t *mfn)
{
const struct grant_table *gt = d->grant_table;
ASSERT(gt->gt_version != 0);
if ( idx >= nr_grant_frames(gt) )
{
unsigned int nr_grant;
nr_grant = idx + 1; /* sufficient frames to make idx valid */
if ( nr_grant == 0 ) /* overflow? */
return -EINVAL;
if ( nr_grant <= gt->max_grant_frames )
gnttab_grow_table(d, nr_grant);
/* check whether gnttab_grow_table() succeeded */
if ( idx >= nr_grant_frames(gt) )
return -EINVAL;
}
/* Make sure idx is bounded wrt nr_grant_frames */
*mfn = _mfn(virt_to_mfn(
gt->shared_raw[array_index_nospec(idx, nr_grant_frames(gt))]));
return 0;
}
unsigned int gnttab_resource_max_frames(const struct domain *d, unsigned int id)
{
const struct grant_table *gt = d->grant_table;
unsigned int nr = 0;
/* Don't need the grant lock. This limit is fixed at domain create time. */
switch ( id )
{
case XENMEM_resource_grant_table_id_shared:
nr = gt->max_grant_frames;
break;
case XENMEM_resource_grant_table_id_status:
if ( GNTTAB_MAX_VERSION < 2 )
break;
nr = grant_to_status_frames(gt->max_grant_frames);
break;
}
return nr;
}
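/*
 * Acquire-resource backend for grant tables: look up (growing the table as
 * necessary) the MFNs of nr_frames shared or status frames starting at
 * @frame and return them in mfn_list[]. Returns nr_frames on success.
 */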
int gnttab_acquire_resource(
struct domain *d, unsigned int id, unsigned int frame,
unsigned int nr_frames, xen_pfn_t mfn_list[])
{
struct grant_table *gt = d->grant_table;
unsigned int i, final_frame;
mfn_t tmp;
void **vaddrs = NULL;
int rc = -EINVAL;
if ( !nr_frames )
return rc;
final_frame = frame + nr_frames - 1;
/* Grow table if necessary. */
grant_write_lock(gt);
switch ( id )
{
case XENMEM_resource_grant_table_id_shared:
vaddrs = gt->shared_raw;
rc = gnttab_get_shared_frame_mfn(d, final_frame, &tmp);
break;
case XENMEM_resource_grant_table_id_status:
if ( gt->gt_version != 2 )
break;
/* Check that void ** is a suitable representation for gt->status. */
BUILD_BUG_ON(!__builtin_types_compatible_p(
typeof(gt->status), grant_status_t **));
vaddrs = (void **)gt->status;
rc = gnttab_get_status_frame_mfn(d, final_frame, &tmp);
break;
}
/*
* Some older toolchains can't spot that vaddrs won't remain uninitialized
* on non-error paths, and hence it needs setting to NULL at the top of the
function. Keep the NULL check below nevertheless, as runtime safety.
*/
if ( !rc && !vaddrs )
{
ASSERT_UNREACHABLE();
rc = -ENODATA;
}
/* Any errors? Bad id, or from growing the table? */
if ( rc )
goto out;
for ( i = 0; i < nr_frames; ++i )
mfn_list[i] = virt_to_mfn(vaddrs[frame + i]);
/* Success. Passed nr_frames back to the caller. */
rc = nr_frames;
out:
grant_write_unlock(gt);
return rc;
}
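/*
 * Look up (growing the table as necessary) the MFN of shared/status frame
 * @idx of @d and record @gfn for it, taking a page reference to keep the
 * frame from being freed while the caller completes its operation.
 */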
int gnttab_map_frame(struct domain *d, unsigned long idx, gfn_t gfn, mfn_t *mfn)
{
int rc = 0;
struct grant_table *gt = d->grant_table;
bool status = false;
if ( gfn_eq(gfn, INVALID_GFN) )
{
ASSERT_UNREACHABLE();
return -EINVAL;
}
grant_write_lock(gt);
if ( evaluate_nospec(gt->gt_version == 2) && (idx & XENMAPIDX_grant_table_status) )
{
idx &= ~XENMAPIDX_grant_table_status;
status = true;
rc = gnttab_get_status_frame_mfn(d, idx, mfn);
}
else
rc = gnttab_get_shared_frame_mfn(d, idx, mfn);
if ( !rc )
{
struct page_info *pg = mfn_to_page(*mfn);
/*
* Make sure gnttab_unpopulate_status_frames() won't (successfully)
* free the page until our caller has completed its operation.
*/
if ( !get_page(pg, d) )
rc = -EBUSY;
else if ( (rc = gnttab_set_frame_gfn(gt, status, idx, gfn, *mfn)) )
put_page(pg);
}
grant_write_unlock(gt);
return rc;
}
static void gnttab_usage_print(struct domain *rd)
{
int first = 1;
grant_ref_t ref;
struct grant_table *gt = rd->grant_table;
unsigned int nr_ents;
printk(" -------- active -------- -------- shared --------\n");
printk("[ref] localdom mfn pin localdom gmfn flags\n");
grant_read_lock(gt);
printk("grant-table for remote d%d (v%u)\n"
" %u frames (%u max), %u maptrack frames (%u max)\n",
rd->domain_id, gt->gt_version,
nr_grant_frames(gt), gt->max_grant_frames,
nr_maptrack_frames(gt), gt->max_maptrack_frames);
nr_ents = nr_grant_entries(gt);
for ( ref = 0; ref != nr_ents; ref++ )
{
struct active_grant_entry *act;
struct grant_entry_header *sha;
uint16_t status;
uint64_t frame;
act = active_entry_acquire(gt, ref);
if ( !act->pin )
{
active_entry_release(act);
continue;
}
sha = shared_entry_header(gt, ref);
if ( gt->gt_version == 1 )
{
status = sha->flags;
frame = shared_entry_v1(gt, ref).frame;
}
else
{
frame = shared_entry_v2(gt, ref).full_page.frame;
status = status_entry(gt, ref);
}
first = 0;
/* [0xXXX] ddddd 0xXXXXX 0xXXXXXXXX ddddd 0xXXXXXX 0xXX */
printk("[0x%03x] %5d 0x%"PRI_mfn" 0x%08x %5d 0x%06"PRIx64" 0x%02x\n",
ref, act->domid, mfn_x(act->mfn), act->pin,
sha->domid, frame, status);
active_entry_release(act);
}
grant_read_unlock(gt);
if ( first )
printk("no active grant table entries\n");
}
static void cf_check gnttab_usage_print_all(unsigned char key)
{
struct domain *d;
printk("%s [ key '%c' pressed\n", __func__, key);
rcu_read_lock(&domlist_read_lock);
for_each_domain ( d )
gnttab_usage_print(d);
rcu_read_unlock(&domlist_read_lock);
printk("%s ] done\n", __func__);
}
static int __init cf_check gnttab_usage_init(void)
{
register_keyhandler('g', gnttab_usage_print_all,
"print grant table usage", 1);
return 0;
}
__initcall(gnttab_usage_init);
/*
* Local variables:
* mode: C
* c-file-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/