36 files changed, 1941 insertions, 318 deletions
diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt
new file mode 100644
index 000000000000..5518465290bf
--- /dev/null
+++ b/Documentation/target/tcmu-design.txt
@@ -0,0 +1,378 @@
+Contents:
+
+1) TCM Userspace Design
+  a) Background
+  b) Benefits
+  c) Design constraints
+  d) Implementation overview
+     i. Mailbox
+     ii. Command ring
+     iii. Data Area
+  e) Device discovery
+  f) Device events
+  g) Other contingencies
+2) Writing a user pass-through handler
+  a) Discovering and configuring TCMU uio devices
+  b) Waiting for events on the device(s)
+  c) Managing the command ring
+3) Command filtering and pass_level
+4) A final note
+
+
+TCM Userspace Design
+--------------------
+
+TCM is another name for LIO, an in-kernel iSCSI target (server).
+Existing TCM targets run in the kernel. TCMU (TCM in Userspace)
+allows userspace programs to be written which act as iSCSI targets.
+This document describes the design.
+
+The existing kernel provides modules for different SCSI transport
+protocols. TCM also modularizes the data storage. There are existing
+modules for file, block device, RAM or using another SCSI device as
+storage. These are called "backstores" or "storage engines". These
+built-in modules are implemented entirely as kernel code.
+
+Background:
+
+In addition to modularizing the transport protocol used for carrying
+SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes
+the actual data storage as well. These are referred to as "backstores"
+or "storage engines". The target comes with backstores that allow a
+file, a block device, RAM, or another SCSI device to be used for the
+local storage needed for the exported SCSI LUN. Like the rest of LIO,
+these are implemented entirely as kernel code.
+
+These backstores cover the most common use cases, but not all. One new
+use case that other non-kernel target solutions, such as tgt, are able
+to support is using Gluster's GLFS or Ceph's RBD as a backstore. The
+target then serves as a translator, allowing initiators to store data
+in these non-traditional networked storage systems, while still only
+using standard protocols themselves.
+
+If the target is a userspace process, supporting these is easy. tgt,
+for example, needs only a small adapter module for each, because the
+modules just use the available userspace libraries for RBD and GLFS.
+
+Adding support for these backstores in LIO is considerably more
+difficult, because LIO is entirely kernel code. Instead of undertaking
+the significant work to port the GLFS or RBD APIs and protocols to the
+kernel, another approach is to create a userspace pass-through
+backstore for LIO, "TCMU".
+
+
+Benefits:
+
+In addition to allowing relatively easy support for RBD and GLFS, TCMU
+will also allow easier development of new backstores. TCMU combines
+with the LIO loopback fabric to become something similar to FUSE
+(Filesystem in Userspace), but at the SCSI layer instead of the
+filesystem layer. A SUSE, if you will.
+
+The disadvantage is there are more distinct components to configure, and
+potentially to malfunction. This is unavoidable, but hopefully not
+fatal if we're careful to keep things as simple as possible.
+
+Design constraints:
+
+- Good performance: high throughput, low latency
+- Cleanly handle if userspace:
+   1) never attaches
+   2) hangs
+   3) dies
+   4) misbehaves
+- Allow future flexibility in user & kernel implementations
+- Be reasonably memory-efficient
+- Simple to configure & run
+- Simple to write a userspace backend
+
+
+Implementation overview:
+
+The core of the TCMU interface is a memory region that is shared
+between kernel and userspace. Within this region is: a control area
+(mailbox); a lockless producer/consumer circular buffer for commands
+to be passed up, and status returned; and an in/out data buffer area.
+
+TCMU uses the pre-existing UIO subsystem. UIO allows device driver
+development in userspace, and this is conceptually very close to the
+TCMU use case, except instead of a physical device, TCMU implements a
+memory-mapped layout designed for SCSI commands. Using UIO also
+benefits TCMU by handling device introspection (e.g. a way for
+userspace to determine how large the shared region is) and signaling
+mechanisms in both directions.
+
+There are no embedded pointers in the memory region. Everything is
+expressed as an offset from the region's starting address. This allows
+the ring to still work if the user process dies and is restarted with
+the region mapped at a different virtual address.
+
+See target_core_user.h for the struct definitions.
+
+The Mailbox:
+
+The mailbox is always at the start of the shared memory region, and
+contains a version, details about the starting offset and size of the
+command ring, and head and tail pointers to be used by the kernel and
+userspace (respectively) to put commands on the ring, and indicate
+when the commands are completed.
+
+version - 1 (userspace should abort if otherwise)
+flags - none yet defined.
+cmdr_off - The offset of the start of the command ring from the start
+of the memory region, to account for the mailbox size.
+cmdr_size - The size of the command ring. This does *not* need to be a
+power of two.
+cmd_head - Modified by the kernel to indicate when a command has been
+placed on the ring.
+cmd_tail - Modified by userspace to indicate when it has completed
+processing of a command.
+
+The Command Ring:
+
+Commands are placed on the ring by the kernel incrementing
+mailbox.cmd_head by the size of the command, modulo cmdr_size, and
+then signaling userspace via uio_event_notify(). Once the command is
+completed, userspace updates mailbox.cmd_tail in the same way and
+signals the kernel via a 4-byte write(). When cmd_head equals
+cmd_tail, the ring is empty -- no commands are currently waiting to be
+processed by userspace.
+
+TCMU commands start with a common header containing "len_op", a 32-bit
+value that stores the length, as well as the opcode in the lowest
+unused bits. Currently only two opcodes are defined, TCMU_OP_PAD and
+TCMU_OP_CMD. When userspace encounters a command with PAD opcode, it
+should skip ahead by the bytes in "length". (The kernel inserts PAD
+entries to ensure each CMD entry fits contiguously into the circular
+buffer.)
+
+When userspace handles a CMD, it finds the SCSI CDB (Command Data
+Block) via tcmu_cmd_entry.req.cdb_off. This is an offset from the
+start of the overall shared memory region, not the entry. The data
+in/out buffers are accessible via the req.iov[] array. Note that
+each iov.iov_base is also an offset from the start of the region.
+
+TCMU currently does not support BIDI operations.
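+
+As an illustration of the len_op encoding described above, the opcode
+and length can be unpacked roughly as follows (a sketch only, assuming
+the opcode occupies the low-order bits; real code should use the
+tcmu_hdr_get_op()/tcmu_hdr_get_len() helpers from target_core_user.h):
+
+/* Sketch: the 0x7 mask value is an assumption for illustration. */
+uint32_t len_op = ent->hdr.len_op;
+uint32_t op  = len_op & 0x7;   /* e.g. TCMU_OP_PAD or TCMU_OP_CMD */
+uint32_t len = len_op & ~0x7;  /* entry length, used to advance cmd_tail */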
+
+When completing a command, userspace sets rsp.scsi_status, and
+rsp.sense_buffer if necessary. Userspace then increments
+mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the
+kernel via the UIO method, a 4-byte write to the file descriptor.
+
+The Data Area:
+
+This is shared-memory space after the command ring. The organization
+of this area is not defined in the TCMU interface, and userspace
+should access only the parts referenced by pending iovs.
+
+
+Device Discovery:
+
+Other devices may be using UIO besides TCMU. Unrelated user processes
+may also be handling different sets of TCMU devices. TCMU userspace
+processes must find their devices by scanning sysfs
+class/uio/uio*/name. For TCMU devices, these names will be of the
+format:
+
+tcm-user/<hba_num>/<device_name>/<subtype>/<path>
+
+where "tcm-user" is common for all TCMU-backed UIO devices. <hba_num>
+and <device_name> allow userspace to find the device's path in the
+kernel target's configfs tree. Assuming the usual mount point, it is
+found at:
+
+/sys/kernel/config/target/core/user_<hba_num>/<device_name>
+
+This location contains attributes such as "hw_block_size", that
+userspace needs to know for correct operation.
+
+<subtype> will be a userspace-process-unique string to identify the
+TCMU device as expecting to be backed by a certain handler, and <path>
+will be an additional handler-specific string for the user process to
+configure the device, if needed. The name cannot contain ':', due to
+LIO limitations.
+
+For all devices so discovered, the user handler opens /dev/uioX and
+calls mmap():
+
+mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0)
+
+where size must be equal to the value read from
+/sys/class/uio/uioX/maps/map0/size.
+
+
+Device Events:
+
+If a new device is added or removed, a notification will be broadcast
+over netlink, using a generic netlink family name of "TCM-USER" and a
+multicast group named "config". This will include the UIO name as
+described in the previous section, as well as the UIO minor
+number. This should allow userspace to identify both the UIO device and
+the LIO device, so that after determining the device is supported
+(based on subtype) it can take the appropriate action.
+
+
+Other contingencies:
+
+Userspace handler process never attaches:
+
+- TCMU will post commands, and then abort them after a timeout period
+  (30 seconds.)
+
+Userspace handler process is killed:
+
+- It is still possible to restart and re-connect to TCMU
+  devices. Command ring is preserved. However, after the timeout period,
+  the kernel will abort pending tasks.
+
+Userspace handler process hangs:
+
+- The kernel will abort pending tasks after a timeout period.
+
+Userspace handler process is malicious:
+
+- The process can trivially break the handling of devices it controls,
+  but should not be able to access kernel memory outside its shared
+  memory areas.
+
+
+Writing a user pass-through handler (with example code)
+-------------------------------------------------------
+
+A user process handling a TCMU device must support the following:
+
+a) Discovering and configuring TCMU uio devices
+b) Waiting for events on the device(s)
+c) Managing the command ring: Parsing operations and commands,
+   performing work as needed, setting response fields (scsi_status and
+   possibly sense_buffer), updating cmd_tail, and notifying the kernel
+   that work has been finished
+
+First, consider instead writing a plugin for tcmu-runner. tcmu-runner
+implements all of this, and provides a higher-level API for plugin
+authors.
+
+TCMU is designed so that multiple unrelated processes can manage TCMU
+devices separately. All handlers should make sure to only open their
+devices, based upon a known subtype string.
+
+a) Discovering and configuring TCMU UIO devices:
+
+(error checking omitted for brevity)
+
+int fd, dev_fd, ret;
+char buf[256];
+unsigned long long map_len;
+void *map;
+
+fd = open("/sys/class/uio/uio0/name", O_RDONLY);
+ret = read(fd, buf, sizeof(buf));
+close(fd);
+buf[ret-1] = '\0'; /* null-terminate and chop off the \n */
+
+/* we only want uio devices whose name is a format we expect */
+if (strncmp(buf, "tcm-user", 8))
+        exit(-1);
+
+/* Further checking for subtype also needed here */
+
+fd = open("/sys/class/uio/uio0/maps/map0/size", O_RDONLY);
+ret = read(fd, buf, sizeof(buf));
+close(fd);
+buf[ret-1] = '\0'; /* null-terminate and chop off the \n */
+
+map_len = strtoull(buf, NULL, 0);
+
+dev_fd = open("/dev/uio0", O_RDWR);
+map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0);
+
+
+b) Waiting for events on the device(s)
+
+while (1) {
+        char buf[4];
+
+        int ret = read(dev_fd, buf, 4); /* will block */
+
+        handle_device_events(dev_fd, map);
+}
+
+
+c) Managing the command ring
+
+#include <linux/target_core_user.h>
+
+int handle_device_events(int fd, void *map)
+{
+        struct tcmu_mailbox *mb = map;
+        struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;
+        int did_some_work = 0;
+
+        /* Process events from cmd ring until we catch up with cmd_head */
+        while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) {
+
+                if (tcmu_hdr_get_op(&ent->hdr) == TCMU_OP_CMD) {
+                        uint8_t *cdb = (void *)mb + ent->req.cdb_off;
+                        bool success = true;
+
+                        /* Handle command here. */
+                        printf("SCSI opcode: 0x%x\n", cdb[0]);
+
+                        /* Set response fields */
+                        if (success)
+                                ent->rsp.scsi_status = SCSI_NO_SENSE;
+                        else {
+                                /* Also fill in rsp->sense_buffer here */
+                                ent->rsp.scsi_status = SCSI_CHECK_CONDITION;
+                        }
+                }
+                else {
+                        /* Do nothing for PAD entries */
+                }
+
+                /* update cmd_tail */
+                mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size;
+                ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;
+                did_some_work = 1;
+        }
+
+        /* Notify the kernel that work has been finished */
+        if (did_some_work) {
+                uint32_t buf = 0;
+
+                write(fd, &buf, 4);
+        }
+
+        return 0;
+}
+
+
+Command filtering and pass_level
+--------------------------------
+
+TCMU supports a "pass_level" option with valid values of 0 or 1. When
+the value is 0 (the default), nearly all SCSI commands received for
+the device are passed through to the handler. This allows maximum
+flexibility but increases the amount of code required by the handler,
+to support all mandatory SCSI commands. If pass_level is set to 1,
+then only IO-related commands are presented, and the rest are handled
+by LIO's in-kernel command emulation. The commands presented at level
+1 include all versions of:
+
+READ
+WRITE
+WRITE_VERIFY
+XDWRITEREAD
+WRITE_SAME
+COMPARE_AND_WRITE
+SYNCHRONIZE_CACHE
+UNMAP
+
+
+A final note
+------------
+
+Please be careful to return status codes as defined by the SCSI
+specifications. These are different from some values defined in the
+scsi/scsi.h include file. For example, CHECK CONDITION's status code
+is 2, not 1.
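+
+For instance, a handler sticking to the specification values might fill
+in scsi_status along these lines (a sketch reusing the success flag from
+the example above; literal values are used here to avoid header
+differences -- the kernel names them SAM_STAT_GOOD and
+SAM_STAT_CHECK_CONDITION):
+
+/* SAM status codes: GOOD is 0x00, CHECK CONDITION is 0x02 (not 0x01) */
+ent->rsp.scsi_status = success ? 0x00 : 0x02;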
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 0bea5776bcbc..3effa931fce2 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2185,7 +2185,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -2871,7 +2871,7 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); isert_init_send_wr(isert_conn, isert_cmd, - &isert_cmd->tx_desc.send_wr, true); + &isert_cmd->tx_desc.send_wr, false); isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; wr->send_wr_num += 1; } @@ -3140,7 +3140,7 @@ isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) accept_wait: ret = down_interruptible(&isert_np->np_sem); - if (max_accept > 5) + if (ret || max_accept > 5) return -ENODEV; spin_lock_bh(&np->np_thread_lock); diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 829752cfd73f..a902fa1db7af 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -112,6 +112,7 @@ static void qlt_abort_cmd_on_host_reset(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd); static void qlt_alloc_qfull_cmd(struct scsi_qla_host *vha, struct atio_from_isp *atio, uint16_t status, int qfull); +static void qlt_disable_vha(struct scsi_qla_host *vha); /* * Global Variables */ @@ -210,7 +211,7 @@ static inline void qlt_decr_num_pend_cmds(struct scsi_qla_host *vha) spin_unlock_irqrestore(&vha->hw->tgt.q_full_lock, flags); } -void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, +static void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, struct atio_from_isp *atio) { ql_dbg(ql_dbg_tgt, vha, 0xe072, @@ -433,7 +434,7 @@ static int qlt_reset(struct scsi_qla_host *vha, void *iocb, int mcmd) #if 0 /* FIXME: Re-enable Global event handling.. 
*/ /* Global event */ atomic_inc(&ha->tgt.qla_tgt->tgt_global_resets_count); - qlt_clear_tgt_db(ha->tgt.qla_tgt, 1); + qlt_clear_tgt_db(ha->tgt.qla_tgt); if (!list_empty(&ha->tgt.qla_tgt->sess_list)) { sess = list_entry(ha->tgt.qla_tgt->sess_list.next, typeof(*sess), sess_list_entry); @@ -515,7 +516,7 @@ static void qlt_schedule_sess_for_deletion(struct qla_tgt_sess *sess, } /* ha->hardware_lock supposed to be held on entry */ -static void qlt_clear_tgt_db(struct qla_tgt *tgt, bool local_only) +static void qlt_clear_tgt_db(struct qla_tgt *tgt) { struct qla_tgt_sess *sess; @@ -867,7 +868,7 @@ int qlt_stop_phase1(struct qla_tgt *tgt) mutex_lock(&vha->vha_tgt.tgt_mutex); spin_lock_irqsave(&ha->hardware_lock, flags); tgt->tgt_stop = 1; - qlt_clear_tgt_db(tgt, true); + qlt_clear_tgt_db(tgt); spin_unlock_irqrestore(&ha->hardware_lock, flags); mutex_unlock(&vha->vha_tgt.tgt_mutex); mutex_unlock(&qla_tgt_mutex); @@ -1462,12 +1463,13 @@ out_err: return -1; } -static inline void qlt_unmap_sg(struct scsi_qla_host *vha, - struct qla_tgt_cmd *cmd) +static void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd) { struct qla_hw_data *ha = vha->hw; - BUG_ON(!cmd->sg_mapped); + if (!cmd->sg_mapped) + return; + pci_unmap_sg(ha->pdev, cmd->sg, cmd->sg_cnt, cmd->dma_data_direction); cmd->sg_mapped = 0; @@ -2428,8 +2430,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, return 0; out_unmap_unlock: - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(&ha->hardware_lock, flags); return res; @@ -2506,8 +2507,7 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) return res; out_unlock_free_unmap: - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(&ha->hardware_lock, flags); return res; @@ -2741,8 +2741,7 @@ done: if (!ha_locked && !in_interrupt()) msleep(250); /* just in case */ - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); vha->hw->tgt.tgt_ops->free_cmd(cmd); } return; @@ -3087,8 +3086,7 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle, tfo = se_cmd->se_tfo; cmd->cmd_sent_to_fw = 0; - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); if (unlikely(status != CTIO_SUCCESS)) { switch (status & 0xFFFF) { @@ -5343,7 +5341,7 @@ void qlt_lport_deregister(struct scsi_qla_host *vha) EXPORT_SYMBOL(qlt_lport_deregister); /* Must be called under HW lock */ -void qlt_set_mode(struct scsi_qla_host *vha) +static void qlt_set_mode(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; @@ -5364,7 +5362,7 @@ void qlt_set_mode(struct scsi_qla_host *vha) } /* Must be called under HW lock */ -void qlt_clear_mode(struct scsi_qla_host *vha) +static void qlt_clear_mode(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; @@ -5428,8 +5426,7 @@ EXPORT_SYMBOL(qlt_enable_vha); * * Disable Target Mode and reset the adapter */ -void -qlt_disable_vha(struct scsi_qla_host *vha) +static void qlt_disable_vha(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; struct qla_tgt *tgt = vha->vha_tgt.qla_tgt; diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index 8ff330f7d6f5..332086776dfe 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -1001,11 +1001,11 @@ struct qla_tgt_prm { struct qla_tgt *tgt; void *pkt; struct scatterlist *sg; /* cmd data buffer SG vector */ + unsigned char *sense_buffer; int seg_cnt; int req_cnt; uint16_t rq_result; uint16_t 
scsi_status; - unsigned char *sense_buffer; int sense_buffer_len; int residual; int add_status_pkt; @@ -1033,10 +1033,6 @@ struct qla_tgt_srr_ctio { extern struct qla_tgt_data qla_target; -/* - * Internal function prototypes - */ -void qlt_disable_vha(struct scsi_qla_host *); /* * Function prototypes for qla_target.c logic used by qla2xxx LLD code. @@ -1049,8 +1045,6 @@ extern void qlt_lport_deregister(struct scsi_qla_host *); extern void qlt_unreg_sess(struct qla_tgt_sess *); extern void qlt_fc_port_added(struct scsi_qla_host *, fc_port_t *); extern void qlt_fc_port_deleted(struct scsi_qla_host *, fc_port_t *); -extern void qlt_set_mode(struct scsi_qla_host *ha); -extern void qlt_clear_mode(struct scsi_qla_host *ha); extern int __init qlt_init(void); extern void qlt_exit(void); extern void qlt_update_vp_map(struct scsi_qla_host *, int); @@ -1083,13 +1077,9 @@ static inline void qla_reverse_ini_mode(struct scsi_qla_host *ha) /* * Exported symbols from qla_target.c LLD logic used by qla2xxx code.. */ -extern void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *, - struct atio_from_isp *); extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, response_t *); extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *); extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t); -extern int qlt_rdy_to_xfer_dif(struct qla_tgt_cmd *); -extern int qlt_xmit_response_dif(struct qla_tgt_cmd *, int, uint8_t); extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *); extern void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *); extern void qlt_free_cmd(struct qla_tgt_cmd *cmd); diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index 031b2961c6b7..73f9feecda72 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -786,7 +786,16 @@ static void tcm_qla2xxx_clear_nacl_from_fcport_map(struct qla_tgt_sess *sess) pr_debug("fc_rport domain: port_id 0x%06x\n", nacl->nport_id); node = btree_remove32(&lport->lport_fcport_map, nacl->nport_id); - WARN_ON(node && (node != se_nacl)); + if (WARN_ON(node && (node != se_nacl))) { + /* + * The nacl no longer matches what we think it should be. + * Most likely a new dynamic acl has been added while + * someone dropped the hardware lock. It clearly is a + * bug elsewhere, but this bit can't make things worse. 
+ */ + btree_insert32(&lport->lport_fcport_map, nacl->nport_id, + node, GFP_ATOMIC); + } pr_debug("Removed from fcport_map: %p for WWNN: 0x%016LX, port_id: 0x%06x\n", se_nacl, nacl->nport_wwnn, nacl->nport_id); diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index dc2d84ac5a0e..81d44c477a5b 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -31,6 +31,13 @@ config TCM_PSCSI Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device +config TCM_USER + tristate "TCM/USER Subsystem Plugin for Linux" + depends on UIO && NET + help + Say Y here to enable the TCM/USER subsystem plugin for a userspace + process to handle requests + source "drivers/target/loopback/Kconfig" source "drivers/target/tcm_fc/Kconfig" source "drivers/target/iscsi/Kconfig" diff --git a/drivers/target/Makefile b/drivers/target/Makefile index 85b012d2f89b..bbb4a7d638ef 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TARGET_CORE) += target_core_mod.o obj-$(CONFIG_TCM_IBLOCK) += target_core_iblock.o obj-$(CONFIG_TCM_FILEIO) += target_core_file.o obj-$(CONFIG_TCM_PSCSI) += target_core_pscsi.o +obj-$(CONFIG_TCM_USER) += target_core_user.o # Fabric modules obj-$(CONFIG_LOOPBACK_TARGET) += loopback/ diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 260c3e1e312c..b19e4329ba00 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3709,7 +3709,6 @@ static inline void iscsit_thread_check_cpumask( struct task_struct *p, int mode) { - char buf[128]; /* * mode == 1 signals iscsi_target_tx_thread() usage. * mode == 0 signals iscsi_target_rx_thread() usage. @@ -3728,8 +3727,6 @@ static inline void iscsit_thread_check_cpumask( * both TX and RX kthreads are scheduled to run on the * same CPU. 
*/ - memset(buf, 0, 128); - cpumask_scnprintf(buf, 128, conn->conn_cpumask); set_cpus_allowed_ptr(p, conn->conn_cpumask); } @@ -4326,8 +4323,7 @@ int iscsit_close_connection( if (conn->conn_tx_hash.tfm) crypto_free_hash(conn->conn_tx_hash.tfm); - if (conn->conn_cpumask) - free_cpumask_var(conn->conn_cpumask); + free_cpumask_var(conn->conn_cpumask); kfree(conn->conn_ops); conn->conn_ops = NULL; diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index ae03f3e5de1e..9059c1e0b26e 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -669,12 +669,10 @@ static ssize_t lio_target_nacl_show_info( } else { sess = se_sess->fabric_sess_ptr; - if (sess->sess_ops->InitiatorName) - rb += sprintf(page+rb, "InitiatorName: %s\n", - sess->sess_ops->InitiatorName); - if (sess->sess_ops->InitiatorAlias) - rb += sprintf(page+rb, "InitiatorAlias: %s\n", - sess->sess_ops->InitiatorAlias); + rb += sprintf(page+rb, "InitiatorName: %s\n", + sess->sess_ops->InitiatorName); + rb += sprintf(page+rb, "InitiatorAlias: %s\n", + sess->sess_ops->InitiatorAlias); rb += sprintf(page+rb, "LIO Session ID: %u " "ISID: 0x%02x %02x %02x %02x %02x %02x " diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index 0d1e6ee3e992..a0ae5fc0ad75 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -345,7 +345,6 @@ static int iscsit_dataout_check_datasn( struct iscsi_cmd *cmd, unsigned char *buf) { - int dump = 0, recovery = 0; u32 data_sn = 0; struct iscsi_conn *conn = cmd->conn; struct iscsi_data *hdr = (struct iscsi_data *) buf; @@ -370,13 +369,11 @@ static int iscsit_dataout_check_datasn( pr_err("Command ITT: 0x%08x, received DataSN: 0x%08x" " higher than expected 0x%08x.\n", cmd->init_task_tag, be32_to_cpu(hdr->datasn), data_sn); - recovery = 1; goto recover; } else if (be32_to_cpu(hdr->datasn) < data_sn) { pr_err("Command ITT: 0x%08x, received DataSN: 0x%08x" " lower than expected 0x%08x, discarding payload.\n", cmd->init_task_tag, be32_to_cpu(hdr->datasn), data_sn); - dump = 1; goto dump; } @@ -392,8 +389,7 @@ dump: if (iscsit_dump_data_payload(conn, payload_length, 1) < 0) return DATAOUT_CANNOT_RECOVER; - return (recovery || dump) ? 
DATAOUT_WITHIN_COMMAND_RECOVERY : - DATAOUT_NORMAL; + return DATAOUT_WITHIN_COMMAND_RECOVERY; } static int iscsit_dataout_pre_datapduinorder_yes( diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 5e71ac609418..480f2e0ecc11 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -978,8 +978,7 @@ int iscsit_setup_np( return 0; fail: np->np_socket = NULL; - if (sock) - sock_release(sock); + sock_release(sock); return ret; } @@ -1190,8 +1189,7 @@ old_sess_out: if (!IS_ERR(conn->conn_tx_hash.tfm)) crypto_free_hash(conn->conn_tx_hash.tfm); - if (conn->conn_cpumask) - free_cpumask_var(conn->conn_cpumask); + free_cpumask_var(conn->conn_cpumask); kfree(conn->conn_ops); @@ -1268,8 +1266,6 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) iscsit_put_transport(conn->conn_transport); kfree(conn); conn = NULL; - if (ret == -ENODEV) - goto out; /* Get another socket */ return 1; } diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 73355f4fca74..ce87ce9bdb9c 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -1481,8 +1481,9 @@ void iscsit_collect_login_stats( if (conn->param_list) intrname = iscsi_find_param_from_key(INITIATORNAME, conn->param_list); - strcpy(ls->last_intr_fail_name, - (intrname ? intrname->value : "Unknown")); + strlcpy(ls->last_intr_fail_name, + (intrname ? intrname->value : "Unknown"), + sizeof(ls->last_intr_fail_name)); ls->last_intr_fail_ip_family = conn->login_family; diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index 340de9d92b15..ab3ab27d49b7 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -153,18 +153,11 @@ static int tcm_loop_change_queue_type(struct scsi_device *sdev, int tag) /* * Locate the SAM Task Attr from struct scsi_cmnd * */ -static int tcm_loop_sam_attr(struct scsi_cmnd *sc) -{ - if (sc->device->tagged_supported) { - switch (sc->tag) { - case HEAD_OF_QUEUE_TAG: - return MSG_HEAD_TAG; - case ORDERED_QUEUE_TAG: - return MSG_ORDERED_TAG; - default: - break; - } - } +static int tcm_loop_sam_attr(struct scsi_cmnd *sc, int tag) +{ + if (sc->device->tagged_supported && + sc->device->ordered_tags && tag >= 0) + return MSG_ORDERED_TAG; return MSG_SIMPLE_TAG; } @@ -227,7 +220,7 @@ static void tcm_loop_submission_work(struct work_struct *work) rc = target_submit_cmd_map_sgls(se_cmd, tl_nexus->se_sess, sc->cmnd, &tl_cmd->tl_sense_buf[0], tl_cmd->sc->device->lun, - transfer_length, tcm_loop_sam_attr(sc), + transfer_length, tcm_loop_sam_attr(sc, tl_cmd->sc_cmd_tag), sc->sc_data_direction, 0, scsi_sglist(sc), scsi_sg_count(sc), sgl_bidi, sgl_bidi_count, @@ -266,7 +259,7 @@ static int tcm_loop_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) } tl_cmd->sc = sc; - tl_cmd->sc_cmd_tag = sc->tag; + tl_cmd->sc_cmd_tag = sc->request->tag; INIT_WORK(&tl_cmd->work, tcm_loop_submission_work); queue_work(tcm_loop_workqueue, &tl_cmd->work); return 0; @@ -370,7 +363,7 @@ static int tcm_loop_abort_task(struct scsi_cmnd *sc) */ tl_tpg = &tl_hba->tl_hba_tpgs[sc->device->id]; ret = tcm_loop_issue_tmr(tl_tpg, tl_nexus, sc->device->lun, - sc->tag, TMR_ABORT_TASK); + sc->request->tag, TMR_ABORT_TASK); return (ret == TMR_FUNCTION_COMPLETE) ? 
SUCCESS : FAILED; } @@ -960,8 +953,7 @@ static int tcm_loop_port_link( struct tcm_loop_tpg, tl_se_tpg); struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; - atomic_inc(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tl_tpg->tl_tpg_port_count); /* * Add Linux/SCSI struct scsi_device by HCTL */ @@ -995,8 +987,7 @@ static void tcm_loop_port_unlink( scsi_remove_device(sd); scsi_device_put(sd); - atomic_dec(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tl_tpg->tl_tpg_port_count); pr_debug("TCM_Loop_ConfigFS: Port Unlink Successful\n"); } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index fbc5ebb5f761..fb87780929d2 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -392,8 +392,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) if (tg_pt_id != tg_pt_gp->tg_pt_gp_id) continue; - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -403,8 +402,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) found = true; spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -998,8 +996,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) * every I_T nexus other than the I_T nexus on which the SET * TARGET PORT GROUPS command */ - atomic_inc(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&mem->tg_pt_gp_mem_ref_cnt); spin_unlock(&tg_pt_gp->tg_pt_gp_lock); spin_lock_bh(&port->sep_alua_lock); @@ -1028,8 +1025,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) spin_unlock_bh(&port->sep_alua_lock); spin_lock(&tg_pt_gp->tg_pt_gp_lock); - atomic_dec(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&mem->tg_pt_gp_mem_ref_cnt); } spin_unlock(&tg_pt_gp->tg_pt_gp_lock); /* @@ -1063,7 +1059,6 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state)); spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (tg_pt_gp->tg_pt_gp_transition_complete) @@ -1125,7 +1120,6 @@ static int core_alua_do_transition_tg_pt( */ spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { @@ -1168,7 +1162,6 @@ int core_alua_do_port_transition( spin_lock(&local_lu_gp_mem->lu_gp_mem_lock); lu_gp = local_lu_gp_mem->lu_gp; atomic_inc(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&local_lu_gp_mem->lu_gp_mem_lock); /* * For storage objects that are members of the 'default_lu_gp', @@ -1184,8 +1177,7 @@ int core_alua_do_port_transition( l_tg_pt_gp->tg_pt_gp_alua_nacl = l_nacl; rc = core_alua_do_transition_tg_pt(l_tg_pt_gp, new_state, explicit); - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } /* @@ -1198,8 +1190,7 @@ int core_alua_do_port_transition( lu_gp_mem_list) { dev = lu_gp_mem->lu_gp_mem_dev; - atomic_inc(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); 
spin_unlock(&lu_gp->lu_gp_lock); spin_lock(&dev->t10_alua.tg_pt_gps_lock); @@ -1227,8 +1218,7 @@ int core_alua_do_port_transition( tg_pt_gp->tg_pt_gp_alua_port = NULL; tg_pt_gp->tg_pt_gp_alua_nacl = NULL; } - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); /* * core_alua_do_transition_tg_pt() will always return @@ -1238,16 +1228,14 @@ int core_alua_do_port_transition( new_state, explicit); spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); if (rc) break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); spin_lock(&lu_gp->lu_gp_lock); - atomic_dec(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); } spin_unlock(&lu_gp->lu_gp_lock); @@ -1260,8 +1248,7 @@ int core_alua_do_port_transition( core_alua_dump_state(new_state)); } - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 756def38c77a..79f9296a08ae 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -665,6 +665,9 @@ SE_DEV_ATTR(is_nonrot, S_IRUGO | S_IWUSR); DEF_DEV_ATTRIB(emulate_rest_reord); SE_DEV_ATTR(emulate_rest_reord, S_IRUGO | S_IWUSR); +DEF_DEV_ATTRIB(force_pr_aptpl); +SE_DEV_ATTR(force_pr_aptpl, S_IRUGO | S_IWUSR); + DEF_DEV_ATTRIB_RO(hw_block_size); SE_DEV_ATTR_RO(hw_block_size); @@ -719,6 +722,7 @@ static struct configfs_attribute *target_core_dev_attrib_attrs[] = { &target_core_dev_attrib_hw_pi_prot_type.attr, &target_core_dev_attrib_pi_prot_format.attr, &target_core_dev_attrib_enforce_pr_isids.attr, + &target_core_dev_attrib_force_pr_aptpl.attr, &target_core_dev_attrib_is_nonrot.attr, &target_core_dev_attrib_emulate_rest_reord.attr, &target_core_dev_attrib_hw_block_size.attr, @@ -1263,7 +1267,7 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( { unsigned char *i_fabric = NULL, *i_port = NULL, *isid = NULL; unsigned char *t_fabric = NULL, *t_port = NULL; - char *orig, *ptr, *arg_p, *opts; + char *orig, *ptr, *opts; substring_t args[MAX_OPT_ARGS]; unsigned long long tmp_ll; u64 sa_res_key = 0; @@ -1295,14 +1299,14 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( token = match_token(ptr, tokens, args); switch (token) { case Opt_initiator_fabric: - i_fabric = match_strdup(&args[0]); + i_fabric = match_strdup(args); if (!i_fabric) { ret = -ENOMEM; goto out; } break; case Opt_initiator_node: - i_port = match_strdup(&args[0]); + i_port = match_strdup(args); if (!i_port) { ret = -ENOMEM; goto out; @@ -1316,7 +1320,7 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( } break; case Opt_initiator_sid: - isid = match_strdup(&args[0]); + isid = match_strdup(args); if (!isid) { ret = -ENOMEM; goto out; @@ -1330,15 +1334,9 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( } break; case Opt_sa_res_key: - arg_p = match_strdup(&args[0]); - if (!arg_p) { - ret = -ENOMEM; - goto out; - } - ret = kstrtoull(arg_p, 0, &tmp_ll); + ret = kstrtoull(args->from, 0, &tmp_ll); if (ret < 0) { - pr_err("kstrtoull() failed for" - " sa_res_key=\n"); + pr_err("kstrtoull() failed for sa_res_key=\n"); goto out; } sa_res_key = (u64)tmp_ll; @@ -1370,14 +1368,14 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( * 
PR APTPL Metadata for Target Port */ case Opt_target_fabric: - t_fabric = match_strdup(&args[0]); + t_fabric = match_strdup(args); if (!t_fabric) { ret = -ENOMEM; goto out; } break; case Opt_target_node: - t_port = match_strdup(&args[0]); + t_port = match_strdup(args); if (!t_port) { ret = -ENOMEM; goto out; diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 98da90167159..c45f9e907e44 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -224,8 +224,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi( if (port->sep_rtpi != rtpi) continue; - atomic_inc(&deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->pr_ref_count); spin_unlock_irq(&nacl->device_list_lock); return deve; @@ -1019,6 +1018,23 @@ int se_dev_set_enforce_pr_isids(struct se_device *dev, int flag) return 0; } +int se_dev_set_force_pr_aptpl(struct se_device *dev, int flag) +{ + if ((flag != 0) && (flag != 1)) { + printk(KERN_ERR "Illegal value %d\n", flag); + return -EINVAL; + } + if (dev->export_count) { + pr_err("dev[%p]: Unable to set force_pr_aptpl while" + " export_count is %d\n", dev, dev->export_count); + return -EINVAL; + } + + dev->dev_attrib.force_pr_aptpl = flag; + pr_debug("dev[%p]: SE Device force_pr_aptpl: %d\n", dev, flag); + return 0; +} + int se_dev_set_is_nonrot(struct se_device *dev, int flag) { if ((flag != 0) && (flag != 1)) { @@ -1250,24 +1266,16 @@ struct se_lun *core_dev_add_lun( * * */ -int core_dev_del_lun( +void core_dev_del_lun( struct se_portal_group *tpg, - u32 unpacked_lun) + struct se_lun *lun) { - struct se_lun *lun; - - lun = core_tpg_pre_dellun(tpg, unpacked_lun); - if (IS_ERR(lun)) - return PTR_ERR(lun); - - core_tpg_post_dellun(tpg, lun); - - pr_debug("%s_TPG[%u]_LUN[%u] - Deactivated %s Logical Unit from" + pr_debug("%s_TPG[%u]_LUN[%u] - Deactivating %s Logical Unit from" " device object\n", tpg->se_tpg_tfo->get_fabric_name(), - tpg->se_tpg_tfo->tpg_get_tag(tpg), unpacked_lun, + tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun, tpg->se_tpg_tfo->get_fabric_name()); - return 0; + core_tpg_remove_lun(tpg, lun); } struct se_lun *core_get_lun_from_tpg(struct se_portal_group *tpg, u32 unpacked_lun) @@ -1396,8 +1404,7 @@ int core_dev_add_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_add_tail(&lacl->lacl_list, &lun->lun_acl_list); - atomic_inc(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_inc_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); pr_debug("%s_TPG[%hu]_LUN[%u->%u] - Added %s ACL for " @@ -1409,7 +1416,8 @@ int core_dev_add_initiator_node_lun_acl( * Check to see if there are any existing persistent reservation APTPL * pre-registrations that need to be enabled for this LUN ACL.. 
*/ - core_scsi3_check_aptpl_registration(lun->lun_se_dev, tpg, lun, lacl); + core_scsi3_check_aptpl_registration(lun->lun_se_dev, tpg, lun, nacl, + lacl->mapped_lun); return 0; } @@ -1430,8 +1438,7 @@ int core_dev_del_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_del(&lacl->lacl_list); - atomic_dec(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_dec_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); core_disable_device_list_for_node(lun, NULL, lacl->mapped_lun, @@ -1554,6 +1561,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) dev->dev_attrib.emulate_3pc = DA_EMULATE_3PC; dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE0_PROT; dev->dev_attrib.enforce_pr_isids = DA_ENFORCE_PR_ISIDS; + dev->dev_attrib.force_pr_aptpl = DA_FORCE_PR_APTPL; dev->dev_attrib.is_nonrot = DA_IS_NONROT; dev->dev_attrib.emulate_rest_reord = DA_EMULATE_REST_REORD; dev->dev_attrib.max_unmap_lba_count = DA_MAX_UNMAP_LBA_COUNT; diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 7de9f0475d05..0c3f90130b7d 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -320,7 +320,7 @@ static struct config_group *target_fabric_make_mappedlun( struct se_node_acl, acl_group); struct se_portal_group *se_tpg = se_nacl->se_tpg; struct target_fabric_configfs *tf = se_tpg->se_tpg_wwn->wwn_tf; - struct se_lun_acl *lacl; + struct se_lun_acl *lacl = NULL; struct config_item *acl_ci; struct config_group *lacl_cg = NULL, *ml_stat_grp = NULL; char *buf; @@ -406,6 +406,7 @@ static struct config_group *target_fabric_make_mappedlun( out: if (lacl_cg) kfree(lacl_cg->default_groups); + kfree(lacl); kfree(buf); return ERR_PTR(ret); } @@ -821,7 +822,7 @@ static int target_fabric_port_unlink( tf->tf_ops.fabric_pre_unlink(se_tpg, lun); } - core_dev_del_lun(se_tpg, lun->unpacked_lun); + core_dev_del_lun(se_tpg, lun); return 0; } @@ -910,16 +911,12 @@ static struct config_group *target_fabric_make_lun( GFP_KERNEL); if (!port_stat_grp->default_groups) { pr_err("Unable to allocate port_stat_grp->default_groups\n"); - errno = -ENOMEM; - goto out; + kfree(lun_cg->default_groups); + return ERR_PTR(-ENOMEM); } target_stat_setup_port_default_groups(lun); return &lun->lun_group; -out: - if (lun_cg) - kfree(lun_cg->default_groups); - return ERR_PTR(errno); } static void target_fabric_drop_lun( diff --git a/drivers/target/target_core_fabric_lib.c b/drivers/target/target_core_fabric_lib.c index 0d1cf8b4f49f..35bfe77160d8 100644 --- a/drivers/target/target_core_fabric_lib.c +++ b/drivers/target/target_core_fabric_lib.c @@ -394,9 +394,9 @@ char *iscsi_parse_pr_out_transport_id( * If the caller wants the TransportID Length, we set that value for the * entire iSCSI Tarnsport ID now. 
*/ - if (out_tid_len != NULL) { - add_len = ((buf[2] >> 8) & 0xff); - add_len |= (buf[3] & 0xff); + if (out_tid_len) { + /* The shift works thanks to integer promotion rules */ + add_len = (buf[2] << 8) | buf[3]; tid_len = strlen(&buf[4]); tid_len += 4; /* Add four bytes for iSCSI Transport ID header */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7d6cddaec525..72c83d98662b 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -415,7 +415,7 @@ fd_execute_sync_cache(struct se_cmd *cmd) } else { start = cmd->t_task_lba * dev->dev_attrib.block_size; if (cmd->data_length) - end = start + cmd->data_length; + end = start + cmd->data_length - 1; else end = LLONG_MAX; } @@ -680,7 +680,12 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct fd_dev *fd_dev = FD_DEV(dev); loff_t start = cmd->t_task_lba * dev->dev_attrib.block_size; - loff_t end = start + cmd->data_length; + loff_t end; + + if (cmd->data_length) + end = start + cmd->data_length - 1; + else + end = LLONG_MAX; vfs_fsync_range(fd_dev->fd_file, start, end, 1); } @@ -762,7 +767,9 @@ static ssize_t fd_set_configfs_dev_params(struct se_device *dev, fd_dev->fbd_flags |= FBDF_HAS_SIZE; break; case Opt_fd_buffered_io: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; if (arg != 1) { pr_err("bogus fd_buffered_io=%d value\n", arg); ret = -EINVAL; diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index de9cab708f45..e31f42f369ff 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -38,6 +38,7 @@ int se_dev_set_emulate_3pc(struct se_device *, int); int se_dev_set_pi_prot_type(struct se_device *, int); int se_dev_set_pi_prot_format(struct se_device *, int); int se_dev_set_enforce_pr_isids(struct se_device *, int); +int se_dev_set_force_pr_aptpl(struct se_device *, int); int se_dev_set_is_nonrot(struct se_device *, int); int se_dev_set_emulate_rest_reord(struct se_device *dev, int); int se_dev_set_queue_depth(struct se_device *, u32); @@ -46,7 +47,7 @@ int se_dev_set_fabric_max_sectors(struct se_device *, u32); int se_dev_set_optimal_sectors(struct se_device *, u32); int se_dev_set_block_size(struct se_device *, u32); struct se_lun *core_dev_add_lun(struct se_portal_group *, struct se_device *, u32); -int core_dev_del_lun(struct se_portal_group *, u32); +void core_dev_del_lun(struct se_portal_group *, struct se_lun *); struct se_lun *core_get_lun_from_tpg(struct se_portal_group *, u32); struct se_lun_acl *core_dev_init_initiator_node_lun_acl(struct se_portal_group *, struct se_node_acl *, u32, int *); @@ -82,8 +83,7 @@ void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *); struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u32); int core_tpg_add_lun(struct se_portal_group *, struct se_lun *, u32, struct se_device *); -struct se_lun *core_tpg_pre_dellun(struct se_portal_group *, u32 unpacked_lun); -int core_tpg_post_dellun(struct se_portal_group *, struct se_lun *); +void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *); /* target_core_transport.c */ extern struct kmem_cache *se_tmr_req_cache; diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index df357862286e..8c60a1a1ae8d 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -674,8 +674,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( */ spin_lock(&dev->se_port_lock); 
list_for_each_entry_safe(port, port_tmp, &dev->dev_sep_list, sep_list) { - atomic_inc(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&port->sep_tg_pt_ref_cnt); spin_unlock(&dev->se_port_lock); spin_lock_bh(&port->sep_alua_lock); @@ -709,8 +708,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (strcmp(nacl->initiatorname, nacl_tmp->initiatorname)) continue; - atomic_inc(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve_tmp->pr_ref_count); spin_unlock_bh(&port->sep_alua_lock); /* * Grab a configfs group dependency that is released @@ -722,10 +720,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (ret < 0) { pr_err("core_scsi3_lunacl_depend" "_item() failed\n"); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); goto out; } /* @@ -739,10 +735,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( nacl_tmp, deve_tmp, NULL, sa_res_key, all_tg_pt, aptpl); if (!pr_reg_atp) { - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); core_scsi3_lunacl_undepend_item(deve_tmp); goto out; } @@ -754,8 +748,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_unlock_bh(&port->sep_alua_lock); spin_lock(&dev->se_port_lock); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); } spin_unlock(&dev->se_port_lock); @@ -902,6 +895,7 @@ static int __core_scsi3_check_aptpl_registration( spin_lock(&pr_tmpl->aptpl_reg_lock); list_for_each_entry_safe(pr_reg, pr_reg_tmp, &pr_tmpl->aptpl_reg_list, pr_reg_aptpl_list) { + if (!strcmp(pr_reg->pr_iport, i_port) && (pr_reg->pr_res_mapped_lun == deve->mapped_lun) && !(strcmp(pr_reg->pr_tport, t_port)) && @@ -944,10 +938,10 @@ int core_scsi3_check_aptpl_registration( struct se_device *dev, struct se_portal_group *tpg, struct se_lun *lun, - struct se_lun_acl *lun_acl) + struct se_node_acl *nacl, + u32 mapped_lun) { - struct se_node_acl *nacl = lun_acl->se_lun_nacl; - struct se_dev_entry *deve = nacl->device_list[lun_acl->mapped_lun]; + struct se_dev_entry *deve = nacl->device_list[mapped_lun]; if (dev->dev_reservation_flags & DRF_SPC2_RESERVATIONS) return 0; @@ -1109,8 +1103,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (dev->dev_attrib.enforce_pr_isids) continue; } - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1124,8 +1117,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (strcmp(isid, pr_reg->pr_reg_isid)) continue; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1154,8 +1146,7 @@ static struct t10_pr_registration *core_scsi3_locate_pr_reg( static void core_scsi3_put_pr_reg(struct t10_pr_registration *pr_reg) { - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); } static int core_scsi3_check_implicit_release( @@ -1348,8 +1339,7 @@ static void core_scsi3_tpg_undepend_item(struct se_portal_group *tpg) 
configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &tpg->tpg_group.cg_item); - atomic_dec(&tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tpg->tpg_pr_ref_count); } static int core_scsi3_nodeacl_depend_item(struct se_node_acl *nacl) @@ -1368,16 +1358,14 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) struct se_portal_group *tpg = nacl->se_tpg; if (nacl->dynamic_node_acl) { - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); return; } configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &nacl->acl_group.cg_item); - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); } static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) @@ -1407,8 +1395,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) * For nacl->dynamic_node_acl=1 */ if (!lun_acl) { - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); return; } nacl = lun_acl->se_lun_nacl; @@ -1417,8 +1404,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &lun_acl->se_lun_group.cg_item); - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); } static sense_reason_t @@ -1551,15 +1537,13 @@ core_scsi3_decode_spec_i_port( if (!i_str) continue; - atomic_inc(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tmp_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(tmp_tpg)) { pr_err(" core_scsi3_tpg_depend_item()" " for tmp_tpg\n"); - atomic_dec(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tmp_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; } @@ -1571,10 +1555,8 @@ core_scsi3_decode_spec_i_port( spin_lock_irq(&tmp_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl( tmp_tpg, i_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&tmp_tpg->acl_node_lock); if (!dest_node_acl) { @@ -1586,8 +1568,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("configfs_depend_item() failed" " for dest_node_acl->acl_group\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); core_scsi3_tpg_undepend_item(tmp_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; @@ -1646,8 +1627,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item()" " failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); core_scsi3_nodeacl_undepend_item(dest_node_acl); core_scsi3_tpg_undepend_item(dest_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -3167,15 +3147,13 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, if (!dest_tf_ops) continue; - atomic_inc(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dest_se_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(dest_se_tpg)) { pr_err("core_scsi3_tpg_depend_item() failed" " for dest_se_tpg\n"); - atomic_dec(&dest_se_tpg->tpg_pr_ref_count); - 
smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_put_pr_reg; } @@ -3271,10 +3249,8 @@ after_iport_check: spin_lock_irq(&dest_se_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl(dest_se_tpg, initiator_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&dest_se_tpg->acl_node_lock); if (!dest_node_acl) { @@ -3288,8 +3264,7 @@ after_iport_check: if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("core_scsi3_nodeacl_depend_item() for" " dest_node_acl\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); dest_node_acl = NULL; ret = TCM_INVALID_PARAMETER_LIST; goto out; @@ -3313,8 +3288,7 @@ after_iport_check: if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item() failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); dest_se_deve = NULL; ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out; @@ -3497,6 +3471,7 @@ static unsigned long long core_scsi3_extract_reservation_key(unsigned char *cdb) sense_reason_t target_scsi3_emulate_pr_out(struct se_cmd *cmd) { + struct se_device *dev = cmd->se_dev; unsigned char *cdb = &cmd->t_task_cdb[0]; unsigned char *buf; u64 res_key, sa_res_key; @@ -3561,6 +3536,13 @@ target_scsi3_emulate_pr_out(struct se_cmd *cmd) aptpl = (buf[17] & 0x01); unreg = (buf[17] & 0x02); } + /* + * If the backend device has been configured to force APTPL metadata + * write-out, go ahead and propigate aptpl=1 down now. + */ + if (dev->dev_attrib.force_pr_aptpl) + aptpl = 1; + transport_kunmap_data_sg(cmd); buf = NULL; @@ -3803,7 +3785,7 @@ core_scsi3_pri_report_capabilities(struct se_cmd *cmd) if (!buf) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - buf[0] = ((add_len << 8) & 0xff); + buf[0] = ((add_len >> 8) & 0xff); buf[1] = (add_len & 0xff); buf[2] |= 0x10; /* CRH: Compatible Reservation Hanlding bit. 
*/ buf[2] |= 0x08; /* SIP_C: Specify Initiator Ports Capable bit */ @@ -3879,8 +3861,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_tpg = pr_reg->pr_reg_nacl->se_tpg; add_desc_len = 0; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); /* * Determine expected length of $FABRIC_MOD specific @@ -3893,8 +3874,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) pr_warn("SPC-3 PRIN READ_FULL_STATUS ran" " out of buffer: %d\n", cmd->data_length); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); break; } /* @@ -3955,8 +3935,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_nacl, pr_reg, &format_code, &buf[off+4]); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); /* * Set the ADDITIONAL DESCRIPTOR LENGTH */ diff --git a/drivers/target/target_core_pr.h b/drivers/target/target_core_pr.h index 2ee2936fa0bd..749fd7bb7510 100644 --- a/drivers/target/target_core_pr.h +++ b/drivers/target/target_core_pr.h @@ -60,7 +60,7 @@ extern int core_scsi3_alloc_aptpl_registration( unsigned char *, u16, u32, int, int, u8); extern int core_scsi3_check_aptpl_registration(struct se_device *, struct se_portal_group *, struct se_lun *, - struct se_lun_acl *); + struct se_node_acl *, u32); extern void core_scsi3_free_pr_reg_from_nacl(struct se_device *, struct se_node_acl *); extern void core_scsi3_free_all_registrations(struct se_device *); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 70d9f6dabba0..7c8291f0bbbc 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -749,14 +749,18 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, ret = -EINVAL; goto out; } - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_host_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Host ID:" " %d\n", phv->phv_host_id, pdv->pdv_host_id); pdv->pdv_flags |= PDF_HAS_VIRT_HOST_ID; break; case Opt_scsi_channel_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_channel_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Channel" " ID: %d\n", phv->phv_host_id, @@ -764,7 +768,9 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, pdv->pdv_flags |= PDF_HAS_CHANNEL_ID; break; case Opt_scsi_target_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_target_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Target" " ID: %d\n", phv->phv_host_id, @@ -772,7 +778,9 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, pdv->pdv_flags |= PDF_HAS_TARGET_ID; break; case Opt_scsi_lun_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_lun_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI LUN ID:" " %d\n", phv->phv_host_id, pdv->pdv_lun_id); diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index bd78d9235ac6..ebe62afb957d 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -948,7 +948,7 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops) } /* reject any command that we don't have a handler for */ - if (!(cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) && !cmd->execute_cmd) + if (!cmd->execute_cmd) return 
TCM_UNSUPPORTED_SCSI_OPCODE; if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) { diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index f7cd95e8111a..fa5e157db47b 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -64,21 +64,17 @@ int core_tmr_alloc_req( } EXPORT_SYMBOL(core_tmr_alloc_req); -void core_tmr_release_req( - struct se_tmr_req *tmr) +void core_tmr_release_req(struct se_tmr_req *tmr) { struct se_device *dev = tmr->tmr_dev; unsigned long flags; - if (!dev) { - kfree(tmr); - return; + if (dev) { + spin_lock_irqsave(&dev->se_tmr_lock, flags); + list_del(&tmr->tmr_list); + spin_unlock_irqrestore(&dev->se_tmr_lock, flags); } - spin_lock_irqsave(&dev->se_tmr_lock, flags); - list_del(&tmr->tmr_list); - spin_unlock_irqrestore(&dev->se_tmr_lock, flags); - kfree(tmr); } @@ -90,9 +86,8 @@ static void core_tmr_handle_tas_abort( bool remove = true; /* * TASK ABORTED status (TAS) bit support - */ - if ((tmr_nacl && - (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) { + */ + if ((tmr_nacl && (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) { remove = false; transport_send_task_abort(cmd); } @@ -120,13 +115,12 @@ void core_tmr_abort_task( struct se_tmr_req *tmr, struct se_session *se_sess) { - struct se_cmd *se_cmd, *tmp_cmd; + struct se_cmd *se_cmd; unsigned long flags; int ref_tag; spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); - list_for_each_entry_safe(se_cmd, tmp_cmd, - &se_sess->sess_cmd_list, se_cmd_list) { + list_for_each_entry(se_cmd, &se_sess->sess_cmd_list, se_cmd_list) { if (dev != se_cmd->se_dev) continue; diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index be783f717f19..0696de9553d3 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -40,6 +40,7 @@ #include <target/target_core_fabric.h> #include "target_core_internal.h" +#include "target_core_pr.h" extern struct se_device *g_lun0_dev; @@ -166,6 +167,13 @@ void core_tpg_add_node_to_devs( core_enable_device_list_for_node(lun, NULL, lun->unpacked_lun, lun_access, acl, tpg); + /* + * Check to see if there are any existing persistent reservation + * APTPL pre-registrations that need to be enabled for this dynamic + * LUN ACL now.. 
+ */ + core_scsi3_check_aptpl_registration(dev, tpg, lun, acl, + lun->unpacked_lun); spin_lock(&tpg->tpg_lun_lock); } spin_unlock(&tpg->tpg_lun_lock); @@ -335,7 +343,7 @@ void core_tpg_clear_object_luns(struct se_portal_group *tpg) continue; spin_unlock(&tpg->tpg_lun_lock); - core_dev_del_lun(tpg, lun->unpacked_lun); + core_dev_del_lun(tpg, lun); spin_lock(&tpg->tpg_lun_lock); } spin_unlock(&tpg->tpg_lun_lock); @@ -663,13 +671,6 @@ static int core_tpg_setup_virtual_lun0(struct se_portal_group *se_tpg) return 0; } -static void core_tpg_release_virtual_lun0(struct se_portal_group *se_tpg) -{ - struct se_lun *lun = &se_tpg->tpg_virt_lun0; - - core_tpg_post_dellun(se_tpg, lun); -} - int core_tpg_register( struct target_core_fabric_ops *tfo, struct se_wwn *se_wwn, @@ -773,7 +774,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg) spin_unlock_irq(&se_tpg->acl_node_lock); if (se_tpg->se_tpg_type == TRANSPORT_TPG_TYPE_NORMAL) - core_tpg_release_virtual_lun0(se_tpg); + core_tpg_remove_lun(se_tpg, &se_tpg->tpg_virt_lun0); se_tpg->se_tpg_fabric_ptr = NULL; array_free(se_tpg->tpg_lun_list, TRANSPORT_MAX_LUNS_PER_TPG); @@ -838,37 +839,7 @@ int core_tpg_add_lun( return 0; } -struct se_lun *core_tpg_pre_dellun( - struct se_portal_group *tpg, - u32 unpacked_lun) -{ - struct se_lun *lun; - - if (unpacked_lun > (TRANSPORT_MAX_LUNS_PER_TPG-1)) { - pr_err("%s LUN: %u exceeds TRANSPORT_MAX_LUNS_PER_TPG" - "-1: %u for Target Portal Group: %u\n", - tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, - TRANSPORT_MAX_LUNS_PER_TPG-1, - tpg->se_tpg_tfo->tpg_get_tag(tpg)); - return ERR_PTR(-EOVERFLOW); - } - - spin_lock(&tpg->tpg_lun_lock); - lun = tpg->tpg_lun_list[unpacked_lun]; - if (lun->lun_status != TRANSPORT_LUN_STATUS_ACTIVE) { - pr_err("%s Logical Unit Number: %u is not active on" - " Target Portal Group: %u, ignoring request.\n", - tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, - tpg->se_tpg_tfo->tpg_get_tag(tpg)); - spin_unlock(&tpg->tpg_lun_lock); - return ERR_PTR(-ENODEV); - } - spin_unlock(&tpg->tpg_lun_lock); - - return lun; -} - -int core_tpg_post_dellun( +void core_tpg_remove_lun( struct se_portal_group *tpg, struct se_lun *lun) { @@ -882,6 +853,4 @@ int core_tpg_post_dellun( spin_unlock(&tpg->tpg_lun_lock); percpu_ref_exit(&lun->lun_ref); - - return 0; } diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7fa62fc93e0b..9ea0d5f03f7a 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -232,6 +232,10 @@ void transport_subsystem_check_init(void) if (ret != 0) pr_err("Unable to load target_core_pscsi\n"); + ret = request_module("target_core_user"); + if (ret != 0) + pr_err("Unable to load target_core_user\n"); + sub_api_initialized = 1; } @@ -752,8 +756,7 @@ void target_qf_do_work(struct work_struct *work) list_for_each_entry_safe(cmd, cmd_tmp, &qf_cmd_list, se_qf_node) { list_del(&cmd->se_qf_node); - atomic_dec(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_qf_count); pr_debug("Processing %s cmd: %p QUEUE_FULL in work queue" " context: %s\n", cmd->se_tfo->get_fabric_name(), cmd, @@ -1166,7 +1169,6 @@ transport_check_alloc_task_attr(struct se_cmd *cmd) * Dormant to Active status. 
*/ cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id); - smp_mb__after_atomic(); pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n", cmd->se_ordered_id, cmd->sam_task_attr, dev->transport->name); @@ -1722,8 +1724,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) cmd->t_task_cdb[0], cmd->se_ordered_id); return false; case MSG_ORDERED_TAG: - atomic_inc(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_ordered_sync); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, " " se_ordered_id: %u\n", @@ -1740,8 +1741,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) /* * For SIMPLE and UNTAGGED Task Attribute commands */ - atomic_inc(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->simple_cmds); break; } @@ -1845,8 +1845,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) return; if (cmd->sam_task_attr == MSG_SIMPLE_TAG) { - atomic_dec(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->simple_cmds); dev->dev_cur_ordered_id++; pr_debug("Incremented dev->dev_cur_ordered_id: %u for" " SIMPLE: %u\n", dev->dev_cur_ordered_id, @@ -1857,8 +1856,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) " HEAD_OF_QUEUE: %u\n", dev->dev_cur_ordered_id, cmd->se_ordered_id); } else if (cmd->sam_task_attr == MSG_ORDERED_TAG) { - atomic_dec(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_ordered_sync); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:" @@ -1877,8 +1875,7 @@ static void transport_complete_qf(struct se_cmd *cmd) if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) { trace_target_cmd_complete(cmd); ret = cmd->se_tfo->queue_status(cmd); - if (ret) - goto out; + goto out; } switch (cmd->data_direction) { @@ -1916,8 +1913,7 @@ static void transport_handle_queue_full( { spin_lock_irq(&dev->qf_cmd_lock); list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list); - atomic_inc(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_qf_count); spin_unlock_irq(&cmd->se_dev->qf_cmd_lock); schedule_work(&cmd->se_dev->qf_work_queue); @@ -2896,7 +2892,6 @@ void transport_send_task_abort(struct se_cmd *cmd) if (cmd->se_tfo->write_pending_status(cmd) != 0) { cmd->transport_state |= CMD_T_ABORTED; cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS; - smp_mb__after_atomic(); return; } } diff --git a/drivers/target/target_core_ua.c b/drivers/target/target_core_ua.c index 101858e245b3..1738b1646988 100644 --- a/drivers/target/target_core_ua.c +++ b/drivers/target/target_core_ua.c @@ -161,8 +161,7 @@ int core_scsi3_ua_allocate( spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } list_add_tail(&ua->ua_nacl_list, &deve->ua_list); @@ -174,8 +173,7 @@ int core_scsi3_ua_allocate( nacl->se_tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, asc, ascq); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } @@ -189,8 +187,7 @@ void core_scsi3_ua_release_all( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); } @@ -250,8 +247,7 @@ void core_scsi3_ua_for_check_condition( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } 
spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); @@ -309,8 +305,7 @@ int core_scsi3_ua_clear_for_request_sense( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); diff --git a/drivers/target/target_core_ua.h b/drivers/target/target_core_ua.h index be912b36daae..a6b56b364e7a 100644 --- a/drivers/target/target_core_ua.h +++ b/drivers/target/target_core_ua.h @@ -1,4 +1,5 @@ #ifndef TARGET_CORE_UA_H +#define TARGET_CORE_UA_H /* * From spc4r17, Table D.1: ASC and ASCQ Assignement diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c new file mode 100644 index 000000000000..9a1b314f6482 --- /dev/null +++ b/drivers/target/target_core_user.c @@ -0,0 +1,1167 @@ +/* + * Copyright (C) 2013 Shaohua Li <shli@kernel.org> + * Copyright (C) 2014 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/idr.h> +#include <linux/timer.h> +#include <linux/parser.h> +#include <scsi/scsi.h> +#include <scsi/scsi_host.h> +#include <linux/uio_driver.h> +#include <net/genetlink.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/target_core_backend.h> +#include <linux/target_core_user.h> + +/* + * Define a shared-memory interface for LIO to pass SCSI commands and + * data to userspace for processing. This is to allow backends that + * are too complex for in-kernel support to be possible. + * + * It uses the UIO framework to do a lot of the device-creation and + * introspection work for us. + * + * See the .h file for how the ring is laid out. Note that while the + * command ring is defined, the particulars of the data area are + * not. Offset values in the command entry point to other locations + * internal to the mmap()ed area. There is separate space outside the + * command ring for data buffers. This leaves maximum flexibility for + * moving buffer allocations, or even page flipping or other + * allocation techniques, without altering the command ring layout. + * + * SECURITY: + * The user process must be assumed to be malicious. There's no way to + * prevent it breaking the command ring protocol if it wants, but in + * order to prevent other issues we must only ever read *data* from + * the shared memory area, not offsets or sizes. This applies to + * command ring entries as well as the mailbox. Extra code needed for + * this may have a 'UAM' comment. 
+ */ + + +#define TCMU_TIME_OUT (30 * MSEC_PER_SEC) + +#define CMDR_SIZE (16 * 4096) +#define DATA_SIZE (257 * 4096) + +#define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE) + +static struct device *tcmu_root_device; + +struct tcmu_hba { + u32 host_id; +}; + +/* User wants all cmds or just some */ +enum passthru_level { + TCMU_PASS_ALL = 0, + TCMU_PASS_IO, + TCMU_PASS_INVALID, +}; + +#define TCMU_CONFIG_LEN 256 + +struct tcmu_dev { + struct se_device se_dev; + + char *name; + struct se_hba *hba; + +#define TCMU_DEV_BIT_OPEN 0 +#define TCMU_DEV_BIT_BROKEN 1 + unsigned long flags; + enum passthru_level pass_level; + + struct uio_info uio_info; + + struct tcmu_mailbox *mb_addr; + size_t dev_size; + u32 cmdr_size; + u32 cmdr_last_cleaned; + /* Offset of data ring from start of mb */ + size_t data_off; + size_t data_size; + /* Ring head + tail values. */ + /* Must add data_off and mb_addr to get the address */ + size_t data_head; + size_t data_tail; + + wait_queue_head_t wait_cmdr; + /* TODO should this be a mutex? */ + spinlock_t cmdr_lock; + + struct idr commands; + spinlock_t commands_lock; + + struct timer_list timeout; + + char dev_config[TCMU_CONFIG_LEN]; +}; + +#define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev) + +#define CMDR_OFF sizeof(struct tcmu_mailbox) + +struct tcmu_cmd { + struct se_cmd *se_cmd; + struct tcmu_dev *tcmu_dev; + + uint16_t cmd_id; + + /* Can't use se_cmd->data_length when cleaning up expired cmds, because if + cmd has been completed then accessing se_cmd is off limits */ + size_t data_length; + + unsigned long deadline; + +#define TCMU_CMD_BIT_EXPIRED 0 + unsigned long flags; +}; + +static struct kmem_cache *tcmu_cmd_cache; + +/* multicast group */ +enum tcmu_multicast_groups { + TCMU_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group tcmu_mcgrps[] = { + [TCMU_MCGRP_CONFIG] = { .name = "config", }, +}; + +/* Our generic netlink family */ +static struct genl_family tcmu_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "TCM-USER", + .version = 1, + .maxattr = TCMU_ATTR_MAX, + .mcgrps = tcmu_mcgrps, + .n_mcgrps = ARRAY_SIZE(tcmu_mcgrps), +}; + +static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int cmd_id; + + tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_KERNEL); + if (!tcmu_cmd) + return NULL; + + tcmu_cmd->se_cmd = se_cmd; + tcmu_cmd->tcmu_dev = udev; + tcmu_cmd->data_length = se_cmd->data_length; + + tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT); + + idr_preload(GFP_KERNEL); + spin_lock_irq(&udev->commands_lock); + cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0, + USHRT_MAX, GFP_NOWAIT); + spin_unlock_irq(&udev->commands_lock); + idr_preload_end(); + + if (cmd_id < 0) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + tcmu_cmd->cmd_id = cmd_id; + + return tcmu_cmd; +} + +static inline void tcmu_flush_dcache_range(void *vaddr, size_t size) +{ + unsigned long offset = (unsigned long) vaddr & ~PAGE_MASK; + + size = round_up(size+offset, PAGE_SIZE); + vaddr -= offset; + + while (size) { + flush_dcache_page(virt_to_page(vaddr)); + size -= PAGE_SIZE; + } +} + +/* + * Some ring helper functions. We don't assume size is a power of 2 so + * we can't use circ_buf.h. 
+ */ +static inline size_t spc_used(size_t head, size_t tail, size_t size) +{ + int diff = head - tail; + + if (diff >= 0) + return diff; + else + return size + diff; +} + +static inline size_t spc_free(size_t head, size_t tail, size_t size) +{ + /* Keep 1 byte unused or we can't tell full from empty */ + return (size - spc_used(head, tail, size) - 1); +} + +static inline size_t head_to_end(size_t head, size_t size) +{ + return size - head; +} + +#define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size) + +/* + * We can't queue a command until we have space available on the cmd ring *and* space + * space avail on the data ring. + * + * Called with ring lock held. + */ +static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed) +{ + struct tcmu_mailbox *mb = udev->mb_addr; + size_t space; + u32 cmd_head; + size_t cmd_needed; + + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + + /* + * If cmd end-of-ring space is too small then we need space for a NOP plus + * original cmd - cmds are internally contiguous. + */ + if (head_to_end(cmd_head, udev->cmdr_size) >= cmd_size) + cmd_needed = cmd_size; + else + cmd_needed = cmd_size + head_to_end(cmd_head, udev->cmdr_size); + + space = spc_free(cmd_head, udev->cmdr_last_cleaned, udev->cmdr_size); + if (space < cmd_needed) { + pr_debug("no cmd space: %u %u %u\n", cmd_head, + udev->cmdr_last_cleaned, udev->cmdr_size); + return false; + } + + space = spc_free(udev->data_head, udev->data_tail, udev->data_size); + if (space < data_needed) { + pr_debug("no data space: %zu %zu %zu\n", udev->data_head, + udev->data_tail, udev->data_size); + return false; + } + + return true; +} + +static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t base_command_size, command_size; + struct tcmu_mailbox *mb; + struct tcmu_cmd_entry *entry; + int i; + struct scatterlist *sg; + struct iovec *iov; + int iov_cnt = 0; + uint32_t cmd_head; + uint64_t cdb_off; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + return -EINVAL; + + /* + * Must be a certain minimum size for response sense info, but + * also may be larger if the iov array is large. + * + * iovs = sgl_nents+1, for end-of-ring case, plus another 1 + * b/c size == offsetof one-past-element. 
+ */ + base_command_size = max(offsetof(struct tcmu_cmd_entry, + req.iov[se_cmd->t_data_nents + 2]), + sizeof(struct tcmu_cmd_entry)); + command_size = base_command_size + + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); + + WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + + spin_lock_irq(&udev->cmdr_lock); + + mb = udev->mb_addr; + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + if ((command_size > (udev->cmdr_size / 2)) + || tcmu_cmd->data_length > (udev->data_size - 1)) + pr_warn("TCMU: Request of size %zu/%zu may be too big for %u/%zu " + "cmd/data ring buffers\n", command_size, tcmu_cmd->data_length, + udev->cmdr_size, udev->data_size); + + while (!is_ring_space_avail(udev, command_size, tcmu_cmd->data_length)) { + int ret; + DEFINE_WAIT(__wait); + + prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE); + + pr_debug("sleeping for ring space\n"); + spin_unlock_irq(&udev->cmdr_lock); + ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); + finish_wait(&udev->wait_cmdr, &__wait); + if (!ret) { + pr_warn("tcmu: command timed out\n"); + return -ETIMEDOUT; + } + + spin_lock_irq(&udev->cmdr_lock); + + /* We dropped cmdr_lock, cmd_head is stale */ + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + } + + /* Insert a PAD if end-of-ring space is too small */ + if (head_to_end(cmd_head, udev->cmdr_size) < command_size) { + size_t pad_size = head_to_end(cmd_head, udev->cmdr_size); + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); + tcmu_hdr_set_len(&entry->hdr, pad_size); + + UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + WARN_ON(cmd_head != 0); + } + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_CMD); + tcmu_hdr_set_len(&entry->hdr, command_size); + entry->cmd_id = tcmu_cmd->cmd_id; + + /* + * Fix up iovecs, and handle if allocation in data ring wrapped. + */ + iov = &entry->req.iov[0]; + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_head, udev->data_size)); + void *from = kmap_atomic(sg_page(sg)) + sg->offset; + void *to = (void *) mb + udev->data_off + udev->data_head; + + if (tcmu_cmd->se_cmd->data_direction == DMA_TO_DEVICE) { + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + /* Even iov_base is relative to mb_addr */ + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + + /* Uh oh, we wrapped the buffer. Must split sg across 2 iovs. */ + if (sg->length != copy_bytes) { + from += copy_bytes; + copy_bytes = sg->length - copy_bytes; + + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + + if (se_cmd->data_direction == DMA_TO_DEVICE) { + to = (void *) mb + udev->data_off + udev->data_head; + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + } + + kunmap_atomic(from); + } + entry->req.iov_cnt = iov_cnt; + + /* All offsets relative to mb_addr, not start of entry! 
*/ + cdb_off = CMDR_OFF + cmd_head + base_command_size; + memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb)); + entry->req.cdb_off = cdb_off; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + spin_unlock_irq(&udev->cmdr_lock); + + /* TODO: only if FLUSH and FUA? */ + uio_event_notify(&udev->uio_info); + + mod_timer(&udev->timeout, + round_jiffies_up(jiffies + msecs_to_jiffies(TCMU_TIME_OUT))); + + return 0; +} + +static int tcmu_queue_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int ret; + + tcmu_cmd = tcmu_alloc_cmd(se_cmd); + if (!tcmu_cmd) + return -ENOMEM; + + ret = tcmu_queue_cmd_ring(tcmu_cmd); + if (ret < 0) { + pr_err("TCMU: Could not queue command\n"); + spin_lock_irq(&udev->commands_lock); + idr_remove(&udev->commands, tcmu_cmd->cmd_id); + spin_unlock_irq(&udev->commands_lock); + + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + } + + return ret; +} + +static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) +{ + struct se_cmd *se_cmd = cmd->se_cmd; + struct tcmu_dev *udev = cmd->tcmu_dev; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + /* cmd has been completed already from timeout, just reclaim data + ring space */ + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + return; + } + + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { + memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, + se_cmd->scsi_sense_length); + + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } + else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + struct scatterlist *sg; + int i; + + /* It'd be easier to look at entry's iovec again, but UAM */ + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes; + void *to; + void *from; + + copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_tail, udev->data_size)); + + to = kmap_atomic(sg_page(sg)) + sg->offset; + WARN_ON(sg->length + sg->offset > PAGE_SIZE); + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + + /* Uh oh, wrapped the data buffer for this sg's data */ + if (sg->length != copy_bytes) { + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + WARN_ON(udev->data_tail); + to += copy_bytes; + copy_bytes = sg->length - copy_bytes; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + } + + kunmap_atomic(to); + } + + } else if (se_cmd->data_direction == DMA_TO_DEVICE) { + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } else { + pr_warn("TCMU: data direction was %d!\n", se_cmd->data_direction); + } + + target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); +} + +static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) +{ + struct tcmu_mailbox *mb; + LIST_HEAD(cpl_cmds); + unsigned long flags; + int handled = 0; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { + pr_err("ring broken, not handling completions\n"); + return 0; + } + + spin_lock_irqsave(&udev->cmdr_lock, flags); + + mb = udev->mb_addr; + tcmu_flush_dcache_range(mb, 
sizeof(*mb)); + + while (udev->cmdr_last_cleaned != ACCESS_ONCE(mb->cmd_tail)) { + + struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; + struct tcmu_cmd *cmd; + + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + if (tcmu_hdr_get_op(&entry->hdr) == TCMU_OP_PAD) { + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + continue; + } + WARN_ON(tcmu_hdr_get_op(&entry->hdr) != TCMU_OP_CMD); + + spin_lock(&udev->commands_lock); + cmd = idr_find(&udev->commands, entry->cmd_id); + if (cmd) + idr_remove(&udev->commands, cmd->cmd_id); + spin_unlock(&udev->commands_lock); + + if (!cmd) { + pr_err("cmd_id not found, ring is broken\n"); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + break; + } + + tcmu_handle_completion(cmd, entry); + + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + + handled++; + } + + if (mb->cmd_tail == mb->cmd_head) + del_timer(&udev->timeout); /* no more pending cmds */ + + spin_unlock_irqrestore(&udev->cmdr_lock, flags); + + wake_up(&udev->wait_cmdr); + + return handled; +} + +static int tcmu_check_expired_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + + if (!time_after(cmd->deadline, jiffies)) + return 0; + + set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); + target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); + + return 0; +} + +static void tcmu_device_timedout(unsigned long data) +{ + struct tcmu_dev *udev = (struct tcmu_dev *)data; + unsigned long flags; + int handled; + + handled = tcmu_handle_completions(udev); + + pr_warn("%d completions handled from timeout\n", handled); + + spin_lock_irqsave(&udev->commands_lock, flags); + idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL); + spin_unlock_irqrestore(&udev->commands_lock, flags); + + /* + * We don't need to wakeup threads on wait_cmdr since they have their + * own timeout. + */ +} + +static int tcmu_attach_hba(struct se_hba *hba, u32 host_id) +{ + struct tcmu_hba *tcmu_hba; + + tcmu_hba = kzalloc(sizeof(struct tcmu_hba), GFP_KERNEL); + if (!tcmu_hba) + return -ENOMEM; + + tcmu_hba->host_id = host_id; + hba->hba_ptr = tcmu_hba; + + return 0; +} + +static void tcmu_detach_hba(struct se_hba *hba) +{ + kfree(hba->hba_ptr); + hba->hba_ptr = NULL; +} + +static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) +{ + struct tcmu_dev *udev; + + udev = kzalloc(sizeof(struct tcmu_dev), GFP_KERNEL); + if (!udev) + return NULL; + + udev->name = kstrdup(name, GFP_KERNEL); + if (!udev->name) { + kfree(udev); + return NULL; + } + + udev->hba = hba; + + init_waitqueue_head(&udev->wait_cmdr); + spin_lock_init(&udev->cmdr_lock); + + idr_init(&udev->commands); + spin_lock_init(&udev->commands_lock); + + setup_timer(&udev->timeout, tcmu_device_timedout, + (unsigned long)udev); + + udev->pass_level = TCMU_PASS_ALL; + + return &udev->se_dev; +} + +static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) +{ + struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info); + + tcmu_handle_completions(tcmu_dev); + + return 0; +} + +/* + * mmap code from uio.c. Copied here because we want to hook mmap() + * and this stuff must come along. 
+ */ +static int tcmu_find_mem_index(struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + + if (vma->vm_pgoff < MAX_UIO_MAPS) { + if (info->mem[vma->vm_pgoff].size == 0) + return -1; + return (int)vma->vm_pgoff; + } + return -1; +} + +static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + struct page *page; + unsigned long offset; + void *addr; + + int mi = tcmu_find_mem_index(vma); + if (mi < 0) + return VM_FAULT_SIGBUS; + + /* + * We need to subtract mi because userspace uses offset = N*PAGE_SIZE + * to use mem[N]. + */ + offset = (vmf->pgoff - mi) << PAGE_SHIFT; + + addr = (void *)(unsigned long)info->mem[mi].addr + offset; + if (info->mem[mi].memtype == UIO_MEM_LOGICAL) + page = virt_to_page(addr); + else + page = vmalloc_to_page(addr); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct tcmu_vm_ops = { + .fault = tcmu_vma_fault, +}; + +static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &tcmu_vm_ops; + + vma->vm_private_data = udev; + + /* Ensure the mmap is exactly the right size */ + if (vma_pages(vma) != (TCMU_RING_SIZE >> PAGE_SHIFT)) + return -EINVAL; + + return 0; +} + +static int tcmu_open(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + /* O_EXCL not supported for char devs, so fake it? */ + if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags)) + return -EBUSY; + + pr_debug("open\n"); + + return 0; +} + +static int tcmu_release(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags); + + pr_debug("close\n"); + + return 0; +} + +static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int minor) +{ + struct sk_buff *skb; + void *msg_header; + int ret = -ENOMEM; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return ret; + + msg_header = genlmsg_put(skb, 0, 0, &tcmu_genl_family, 0, cmd); + if (!msg_header) + goto free_skb; + + ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + if (ret < 0) + goto free_skb; + + ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + if (ret < 0) + goto free_skb; + + ret = genlmsg_end(skb, msg_header); + if (ret < 0) + goto free_skb; + + ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, + TCMU_MCGRP_CONFIG, GFP_KERNEL); + + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + + return ret; +free_skb: + nlmsg_free(skb); + return ret; +} + +static int tcmu_configure_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + struct tcmu_hba *hba = udev->hba->hba_ptr; + struct uio_info *info; + struct tcmu_mailbox *mb; + size_t size; + size_t used; + int ret = 0; + char *str; + + info = &udev->uio_info; + + size = snprintf(NULL, 0, "tcm-user/%u/%s/%s", hba->host_id, udev->name, + udev->dev_config); + size += 1; /* for \0 */ + str = kmalloc(size, GFP_KERNEL); + if (!str) + return -ENOMEM; + + used = snprintf(str, size, "tcm-user/%u/%s", hba->host_id, udev->name); + + if (udev->dev_config[0]) + snprintf(str + used, size - used, "/%s", udev->dev_config); + + info->name = str; + + udev->mb_addr = 
vzalloc(TCMU_RING_SIZE); + if (!udev->mb_addr) { + ret = -ENOMEM; + goto err_vzalloc; + } + + /* mailbox fits in first part of CMDR space */ + udev->cmdr_size = CMDR_SIZE - CMDR_OFF; + udev->data_off = CMDR_SIZE; + udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; + + mb = udev->mb_addr; + mb->version = 1; + mb->cmdr_off = CMDR_OFF; + mb->cmdr_size = udev->cmdr_size; + + WARN_ON(!PAGE_ALIGNED(udev->data_off)); + WARN_ON(udev->data_size % PAGE_SIZE); + + info->version = "1"; + + info->mem[0].name = "tcm-user command & data buffer"; + info->mem[0].addr = (phys_addr_t) udev->mb_addr; + info->mem[0].size = TCMU_RING_SIZE; + info->mem[0].memtype = UIO_MEM_VIRTUAL; + + info->irqcontrol = tcmu_irqcontrol; + info->irq = UIO_IRQ_CUSTOM; + + info->mmap = tcmu_mmap; + info->open = tcmu_open; + info->release = tcmu_release; + + ret = uio_register_device(tcmu_root_device, info); + if (ret) + goto err_register; + + /* Other attributes can be configured in userspace */ + dev->dev_attrib.hw_block_size = 512; + dev->dev_attrib.hw_max_sectors = 128; + dev->dev_attrib.hw_queue_depth = 128; + + ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + if (ret) + goto err_netlink; + + return 0; + +err_netlink: + uio_unregister_device(&udev->uio_info); +err_register: + vfree(udev->mb_addr); +err_vzalloc: + kfree(info->name); + + return ret; +} + +static int tcmu_check_pending_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + return -EINVAL; +} + +static void tcmu_free_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + int i; + + del_timer_sync(&udev->timeout); + + vfree(udev->mb_addr); + + /* Upper layer should drain all requests before calling this */ + spin_lock_irq(&udev->commands_lock); + i = idr_for_each(&udev->commands, tcmu_check_pending_cmd, NULL); + idr_destroy(&udev->commands); + spin_unlock_irq(&udev->commands_lock); + WARN_ON(i); + + /* Device was configured */ + if (udev->uio_info.uio_dev) { + tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + + uio_unregister_device(&udev->uio_info); + kfree(udev->uio_info.name); + kfree(udev->name); + } + + kfree(udev); +} + +enum { + Opt_dev_config, Opt_dev_size, Opt_err, Opt_pass_level, +}; + +static match_table_t tokens = { + {Opt_dev_config, "dev_config=%s"}, + {Opt_dev_size, "dev_size=%u"}, + {Opt_pass_level, "pass_level=%u"}, + {Opt_err, NULL} +}; + +static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, + const char *page, ssize_t count) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + char *orig, *ptr, *opts, *arg_p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0, token; + int arg; + + opts = kstrdup(page, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + orig = opts; + + while ((ptr = strsep(&opts, ",\n")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, tokens, args); + switch (token) { + case Opt_dev_config: + if (match_strlcpy(udev->dev_config, &args[0], + TCMU_CONFIG_LEN) == 0) { + ret = -EINVAL; + break; + } + pr_debug("TCMU: Referencing Path: %s\n", udev->dev_config); + break; + case Opt_dev_size: + arg_p = match_strdup(&args[0]); + if (!arg_p) { + ret = -ENOMEM; + break; + } + ret = kstrtoul(arg_p, 0, (unsigned long *) &udev->dev_size); + kfree(arg_p); + if (ret < 0) + pr_err("kstrtoul() failed for dev_size=\n"); + break; + case Opt_pass_level: + match_int(args, &arg); + if (arg >= TCMU_PASS_INVALID) { + pr_warn("TCMU: Invalid 
pass_level: %d\n", arg); + break; + } + + pr_debug("TCMU: Setting pass_level to %d\n", arg); + udev->pass_level = arg; + break; + default: + break; + } + } + + kfree(orig); + return (!ret) ? count : ret; +} + +static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + ssize_t bl = 0; + + bl = sprintf(b + bl, "Config: %s ", + udev->dev_config[0] ? udev->dev_config : "NULL"); + bl += sprintf(b + bl, "Size: %zu PassLevel: %u\n", + udev->dev_size, udev->pass_level); + + return bl; +} + +static sector_t tcmu_get_blocks(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + + return div_u64(udev->dev_size - dev->dev_attrib.block_size, + dev->dev_attrib.block_size); +} + +static sense_reason_t +tcmu_execute_rw(struct se_cmd *se_cmd, struct scatterlist *sgl, u32 sgl_nents, + enum dma_data_direction data_direction) +{ + int ret; + + ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static sense_reason_t +tcmu_pass_op(struct se_cmd *se_cmd) +{ + int ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static struct sbc_ops tcmu_sbc_ops = { + .execute_rw = tcmu_execute_rw, + .execute_sync_cache = tcmu_pass_op, + .execute_write_same = tcmu_pass_op, + .execute_write_same_unmap = tcmu_pass_op, + .execute_unmap = tcmu_pass_op, +}; + +static sense_reason_t +tcmu_parse_cdb(struct se_cmd *cmd) +{ + unsigned char *cdb = cmd->t_task_cdb; + struct tcmu_dev *udev = TCMU_DEV(cmd->se_dev); + sense_reason_t ret; + + switch (udev->pass_level) { + case TCMU_PASS_ALL: + /* We're just like pscsi, then */ + /* + * For REPORT LUNS we always need to emulate the response, for everything + * else, pass it up. 
+ */ + switch (cdb[0]) { + case REPORT_LUNS: + cmd->execute_cmd = spc_emulate_report_luns; + break; + case READ_6: + case READ_10: + case READ_12: + case READ_16: + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + case WRITE_VERIFY: + cmd->se_cmd_flags |= SCF_SCSI_DATA_CDB; + /* FALLTHROUGH */ + default: + cmd->execute_cmd = tcmu_pass_op; + } + ret = TCM_NO_SENSE; + break; + case TCMU_PASS_IO: + ret = sbc_parse_cdb(cmd, &tcmu_sbc_ops); + break; + default: + pr_err("Unknown tcm-user pass level %d\n", udev->pass_level); + ret = TCM_CHECK_CONDITION_ABORT_CMD; + } + + return ret; +} + +static struct se_subsystem_api tcmu_template = { + .name = "user", + .inquiry_prod = "USER", + .inquiry_rev = TCMU_VERSION, + .owner = THIS_MODULE, + .transport_type = TRANSPORT_PLUGIN_VHBA_PDEV, + .attach_hba = tcmu_attach_hba, + .detach_hba = tcmu_detach_hba, + .alloc_device = tcmu_alloc_device, + .configure_device = tcmu_configure_device, + .free_device = tcmu_free_device, + .parse_cdb = tcmu_parse_cdb, + .set_configfs_dev_params = tcmu_set_configfs_dev_params, + .show_configfs_dev_params = tcmu_show_configfs_dev_params, + .get_device_type = sbc_get_device_type, + .get_blocks = tcmu_get_blocks, +}; + +static int __init tcmu_module_init(void) +{ + int ret; + + BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0); + + tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache", + sizeof(struct tcmu_cmd), + __alignof__(struct tcmu_cmd), + 0, NULL); + if (!tcmu_cmd_cache) + return -ENOMEM; + + tcmu_root_device = root_device_register("tcm_user"); + if (IS_ERR(tcmu_root_device)) { + ret = PTR_ERR(tcmu_root_device); + goto out_free_cache; + } + + ret = genl_register_family(&tcmu_genl_family); + if (ret < 0) { + goto out_unreg_device; + } + + ret = transport_subsystem_register(&tcmu_template); + if (ret) + goto out_unreg_genl; + + return 0; + +out_unreg_genl: + genl_unregister_family(&tcmu_genl_family); +out_unreg_device: + root_device_unregister(tcmu_root_device); +out_free_cache: + kmem_cache_destroy(tcmu_cmd_cache); + + return ret; +} + +static void __exit tcmu_module_exit(void) +{ + transport_subsystem_release(&tcmu_template); + genl_unregister_family(&tcmu_genl_family); + root_device_unregister(tcmu_root_device); + kmem_cache_destroy(tcmu_cmd_cache); +} + +MODULE_DESCRIPTION("TCM USER subsystem plugin"); +MODULE_AUTHOR("Shaohua Li <shli@kernel.org>"); +MODULE_AUTHOR("Andy Grover <agrover@redhat.com>"); +MODULE_LICENSE("GPL"); + +module_init(tcmu_module_init); +module_exit(tcmu_module_exit); diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c index 21ce50880c79..ccee7e332a4d 100644 --- a/drivers/target/tcm_fc/tfc_sess.c +++ b/drivers/target/tcm_fc/tfc_sess.c @@ -98,7 +98,7 @@ static void ft_tport_delete(struct ft_tport *tport) ft_sess_delete_all(tport); lport = tport->lport; BUG_ON(tport != lport->prov[FC_TYPE_FCP]); - rcu_assign_pointer(lport->prov[FC_TYPE_FCP], NULL); + RCU_INIT_POINTER(lport->prov[FC_TYPE_FCP], NULL); tpg = tport->tpg; if (tpg) { diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a673e5b6a2e0..60fa6278fbce 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -28,18 +28,6 @@ #define UIO_MAX_DEVICES (1U << MINORBITS) -struct uio_device { - struct module *owner; - struct device *dev; - int minor; - atomic_t event; - struct fasync_struct *async_queue; - wait_queue_head_t wait; - struct uio_info *info; - struct kobject *map_dir; - struct kobject *portio_dir; -}; - static int uio_major; static struct cdev *uio_cdev; static 
DEFINE_IDR(uio_idr); diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 1ad4724458de..baa81718d985 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -63,7 +63,17 @@ struct uio_port { #define MAX_UIO_PORT_REGIONS 5 -struct uio_device; +struct uio_device { + struct module *owner; + struct device *dev; + int minor; + atomic_t event; + struct fasync_struct *async_queue; + wait_queue_head_t wait; + struct uio_info *info; + struct kobject *map_dir; + struct kobject *portio_dir; +}; /** * struct uio_info - UIO device capabilities diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 9ec9864ecf38..23c518a0340c 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -108,6 +108,8 @@ #define DA_EMULATE_ALUA 0 /* Enforce SCSI Initiator Port TransportID with 'ISID' for PR */ #define DA_ENFORCE_PR_ISIDS 1 +/* Force SPC-3 PR Activate Persistence across Target Power Loss */ +#define DA_FORCE_PR_APTPL 0 #define DA_STATUS_MAX_SECTORS_MIN 16 #define DA_STATUS_MAX_SECTORS_MAX 8192 /* By default don't report non-rotating (solid state) medium */ @@ -680,6 +682,7 @@ struct se_dev_attrib { enum target_prot_type pi_prot_type; enum target_prot_type hw_pi_prot_type; int enforce_pr_isids; + int force_pr_aptpl; int is_nonrot; int emulate_rest_reord; u32 hw_block_size; @@ -903,4 +906,18 @@ struct se_wwn { struct config_group fabric_stat_group; }; +static inline void atomic_inc_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_inc(v); + smp_mb__after_atomic(); +} + +static inline void atomic_dec_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_dec(v); + smp_mb__after_atomic(); +} + #endif /* TARGET_CORE_BASE_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6cad97485bad..b70237e8bc37 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -374,6 +374,7 @@ header-y += swab.h header-y += synclink.h header-y += sysctl.h header-y += sysinfo.h +header-y += target_core_user.h header-y += taskstats.h header-y += tcp.h header-y += tcp_metrics.h diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h new file mode 100644 index 000000000000..7dcfbe6771b1 --- /dev/null +++ b/include/uapi/linux/target_core_user.h @@ -0,0 +1,142 @@ +#ifndef __TARGET_CORE_USER_H +#define __TARGET_CORE_USER_H + +/* This header will be used by application too */ + +#include <linux/types.h> +#include <linux/uio.h> + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +#define TCMU_VERSION "1.0" + +/* + * Ring Design + * ----------- + * + * The mmaped area is divided into three parts: + * 1) The mailbox (struct tcmu_mailbox, below) + * 2) The command ring + * 3) Everything beyond the command ring (data) + * + * The mailbox tells userspace the offset of the command ring from the + * start of the shared memory region, and how big the command ring is. + * + * The kernel passes SCSI commands to userspace by putting a struct + * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking + * userspace via uio's interrupt mechanism. + * + * tcmu_cmd_entry contains a header. If the header type is PAD, + * userspace should skip hdr->length bytes (mod cmdr_size) to find the + * next cmd_entry. + * + * Otherwise, the entry will contain offsets into the mmaped area that + * contain the cdb and data buffers -- the latter accessible via the + * iov array. iov addresses are also offsets into the shared area. 
+ * + * When userspace is completed handling the command, set + * entry->rsp.scsi_status, fill in rsp.sense_buffer if appropriate, + * and also set mailbox->cmd_tail equal to the old cmd_tail plus + * hdr->length, mod cmdr_size. If cmd_tail doesn't equal cmd_head, it + * should process the next packet the same way, and so on. + */ + +#define TCMU_MAILBOX_VERSION 1 +#define ALIGN_SIZE 64 /* Should be enough for most CPUs */ + +struct tcmu_mailbox { + __u16 version; + __u16 flags; + __u32 cmdr_off; + __u32 cmdr_size; + + __u32 cmd_head; + + /* Updated by user. On its own cacheline */ + __u32 cmd_tail __attribute__((__aligned__(ALIGN_SIZE))); + +} __packed; + +enum tcmu_opcode { + TCMU_OP_PAD = 0, + TCMU_OP_CMD, +}; + +/* + * Only a few opcodes, and length is 8-byte aligned, so use low bits for opcode. + */ +struct tcmu_cmd_entry_hdr { + __u32 len_op; +} __packed; + +#define TCMU_OP_MASK 0x7 + +static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) +{ + hdr->len_op &= ~TCMU_OP_MASK; + hdr->len_op |= (op & TCMU_OP_MASK); +} + +static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & ~TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) +{ + hdr->len_op &= TCMU_OP_MASK; + hdr->len_op |= len; +} + +/* Currently the same as SCSI_SENSE_BUFFERSIZE */ +#define TCMU_SENSE_BUFFERSIZE 96 + +struct tcmu_cmd_entry { + struct tcmu_cmd_entry_hdr hdr; + + uint16_t cmd_id; + uint16_t __pad1; + + union { + struct { + uint64_t cdb_off; + uint64_t iov_cnt; + struct iovec iov[0]; + } req; + struct { + uint8_t scsi_status; + uint8_t __pad1; + uint16_t __pad2; + uint32_t __pad3; + char sense_buffer[TCMU_SENSE_BUFFERSIZE]; + } rsp; + }; + +} __packed; + +#define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) + +enum tcmu_genl_cmd { + TCMU_CMD_UNSPEC, + TCMU_CMD_ADDED_DEVICE, + TCMU_CMD_REMOVED_DEVICE, + __TCMU_CMD_MAX, +}; +#define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) + +enum tcmu_genl_attr { + TCMU_ATTR_UNSPEC, + TCMU_ATTR_DEVICE, + TCMU_ATTR_MINOR, + __TCMU_ATTR_MAX, +}; +#define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) + +#endif |
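For a sense of what sits on the other side of this interface, below is a minimal, purely illustrative sketch of a userspace consumer of the command ring defined by the uapi header above. The function name poll_ring, its arguments (the mmap()ed uio region and the open uio file descriptor), and the unconditional GOOD status are assumptions made for the example, not part of the patch; a real handler would also use memory barriers around cmd_head/cmd_tail accesses, copy data in or out through req.iov[], and fill in rsp.sense_buffer on CHECK CONDITION.

/*
 * Illustrative sketch only, assuming the new <linux/target_core_user.h>
 * header is installed and the caller has already mmap()ed the whole uio
 * region (mailbox, command ring and data area) and opened /dev/uioN.
 * poll_ring() walks every outstanding entry, completes it with GOOD
 * status, and signals the kernel.
 */
#include <stdint.h>
#include <stddef.h>
#include <unistd.h>
#include <linux/target_core_user.h>

static int poll_ring(void *map, int uio_fd)
{
	struct tcmu_mailbox *mb = map;
	char *cmdr = (char *)map + mb->cmdr_off;
	uint32_t tail = mb->cmd_tail;
	uint32_t kick = 0;
	int handled = 0;

	if (mb->version != TCMU_MAILBOX_VERSION)
		return -1;		/* unknown layout: abort */

	/* cmd_head == cmd_tail means the ring is empty */
	while (tail != mb->cmd_head) {
		struct tcmu_cmd_entry *ent = (void *)(cmdr + tail);
		uint32_t len = tcmu_hdr_get_len(&ent->hdr);

		if (tcmu_hdr_get_op(&ent->hdr) == TCMU_OP_CMD) {
			/* cdb_off is an offset from the region start, not the entry */
			uint8_t *cdb = (uint8_t *)map + ent->req.cdb_off;

			(void)cdb;	/* a real handler acts on the CDB and req.iov[] here */
			ent->rsp.scsi_status = 0;	/* SAM GOOD */
			handled++;
		}
		/* PAD (or unknown) entries are simply skipped by their length */

		tail = (tail + len) % mb->cmdr_size;
		mb->cmd_tail = tail;	/* a real handler needs a barrier before this */
	}

	/* a 4-byte write on the uio fd tells the kernel to reap completions */
	if (write(uio_fd, &kick, 4) != 4)
		return -1;

	return handled;
}

Note that every address exchanged over the ring (cdb_off, each iov_base) is an offset from the start of the shared region, which is why the sketch adds them to the mmap() base instead of dereferencing them directly.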