sparc64: Oracle Data Analytics Accelerator (DAX) driver
author     Sanath Kumar <sanath.s.kumar@oracle.com>
           Wed, 29 Mar 2017 17:46:42 +0000 (12:46 -0500)
committer  Chuck Anderson <chuck.anderson@oracle.com>
           Mon, 24 Apr 2017 04:43:25 +0000 (21:43 -0700)
Orabug: 23072809

DAX is a coprocessor which resides on the SPARC M7 processor chip, and
has direct access to the CPU's L3 caches as well as physical
memory. It can perform several operations on data streams with
various input and output formats.  The driver is merely a transport
mechanism and does not have knowledge of the various opcodes and data
formats. A user space library provides high level services and
translates these into low level commands which are then passed into
the driver and subsequently the hypervisor and the coprocessor.

See Documentation/sparc/dax.txt for more details.

Reviewed-by: Jonathan Helman <jonathan.helman@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: David Aldridge <david.j.aldridge@oracle.com>
Reviewed-by: Stanislav Kholmanskikh <stanislav.kholmanskikh@oracle.com>
Signed-off-by: Rob Gardner <rob.gardner@oracle.com>
Signed-off-by: Sanath Kumar <sanath.s.kumar@oracle.com>
Signed-off-by: Allen Pais <allen.pais@oracle.com>
14 files changed:
Documentation/sparc/dax.txt [new file with mode: 0644]
arch/sparc/Kconfig
arch/sparc/Makefile
arch/sparc/dax/Makefile [new file with mode: 0644]
arch/sparc/dax/ccb.h [new file with mode: 0644]
arch/sparc/dax/dax_bip.c [new file with mode: 0644]
arch/sparc/dax/dax_debugfs.c [new file with mode: 0644]
arch/sparc/dax/dax_impl.h [new file with mode: 0644]
arch/sparc/dax/dax_main.c [new file with mode: 0644]
arch/sparc/dax/dax_misc.c [new file with mode: 0644]
arch/sparc/dax/dax_mm.c [new file with mode: 0644]
arch/sparc/dax/dax_perf.c [new file with mode: 0644]
arch/sparc/dax/sys_dax.h [new file with mode: 0644]
arch/sparc/include/asm/hypervisor.h

diff --git a/Documentation/sparc/dax.txt b/Documentation/sparc/dax.txt
new file mode 100644 (file)
index 0000000..5d7f4fb
--- /dev/null
@@ -0,0 +1,256 @@
+Oracle Data Analytics Accelerator (DAX)
+---------------------------------------
+
+DAX is a coprocessor which resides on the SPARC M7 processor chip, and has
+direct access to the CPU's L3 caches as well as physical memory. It performs a
+handful of operations on data streams with various input and output formats.
+The driver is merely a transport mechanism and does not have knowledge of the
+various opcodes and data formats. A user space library provides high level
+services and translates these into low level commands which are then passed
+into the driver and subsequently the hypervisor and the coprocessor. This
+document describes the general flow of the driver, its structures, and its
+programmatic interface. It should be emphasized though that this interface is
+not intended for general use.  All applications using DAX should go through the
+user libraries.
+
+The DAX is documented in 3 places, though all are internal-only:
+ * Hypervisor API Wiki
+ * Virtual Machine Spec
+ * M7 PRM
+
+High Level Overview
+-------------------
+
+A coprocessor request is described by a Command Control Block (CCB). The CCB
+contains an opcode and various parameters. The opcode specifies what operation
+is to be done, and the parameters specify options, flags, sizes, and addresses.
+The CCB (or an array of CCBs) is passed to the Hypervisor, which handles
+queueing and scheduling of requests to the available coprocessor execution
+units. A status code returned indicates if the request was submitted
+successfully or if there was an error.  One of the addresses given in each CCB
+is a pointer to a "completion area", which is a 128 byte memory block that is
+written by the coprocessor to provide execution status. No interrupt is
+generated upon completion; the completion area must be polled by software to
+find out when a transaction has finished, but the M7 processor provides a
+mechanism to pause the virtual processor until the completion status has been
+updated by the coprocessor. A key feature of the DAX coprocessor design is that
+after a request is submitted, the kernel is no longer involved in the
+processing of it.  The polling is done at the user level, which results in
+almost zero latency between completion of a request and resumption of execution
+of the requesting thread.
+
+
+Addressing Memory
+-----------------
+
+The kernel does not have access to physical memory in the Sun4v architecture,
+as there is an additional level of memory virtualization present. This
+intermediate level is called "real" memory, and the kernel treats this as
+if it were physical.  The Hypervisor handles the translations between real
+memory and physical so that each logical domain (LDOM) can have a partition
+of physical memory that is isolated from that of other LDOMs.  When the
+kernel sets up a virtual mapping, it is a translation from a virtual
+address to a real address.
+
+The DAX coprocessor can only operate on _physical memory_, so before a request
+can be fed to the coprocessor, all the addresses in a CCB must be converted
+into physical addresses. The kernel cannot do this since it has no visibility
+into physical addresses. So a CCB may contain either the virtual or real
+addresses of the buffers or a combination of them. An "address type" field is
+available for each address that may be given in the CCB. In all cases, the
+Hypervisor will translate all the addresses to physical before dispatching to
+hardware.
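+
+As an illustration (not the library's actual code), a request might mark its
+buffer address types using the ccb_hdr layout and CCB_AT_* constants from
+arch/sparc/dax/ccb.h in this commit:
+
+    struct ccb_hdr *hdr = &ccb->extract.control.hdr;
+
+    hdr->at_src0 = CCB_AT_VA;   /* user virtual address; translated for HW */
+    hdr->at_src1 = CCB_AT_IMM;  /* immediate operand, not an address */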
+
+
+The Driver API
+--------------
+
+The driver provides most of its services via the ioctl() call. There is also
+some functionality provided via the mmap() call. These are the available ioctl
+functions:
+
+CCB_THR_INIT
+
+Creates a new context for a thread and initializes it for use. Each thread
+that wishes to submit requests must open the DAX device file and perform this
+ioctl.  This function causes a context structure to be allocated for the
+thread, which contains pointers and values used internally by the driver to
+keep track of submitted requests. A completion area buffer is also allocated,
+and this is large enough to contain the completion areas for many concurrent
+requests. The size of this buffer is returned to the caller since this is
+needed for the mmap() call so that the user can get access to the completion
+area buffer. Another value returned is the maximum length of the CCB array
+that may be submitted.
+
+CCB_THR_FINI
+
+Destroys a context for a thread. After doing this, the thread can no longer
+submit any requests.
+
+CA_DEQUEUE
+
+Notifies the driver that one or more completion areas are no longer needed and
+may be reused. This must be done after a thread has consumed its completed
+transactions. It need not be done after every transaction, but just often
+enough that the completion areas do not run out.
+
+CCB_EXEC
+
+Submits one or more CCBs for execution on the coprocessor. An array of CCBs is
+given, along with the array length in bytes. The number of bytes actually
+accepted by the coprocessor is returned along with the offset of the completion
+area chosen for this set of submissions. This offset is relative to the start
+of the completion area virtual address given by a call to mmap() to the driver.
+
+There are also several ioctl functions related to performance counters, but
+these
+are not described in this document. Access to the performance counters is
+provided via a utility program included with the DAX user libraries.
+
+MMAP
+
+The mmap() function provides two different services depending on
+whether or not PROT_WRITE is given.
+
+ - If a read-only mapping is requested, then the call is a request to
+   map the completion area buffer. In this case, the size requested
+   must equal the completion area size returned by the CCB_THR_INIT
+   ioctl call.
+ - If a read/write mapping is requested, then memory is allocated.
+   The memory is physically contiguous and locked. This memory can
+   be used for any virtual buffer in a CCB.
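+
+For example (an illustrative sketch; fd is an open dax device whose context
+was initialized with CCB_THR_INIT, and ca_size is the size returned by that
+ioctl):
+
+    /* map the completion area buffer, read-only */
+    ca  = mmap(NULL, ca_size, PROT_READ, MAP_SHARED, fd, 0);
+
+    /* allocate physically contiguous, locked memory for request buffers */
+    buf = mmap(NULL, buf_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);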
+
+
+Completion of a Request
+-----------------------
+
+The first byte in each completion area is the command status, and this byte is
+updated by the coprocessor hardware. Software may take advantage of special M7
+processor capabilities to efficiently poll this status byte.  First, a series
+of new address space identifiers has been introduced which can be used with a
+Load From Alternate Space instruction in order to effect a "monitored load".
+The typical ASI used would be 0x84, ASI_MONITOR_PRIMARY. Second, a new
+instruction, Monitored Wait (mwait), is introduced. It is just like PAUSE in
+that it suspends execution of the virtual processor, but only until one of
+several events occurs. If the block of data containing the monitored location is
+written to by any other virtual processor, then the mwait terminates. This
+allows software to resume execution immediately after a transaction completes,
+and without a context switch or kernel to user transition. The latency
+between transaction completion and resumption of execution may thus be
+just a few nanoseconds.
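+
+As an illustration only (not the user library's implementation), a polling
+loop might look as follows, with the monitored load and mwait steps reduced
+to plain loads since the exact instruction sequences are beyond the scope of
+this document:
+
+    volatile unsigned char *st = ca_base + ca_offset;
+
+    while (*st == CCB_CMD_STAT_NOT_COMPLETED)
+        ;       /* monitored load + mwait in practice */
+    if (*st != CCB_CMD_STAT_COMPLETED)
+        ;       /* inspect err_mask; see ccb.h in this commit */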
+
+
+Life cycle of a DAX Submission
+------------------------------
+
+ - Application opens the dax device
+ - calls the CCB_THR_INIT ioctl
+ - invokes mmap() to get the completion area address
+ - optionally uses mmap() to allocate memory buffers for the request
+ - allocates a CCB and fills in the opcode, flags, parameters, addresses, etc.
+ - calls the CCB_EXEC ioctl
+ - loops executing monitored load + monitored wait until the command
+   status indicates the request is complete
+ - calls the CA_DEQUEUE ioctl to release the completion area
+ - calls munmap() to deallocate the completion area and any other memory
+ - calls the CCB_THR_FINI ioctl
+ - closes the dax device
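+
+A sketch of these steps in C, for illustration only. The ioctl command
+macros and argument structures are defined in arch/sparc/dax/sys_dax.h and
+are not reproduced here, so the names and *_arg variables below are
+placeholders:
+
+    int fd = open("/dev/dax", O_RDWR);
+
+    ioctl(fd, DAX_CCB_THR_INIT, &init_arg); /* returns CA size, max CCB len */
+    ca  = mmap(NULL, ca_size, PROT_READ, MAP_SHARED, fd, 0);
+    buf = mmap(NULL, buf_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+    /* build a CCB in buf: opcode, flags, input/output addresses, ... */
+    ioctl(fd, DAX_CCB_EXEC, &exec_arg);     /* returns completion area offset */
+
+    while (ca[ca_offset] == CCB_CMD_STAT_NOT_COMPLETED)
+        ;                                   /* monitored load + mwait */
+
+    ioctl(fd, DAX_CA_DEQUEUE, &deq_arg);
+    munmap(ca, ca_size);
+    munmap(buf, buf_len);
+    ioctl(fd, DAX_CCB_THR_FINI);
+    close(fd);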
+
+
+Memory Constraints
+------------------
+
+The DAX hardware operates only on physical addresses. Therefore, it is not
+aware of virtual memory mappings and the discontiguities that may exist in the
+physical memory that a virtual buffer maps to. There is no I/O TLB nor any kind
+of scatter/gather mechanism. Any data passed to DAX must reside in a physically
+contiguous region of memory.
+
+As stated earlier, the Hypervisor translates all addresses within a CCB to
+physical before handing off the CCB to DAX. The Hypervisor determines the
+virtual page size for each virtual address given, and uses this to program a
+size limit for each address. This prevents the coprocessor from reading or
+writing beyond the bound of the virtual page, even though it is accessing
+physical memory directly. A simpler way of saying this is that DAX will not
+"cross" a virtual page boundary. If an 8k virtual page is used, then the data
+is strictly limited to 8k. If a user's buffer is larger than 8k, then a larger
+page size must be used, or the transaction size will still be limited to 8k.
+There are two ways of accomplishing this.
+
+Huge pages. A user may allocate huge pages using either the mmap or shmget
+interfaces. Memory buffers residing on huge pages may be used to achieve much
+larger DAX transaction sizes, but the rules must still be followed, and no
+transaction can cross a page boundary, even a huge page.  A major caveat is
+that Linux on Sparc presents 8Mb as one of the huge page sizes. Sparc does not
+actually provide an 8Mb hardware page size, and this size is synthesized by
+pasting together two 4Mb pages. The reasons for this are historical, and it
+creates an issue because only half of this 8Mb page can actually be used for
+any given buffer in a DAX request, and it must be either the first half or the
+second half; it cannot be a 4Mb chunk in the middle, since that crosses a page
+boundary.
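+
+For example, a buffer on huge pages might be obtained like this (an
+illustrative sketch; MAP_HUGETLB requires hugetlbfs to be configured):
+
+    buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
+               MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);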
+
+DAX memory. The driver provides a memory allocation mechanism which guarantees
+that the backing physical memory is contiguous. A call to mmap requests an
+allocation, and the virtual address returned to the user is backed by mappings
+to 8k pages. However, when any address within one of these allocations is used
+in a DAX request, the driver replaces the user virtual address with the real
+address of the backing memory, and utilizes the DAX _flow control_ mechanism
+(if available) to specify a size limit on the memory buffer. This kind of
+allocation is called a "synthetic large page" because the driver can "create"
+pages of arbitrary size that do not depend on the hardware page sizes.
+
+Note. The synthetic large pages are only supported on some versions of the M7
+cpu, and an alternate technique is employed on the other versions: an mmap
+call may only request exactly 4Mb. Again, a contiguous physical allocation is
+used, and 8k pages are used for the user mappings to this area, while inside
+the kernel a 4Mb virtual page is actually used. Similar to the synthetic large
+page "translation", when a user gives one of these addresses in a ccb, the
+driver replaces it with the corresponding kernel virtual address, and the
+Hypervisor then senses the 4Mb virtual page size to complete the logic.
+
+
+Organization of the Driver Source
+---------------------------------
+
+The driver is split into several files based on the general area of
+functionality provided:
+
+ * dax_main.c - attach/detach, open/close, ioctl, thread init/fini functions,
+   context allocation, ccb submit/dequeue
+ * dax_mm.c   - memory allocation, mapping, and locking/unlocking
+ * dax_debugfs.c - support for debugfs access
+ * dax_bip.c  - utility functions to handle BIP buffers, used to track outstanding CCBs
+ * dax_perf.c - performance counter functions
+ * ccb.h - internal structure of a CCB and completion area
+ * sys_dax.h  - ioctl definitions and structures
+ * dax_impl.h - driver internal macros and structures
+
+
+Data Structures used by the Driver
+----------------------------------
+
+ * BIP Buffer - A variant of a circular buffer that returns variable length
+   contiguous blocks
+ * Context - a per thread structure that holds the state of CCBs submitted by
+   the thread
+ * dax_mm -  a structure that describes one memory management context, i.e., a
+   list of dax contexts belonging to the threads in a process
+ * dax_vma - a structure that describes one memory allocation
+
+
+Note on Memory Unmap Operations
+-------------------------------
+
+The multi-threaded architecture of applications means that multiple threads
+may have access to, and control over, memory that is being used for DAX
+operations. It is the responsibility of the user to ensure that proper
+synchronization occurs among multiple threads accessing memory that may be
+accessed by DAX. But the driver has to protect against a thread releasing
+memory that may be in use by DAX, as freed memory might be immediately
+reallocated somewhere else, to another process, or to another kernel entity,
+and DAX might still be reading or writing to this memory. This is a hard
+problem to solve because there is no easy way to find out if a particular
+memory region is currently in use by DAX. This can only be done by a search
+of all outstanding transactions for memory addresses that fall within the
+range of the memory allocation being freed. Hence, a memory unmap operation
+will wait for all DAX operations using that memory to complete.
+
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 2939fffbd39cde1ba5ee7750bcbd7b28d2888999..04280f718bcfcb550b8ed9f87c5719e46d1b1d24 100644 (file)
@@ -198,6 +198,12 @@ config NR_CPUS
        default 32 if SPARC32
        default 2048 if SPARC64
 
+config SPARC_DAX
+       tristate "Enable Oracle Sparc DAX driver"
+       default m if SPARC64
+       ---help---
+         This enables the Oracle Data Analytics Accelerator (DAX) driver.
+
 source kernel/Kconfig.hz
 
 config RWSEM_GENERIC_SPINLOCK
diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile
index 303a0c8c9b55de22b769353d72e02f21fae71143..3475ef9615b51f3abc3467f05131b1454718d53d 100644 (file)
@@ -59,6 +59,7 @@ libs-y                 += arch/sparc/lib/
 
 drivers-$(CONFIG_PM) += arch/sparc/power/
 drivers-$(CONFIG_OPROFILE)     += arch/sparc/oprofile/
+drivers-$(CONFIG_SPARC_DAX)    += arch/sparc/dax/
 
 boot := arch/sparc/boot
 
diff --git a/arch/sparc/dax/Makefile b/arch/sparc/dax/Makefile
new file mode 100644 (file)
index 0000000..373309d
--- /dev/null
@@ -0,0 +1,4 @@
+obj-m                                   += dax.o
+
+dax-y                                   := dax_main.o dax_mm.o dax_perf.o \
+                                          dax_bip.o dax_misc.o dax_debugfs.o
diff --git a/arch/sparc/dax/ccb.h b/arch/sparc/dax/ccb.h
new file mode 100644 (file)
index 0000000..d1b5faf
--- /dev/null
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#ifndef        _CCB_H
+#define        _CCB_H
+
+/* CCB address types */
+#define        CCB_AT_IMM              0       /* immediate */
+#define        CCB_AT_VA               3       /* virtual address */
+#ifdef __KERNEL__
+#define        CCB_AT_VA_ALT           1       /* only kernel can use
+                                        * secondary context
+                                        */
+#define        CCB_AT_RA               2       /* only kernel can use real address */
+#endif /* __KERNEL__ */
+
+#define        CCB_AT_COMPL_MASK       0x3
+#define        CCB_AT_SRC0_MASK        0x7
+#define        CCB_AT_SRC1_MASK        0x7
+#define        CCB_AT_DST_MASK         0x7
+#define        CCB_AT_TBL_MASK         0x3
+
+#define        CCB_AT_COMPL_SHIFT      32
+#define        CCB_AT_SRC0_SHIFT       34
+
+/* CCB header sync flags */
+#define        CCB_SYNC_SERIAL         BIT(0)
+#define        CCB_SYNC_COND           BIT(1)
+#define        CCB_SYNC_LONGCCB        BIT(2)
+
+#define        CCB_SYNC_FLG_SHIFT      24
+#define        CCB_HDR_SHIFT           32
+
+#define        CCB_DW1_INTR_SHIFT      59
+
+#define        DAX_BUF_LIMIT_FLOW_CTL  2
+#define        DAX_EXT_OP_ENABLE       1
+
+/* CCB L3 output allocation */
+#define        CCB_OUTPUT_ALLOC_NONE   0       /* do not allocate in L3 */
+#define        CCB_OUTPUT_ALLOC_HARD   1       /* allocate in L3 of running cpu */
+#define        CCB_OUTPUT_ALLOC_SOFT   2       /* allocate to whichever L3 owns */
+                                       /* line, else L3 of running cpu */
+
+#define        CCB_LOCAL_ADDR_SHIFT    6
+#define        CCB_LOCAL_ADDR(x, mask) (((x) & mask) >> CCB_LOCAL_ADDR_SHIFT)
+
+#define        CCB_DWORD_CTL           0
+#define        CCB_DWORD_COMPL         1
+
+#define        QUERY_DWORD_INPUT       2
+#define        QUERY_DWORD_DAC         3
+#define        QUERY_DWORD_SEC_INPUT   4
+#define        QUERY_DWORD_OUTPUT      6
+#define        QUERY_DWORD_TBL         7
+
+
+#define        BIT_MASK64(_hi, _lo)    (((u64)((~(u64)0)>>(63-(_hi)))) & \
+       ((u64)((~(u64)0)<<(_lo))))
+
+#define        CCB_GET(s, dword)       (((dword) & CCB_##s##_MASK) >> CCB_##s##_SHIFT)
+
+#define        CCB_SET(s, val, dword)                          \
+       ((dword) = ((dword) & ~CCB_##s##_MASK) |        \
+       ((((val) << CCB_##s##_SHIFT)) & CCB_##s##_MASK))
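+
+/*
+ * Illustrative use: CCB_SET(COMPL_PA, pa, dw) clears the bits under
+ * CCB_COMPL_PA_MASK in dw and installs pa there; CCB_GET(COMPL_PA, dw)
+ * extracts the field again.
+ */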
+
+#define        CCB_QUERY_INPUT_VA_MASK         BIT_MASK64(53, 0)
+#define        CCB_QUERY_INPUT_VA_SHIFT        0
+
+#define        CCB_QUERY_INPUT_PA_MASK         BIT_MASK64(55, 0)
+#define        CCB_QUERY_INPUT_PA_SHIFT        0
+
+#define        CCB_QUERY_SEC_INPUT_VA_MASK     CCB_QUERY_INPUT_VA_MASK
+#define        CCB_QUERY_SEC_INPUT_VA_SHIFT    CCB_QUERY_INPUT_VA_SHIFT
+
+#define        CCB_QUERY_SEC_INPUT_PA_MASK     CCB_QUERY_INPUT_PA_MASK
+#define        CCB_QUERY_SEC_INPUT_PA_SHIFT    CCB_QUERY_INPUT_PA_SHIFT
+
+#define        CCB_COMPL_VA(dw)                CCB_GET(COMPL_VA, (dw))
+
+#define        CCB_QUERY_INPUT_VA(dw)  CCB_GET(QUERY_INPUT_VA, (dw))
+#define        CCB_QUERY_SEC_INPUT_VA(dw)      CCB_GET(QUERY_SEC_INPUT_VA, (dw))
+#define        CCB_QUERY_OUTPUT_VA(dw) CCB_GET(QUERY_OUTPUT_VA, (dw))
+#define        CCB_QUERY_TBL_VA(dw)            CCB_GET(QUERY_TBL_VA, (dw))
+
+#define        CCB_SET_COMPL_PA(pa, dw)        CCB_SET(COMPL_PA, (pa), (dw))
+
+#define        CCB_SET_QUERY_INPUT_PA(pa, dw)  CCB_SET(QUERY_INPUT_PA, (pa), (dw))
+#define        CCB_SET_QUERY_SEC_INPUT_PA(pa, dw)      \
+       CCB_SET(QUERY_SEC_INPUT_PA, (pa), (dw))
+#define        CCB_SET_QUERY_OUTPUT_PA(pa, dw) CCB_SET(QUERY_OUTPUT_PA, (pa), (dw))
+#define        CCB_SET_QUERY_TBL_PA(pa, dw)    CCB_SET(QUERY_TBL_PA, (pa), (dw))
+
+/* max number of VA bits that can be specified in CCB */
+#define        CCB_VA_NBITS                    54
+
+#define CCB_VA_SIGN_EXTEND(va) va
+
+#define CCB_COMPL_PA_MASK              BIT_MASK64(55, 6)
+#define CCB_COMPL_PA_SHIFT             0
+
+/*
+ * Query CCB opcodes
+ */
+#define        CCB_QUERY_OPCODE_SYNC_NOP       0x0
+#define        CCB_QUERY_OPCODE_EXTRACT        0x1
+#define        CCB_QUERY_OPCODE_SCAN_VALUE     0x2
+#define        CCB_QUERY_OPCODE_SCAN_RANGE     0x3
+#define        CCB_QUERY_OPCODE_TRANSLATE      0x4
+#define        CCB_QUERY_OPCODE_SELECT         0x5
+#define        CCB_QUERY_OPCODE_INV_SCAN_VALUE 0x12
+#define        CCB_QUERY_OPCODE_INV_SCAN_RANGE 0x13
+#define        CCB_QUERY_OPCODE_INV_TRANSLATE  0x14
+
+/* Query primary input formats */
+#define        CCB_QUERY_IFMT_FIX_BYTE         0       /* to 16 bytes */
+#define        CCB_QUERY_IFMT_FIX_BIT          1       /* to 15 bits */
+#define        CCB_QUERY_IFMT_VAR_BYTE         2       /* separate length stream */
+#define        CCB_QUERY_IFMT_FIX_BYTE_RLE     4       /* to 16 bytes + RL stream */
+#define        CCB_QUERY_IFMT_FIX_BIT_RLE      5       /* to 15 bits + RL stream */
+#define        CCB_QUERY_IFMT_FIX_BYTE_HUFF    8       /* to 16 bytes */
+#define        CCB_QUERY_IFMT_FIX_BIT_HUFF     9       /* to 15 bits */
+#define        CCB_QUERY_IFMT_VAR_BYTE_HUFF    10      /* separate length stream */
+#define        CCB_QUERY_IFMT_FIX_BYTE_RLE_HUFF 12     /* to 16 bytes + RL stream */
+#define        CCB_QUERY_IFMT_FIX_BIT_RLE_HUFF 13      /* to 15 bits + RL stream */
+
+/* Query secondary input size */
+#define        CCB_QUERY_SZ_ONEBIT             0
+#define        CCB_QUERY_SZ_TWOBIT             1
+#define        CCB_QUERY_SZ_FOURBIT            2
+#define        CCB_QUERY_SZ_EIGHTBIT           3
+
+/* Query secondary input encoding */
+#define        CCB_QUERY_SIE_LESS_ONE          0
+#define        CCB_QUERY_SIE_ACTUAL            1
+
+/* Query output formats */
+#define        CCB_QUERY_OFMT_BYTE_ALIGN       0
+#define        CCB_QUERY_OFMT_16B              1
+#define        CCB_QUERY_OFMT_BIT_VEC          2
+#define        CCB_QUERY_OFMT_ONE_IDX          3
+
+/* Query operand size constants */
+#define        CCB_QUERY_OPERAND_DISABLE       31
+
+/* Query Data Access Control input length format */
+#define        CCB_QUERY_ILF_SYMBOL            0
+#define        CCB_QUERY_ILF_BYTE              1
+#define        CCB_QUERY_ILF_BIT               2
+
+/* Completion area cmd_status */
+#define        CCB_CMD_STAT_NOT_COMPLETED      0
+#define        CCB_CMD_STAT_COMPLETED          1
+#define        CCB_CMD_STAT_FAILED             2
+#define        CCB_CMD_STAT_KILLED             3
+#define        CCB_CMD_STAT_NOT_RUN            4
+#define        CCB_CMD_STAT_NO_OUTPUT          5
+
+/* Completion area err_mask of user visible errors */
+#define        CCB_CMD_ERR_BOF                 0x1     /* buffer overflow */
+#define        CCB_CMD_ERR_DECODE              0x2     /* CCB decode error */
+#define        CCB_CMD_ERR_POF                 0x3     /* page overflow */
+#define        CCB_CMD_ERR_RSVD1               0x4     /* Reserved */
+#define        CCB_CMD_ERR_RSVD2               0x5     /* Reserved */
+#define        CCB_CMD_ERR_KILL                0x7     /* command was killed */
+#define        CCB_CMD_ERR_TO                  0x8     /* command timeout */
+#define        CCB_CMD_ERR_MCD                 0x9     /* MCD error */
+#define        CCB_CMD_ERR_DATA_FMT            0xA     /* data format error */
+#define        CCB_CMD_ERR_OTHER               0xF     /* error not visible to user */
+
+struct ccb_hdr {
+       u32     ccb_ver:4;      /* must be set to 0 for M7 HW */
+       u32     sync_flags:4;
+       u32     opcode:8;
+       u32     rsvd:3;
+       u32     at_tbl:2;       /* IMM/RA(kernel)/VA*/
+       u32     at_dst:3;       /* IMM/RA(kernel)/VA*/
+       u32     at_src1:3;      /* IMM/RA(kernel)/VA*/
+       u32     at_src0:3;      /* IMM/RA(kernel)/VA*/
+#ifdef __KERNEL__
+       u32     at_cmpl:2;      /* IMM/RA(kernel)/VA*/
+#else
+       u32     rsvd2:2;        /* only kernel can specify at_cmpl */
+#endif /* __KERNEL__ */
+};
+
+struct ccb_addr {
+       u64     adi:4;
+       u64     rsvd:4;
+       u64     addr:50;        /* [55:6] of 64B aligned address */
+                                       /* if VA, [55:54] must be 0 */
+       u64     rsvd2:6;
+};
+
+struct ccb_byte_addr {
+       u64     adi:4;
+       u64     rsvd:4;
+       u64     addr:56;        /* [55:0] of byte aligned address */
+                                       /* if VA, [55:54] must be 0 */
+};
+
+struct ccb_tbl_addr {
+       u64     adi:4;
+       u64     rsvd:4;
+       u64     addr:50;        /* [55:6] of 64B aligned address */
+                                       /* if VA, [55:54] must be 0 */
+       u64     rsvd2:4;
+       u64     vers:2;         /* version number */
+};
+
+struct ccb_cmpl_addr {
+       u64     adi:4;
+       u64     intr:1;         /* Interrupt not supported */
+#ifdef __KERNEL__
+       u64     rsvd:3;
+       u64     addr:50;        /* [55:6] of 64B aligned address */
+                                       /* if VA, [55:54] must be 0 */
+       u64     rsvd2:6;
+#else
+       u64     rsvd:59;        /* Only kernel can specify completion */
+                                       /* address in CCB.  User must use */
+                                       /* offset to mmapped kernel memory. */
+#endif /* __KERNEL__ */
+};
+
+struct ccb_sync_nop_ctl {
+       struct ccb_hdr  hdr;
+       u32             ext_op:1;       /* extended op flag */
+       u32             rsvd:31;
+};
+
+/*
+ * CCB_QUERY_OPCODE_SYNC_NOP
+ */
+struct ccb_sync_nop {
+       struct ccb_sync_nop_ctl ctl;
+       struct ccb_cmpl_addr    completion;
+       u64                     rsvd[6];
+};
+
+/*
+ * Query CCB definitions
+ */
+
+struct ccb_extract_ctl {
+       struct ccb_hdr  hdr;
+       u32     src0_fmt:4;
+       u32     src0_sz:5;
+       u32     src0_off:3;
+       u32     src1_enc:1;
+       u32     src1_off:3;
+       u32     src1_sz:2;
+       u32     output_fmt:2;
+       u32     output_sz:2;
+       u32     pad_dir:1;
+       u32     rsvd:9;
+};
+
+struct ccb_data_acc_ctl {
+       u64     flow_ctl:2;
+       u64     pipeline_targ:2;
+       u64     output_buf_sz:20;
+       u64     rsvd:8;
+       u64     output_alloc:2;
+       u64     rsvd2:4;
+       u64     input_len_fmt:2;
+       u64     input_cnt:24;
+};
+
+/*
+ * CCB_QUERY_OPCODE_EXTRACT
+ */
+struct ccb_extract {
+       struct ccb_extract_ctl  control;
+       struct ccb_cmpl_addr    completion;
+       struct ccb_byte_addr    src0;
+       struct ccb_data_acc_ctl data_acc_ctl;
+       struct ccb_byte_addr    src1;
+       u64                     rsvd;
+       struct ccb_addr         output;
+       struct ccb_tbl_addr     tbl;
+};
+
+struct ccb_scan_bound {
+       u32     upper;
+       u32     lower;
+};
+
+/*
+ * CCB_QUERY_OPCODE_SCAN_VALUE
+ * CCB_QUERY_OPCODE_SCAN_RANGE
+ */
+struct ccb_scan {
+       struct ccb_extract_ctl  control;
+       struct ccb_cmpl_addr    completion;
+       struct ccb_byte_addr    src0;
+       struct ccb_data_acc_ctl data_acc_ctl;
+       struct ccb_byte_addr    src1;
+       struct ccb_scan_bound   bound_msw;
+       struct ccb_addr         output;
+       struct ccb_tbl_addr     tbl;
+};
+
+/*
+ * Scan Value/Range words 8-15 required when L or U operand size > 4 bytes.
+ */
+struct ccb_scan_ext {
+       struct ccb_scan_bound   bound_msw2;
+       struct ccb_scan_bound   bound_msw3;
+       struct ccb_scan_bound   bound_msw4;
+       u64             rsvd[5];
+};
+
+struct ccb_translate_ctl {
+       struct ccb_hdr  hdr;
+       u32     src0_fmt:4;
+       u32     src0_sz:5;
+       u32     src0_off:3;
+       u32     src1_enc:1;
+       u32     src1_off:3;
+       u32     src1_sz:2;
+       u32     output_fmt:2;
+       u32     output_sz:2;
+       u32     rsvd:1;
+       u32     test_val:9;
+};
+
+/*
+ * CCB_QUERY_OPCODE_TRANSLATE
+ */
+struct ccb_translate {
+       struct ccb_translate_ctl        control;
+       struct ccb_cmpl_addr            completion;
+       struct ccb_byte_addr            src0;
+       struct ccb_data_acc_ctl         data_acc_ctl;
+       struct ccb_byte_addr            src1;
+       u64                             rsvd;
+       struct ccb_addr                 dst;
+       struct ccb_tbl_addr             vec_addr;
+};
+
+struct ccb_select_ctl {
+       struct ccb_hdr  hdr;
+       u32             src0_fmt:4;
+       u32             src0_sz:5;
+       u32             src0_off:3;
+       u32             rsvd:1;
+       u32             src1_off:3;
+       u32             rsvd2:2;
+       u32             output_fmt:2;
+       u32             output_sz:2;
+       u32             pad_dir:1;
+       u32             rsvd3:9;
+};
+
+/*
+ * CCB_QUERY_OPCODE_SELECT
+ */
+struct ccb_select {
+       struct ccb_select_ctl   control;
+       struct ccb_cmpl_addr    completion;
+       struct ccb_byte_addr    src0;
+       struct ccb_data_acc_ctl data_acc_ctl;
+       struct ccb_byte_addr    src1;
+       u64                     rsvd;
+       struct ccb_addr         output;
+       struct ccb_tbl_addr     tbl;
+};
+
+union ccb {
+       struct ccb_sync_nop     sync_nop;
+       struct ccb_extract      extract;
+       struct ccb_scan         scan;
+       struct ccb_scan_ext     scan_ext;
+       struct ccb_translate    translate;
+       struct ccb_select       select;
+       u64                     dwords[8];
+};
+
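+/* 128-byte completion area written by the coprocessor (see dax.txt) */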
+struct ccb_completion_area {
+       u8      cmd_status;     /* user may mwait on this address */
+       u8      err_mask;       /* user visible error notification */
+       u8      rsvd[2];        /* reserved */
+       u32     rsvd2;          /* reserved */
+       u32     output_sz;      /* Bytes of output */
+       u32     rsvd3;          /* reserved */
+       u64     run_time;       /* run time in OCND2 cycles */
+       u64     run_stats;      /* nothing reported in version 1.0 */
+       u32     n_processed;    /* input elements processed */
+       u32     rsvd4[5];       /* reserved */
+       u64     command_rv;     /* command return value */
+       u64     rsvd5[8];       /* reserved */
+};
+
+#endif /* _CCB_H */
diff --git a/arch/sparc/dax/dax_bip.c b/arch/sparc/dax/dax_bip.c
new file mode 100644 (file)
index 0000000..9314f51
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "dax_impl.h"
+
+/*
+ * CCB buffer management
+ *
+ * A BIP-Buffer is used to track the outstanding CCBs.
+ *
+ * A BIP-Buffer is a well-known variant of a circular buffer that
+ * returns variable length contiguous blocks.  The buffer is split
+ * into two regions, A and B.  The buffer starts with a single region A.
+ * When there is more space before region A than after, a new region B
+ * is created and future allocations come from region B.  When region A
+ * is completely deallocated, region B, if in use, is renamed to region A.
+ */
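+/*
+ * Illustrative layout of the regions within ccb_buf (byte indices):
+ *
+ *   0        b_end      a_start        a_end        ccb_buflen
+ *   |<-- B -->|   free   |<----- A ----->|    free     |
+ */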
+static void dbg_bip_state(struct dax_ctx *ctx)
+{
+       dax_dbg("a_start=%d a_end=%d, b_end=%d, resv_start=%d, resv_end=%d, bufcnt=%d",
+               ctx->a_start, ctx->a_end, ctx->b_end,  ctx->resv_start,
+               ctx->resv_end, ctx->bufcnt);
+}
+
+/*
+ * Reserves space in the bip buffer for the user ccbs.  Returns amount reserved
+ * which may be less than requested len.
+ *
+ * If region B exists, then allocate from region B regardless of region A
+ * freespace.  Else, compare freespace before and after region A.  If more space
+ * before, then create new region B.
+ */
+union ccb *dax_ccb_buffer_reserve(struct dax_ctx *ctx, size_t len,
+                                 size_t *reserved)
+{
+       size_t avail;
+
+       /* allocate from region B if B exists */
+       if (ctx->b_end > 0) {
+               avail = ctx->a_start - ctx->b_end;
+
+               if (avail > len)
+                       avail = len;
+
+               if (avail == 0)
+                       return NULL;
+
+               *reserved = avail;
+               ctx->resv_start = ctx->b_end;
+               ctx->resv_end = ctx->b_end + avail;
+
+               dax_dbg("region B reserve: reserved=%ld, resv_start=%d, resv_end=%d, ccb_bufp=0x%p",
+                       *reserved, ctx->resv_start, ctx->resv_end,
+                       (void *)((caddr_t *)(ctx->ccb_buf) + ctx->resv_start));
+       } else {
+
+               /*
+                * region A allocation. Check if there is more freespace after
+                * region A than before region A.  Allocate from the larger.
+                */
+               avail = ctx->ccb_buflen - ctx->a_end;
+
+               if (avail >= ctx->a_start) {
+                       /* more freespace after region A */
+
+                       if (avail == 0)
+                               return NULL;
+
+                       if (avail > len)
+                               avail = len;
+
+                       *reserved = avail;
+                       ctx->resv_start = ctx->a_end;
+                       ctx->resv_end = ctx->a_end + avail;
+
+                       dax_dbg("region A (after) reserve: reserved=%ld, resv_start=%d, resv_end=%d, ccb_bufp=0x%p",
+                               *reserved, ctx->resv_start, ctx->resv_end,
+                               (void *)((caddr_t)(ctx->ccb_buf) +
+                               ctx->resv_start));
+               } else {
+                       /* before region A */
+                       avail = ctx->a_start;
+
+                       if (avail == 0)
+                               return NULL;
+
+                       if (avail > len)
+                               avail = len;
+
+                       *reserved = avail;
+                       ctx->resv_start = 0;
+                       ctx->resv_end = avail;
+
+                       dax_dbg("region A (before) reserve: reserved=%ld, resv_start=%d, resv_end=%d, ccb_bufp=0x%p",
+                               *reserved, ctx->resv_start, ctx->resv_end,
+                               (void *)((caddr_t)(ctx->ccb_buf) +
+                               ctx->resv_start));
+               }
+       }
+
+       dbg_bip_state(ctx);
+
+       return ((union ccb *)((caddr_t)(ctx->ccb_buf) + ctx->resv_start));
+}
+
+/* Marks the BIP region as used */
+void dax_ccb_buffer_commit(struct dax_ctx *ctx, size_t len)
+{
+       if (ctx->resv_start == ctx->a_end)
+               ctx->a_end += len;
+       else
+               ctx->b_end += len;
+
+       ctx->resv_start = 0;
+       ctx->resv_end = 0;
+       ctx->bufcnt += len;
+
+       dbg_bip_state(ctx);
+}
+
+/*
+ * Return index to oldest contig block in buffer, or -1 if empty.
+ * In either case, len is set to size of oldest contig block (which may be 0).
+ */
+int dax_ccb_buffer_get_contig_ccbs(struct dax_ctx *ctx, int *len_ccb)
+{
+       if (ctx->a_end == 0) {
+               *len_ccb = 0;
+               return -1;
+       }
+
+       *len_ccb = CCB_BYTE_TO_NCCB(ctx->a_end - ctx->a_start);
+       return CCB_BYTE_TO_NCCB(ctx->a_start);
+}
+
+/*
+ * Returns amount of contiguous memory decommitted from buffer.
+ *
+ * Note: If both regions are currently in use, it will only free the memory in
+ * region A. If the amount returned to the pool is less than len, there may be
+ * more memory left in the buffer. The caller may need to make multiple calls
+ * to decommit all memory in the buffer.
+ */
+void dax_ccb_buffer_decommit(struct dax_ctx *ctx, int n_ccb)
+{
+       size_t a_len;
+       size_t len = NCCB_TO_CCB_BYTE(n_ccb);
+
+       a_len = ctx->a_end - ctx->a_start;
+
+       if (len >= a_len) {
+               len = a_len;
+               ctx->a_start = 0;
+               ctx->a_end = ctx->b_end;
+               ctx->b_end = 0;
+       } else {
+               ctx->a_start += len;
+       }
+
+       ctx->bufcnt -= len;
+
+       dbg_bip_state(ctx);
+       dax_dbg("decommited len=%ld", len);
+}
+
+
diff --git a/arch/sparc/dax/dax_debugfs.c b/arch/sparc/dax/dax_debugfs.c
new file mode 100644 (file)
index 0000000..da5fb81
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "dax_impl.h"
+#include <linux/debugfs.h>
+
+static struct dentry *dax_dbgfs;
+static struct dentry *dax_output;
+
+enum dax_dbfs_type {
+       DAX_DBFS_MEM_USAGE,
+       DAX_DBFS_ALLOC_COUNT,
+};
+
+static int debug_open(struct inode *inode, struct file *file);
+
+static const struct file_operations debugfs_ops = {
+       .open = debug_open,
+       .release = single_release,
+       .read = seq_read,
+       .llseek = seq_lseek,
+};
+
+static int dax_debugfs_read(struct seq_file *s, void *data)
+{
+       switch ((long)s->private) {
+       case DAX_DBFS_MEM_USAGE:
+               seq_printf(s, "memory use (Kb): %d\n",
+                          atomic_read(&dax_requested_mem));
+               break;
+       case DAX_DBFS_ALLOC_COUNT:
+               seq_printf(s, "DAX alloc count: %d\n",
+                          atomic_read(&dax_alloc_counter));
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int debug_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, dax_debugfs_read, inode->i_private);
+}
+
+void dax_debugfs_init(void)
+{
+       dax_dbgfs = debugfs_create_dir("dax", NULL);
+       if (dax_dbgfs == NULL) {
+               dax_err("dax debugfs dir creation failed");
+               return;
+       }
+
+       dax_output = debugfs_create_file("mem_usage", 0444, dax_dbgfs,
+                                        (void *)DAX_DBFS_MEM_USAGE,
+                                        &debugfs_ops);
+       if (dax_output == NULL)
+               dax_err("dax debugfs output file creation failed");
+
+       dax_output = debugfs_create_file("alloc_count", 0444, dax_dbgfs,
+                                        (void *)DAX_DBFS_ALLOC_COUNT,
+                                        &debugfs_ops);
+       if (dax_output == NULL)
+               dax_err("dax debugfs output file creation failed");
+}
+
+void dax_debugfs_clean(void)
+{
+       if (dax_dbgfs != NULL)
+               debugfs_remove_recursive(dax_dbgfs);
+}
diff --git a/arch/sparc/dax/dax_impl.h b/arch/sparc/dax/dax_impl.h
new file mode 100644 (file)
index 0000000..f277c4e
--- /dev/null
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#ifndef _DAX_IMPL_H
+#define _DAX_IMPL_H
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/mm.h>
+#include <linux/kallsyms.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/bug.h>
+#include <linux/hugetlb.h>
+#include <linux/nodemask.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <asm/hypervisor.h>
+#include <asm/pgtable.h>
+#include <asm/mdesc.h>
+#include <asm/atomic.h>
+#include "ccb.h"
+#include "sys_dax.h"
+
+extern bool dax_no_flow_ctl;
+extern int dax_debug;
+extern atomic_t dax_alloc_counter;
+extern atomic_t dax_actual_mem;
+extern atomic_t dax_requested_mem;
+extern int dax_peak_waste;
+extern spinlock_t dm_list_lock;
+extern const struct vm_operations_struct dax_vm_ops;
+
+#define DAX_BIP_MAX_CONTIG_BLOCKS      2
+#define FORCE_LOAD_ON_ERROR            0x1
+#define FORCE_LOAD_ON_NO_FLOW_CTL      0x2
+
+#define        DAX_DBG_FLG_BASIC       0x01
+#define        DAX_DBG_FLG_DRV         0x02
+#define        DAX_DBG_FLG_MAP         0x04
+#define DAX_DBG_FLG_LIST       0x08
+#define DAX_DBG_FLG_PERF       0x10
+#define DAX_DBG_FLG_NOMAP      0x20
+#define        DAX_DBG_FLG_ALL         0xff
+
+#define dax_info(fmt, ...)     pr_info("%s: " fmt "\n", __func__,\
+                                       ##__VA_ARGS__)
+#define dax_err(fmt, ...)      pr_err("%s: " fmt "\n", __func__, ##__VA_ARGS__)
+#define dax_alert(fmt, ...)    pr_alert("%s: " fmt "\n", __func__,\
+                                       ##__VA_ARGS__)
+#define dax_warn(fmt, ...)     pr_warn("%s: " fmt "\n", __func__,\
+                                       ##__VA_ARGS__)
+
+#define        dax_dbg(fmt, ...)       do {\
+                                       if (dax_debug & DAX_DBG_FLG_BASIC)\
+                                               dax_info(fmt, ##__VA_ARGS__);\
+                               } while (0)
+#define        dax_drv_dbg(fmt, ...)   do {\
+                                       if (dax_debug & DAX_DBG_FLG_DRV)\
+                                               dax_info(fmt, ##__VA_ARGS__);\
+                               } while (0)
+#define        dax_map_dbg(fmt, ...)   do {\
+                                       if (dax_debug & DAX_DBG_FLG_MAP)\
+                                               dax_info(fmt, ##__VA_ARGS__);\
+                               } while (0)
+#define        dax_list_dbg(fmt, ...)  do {\
+                                       if (dax_debug & DAX_DBG_FLG_LIST)\
+                                               dax_info(fmt, ##__VA_ARGS__);\
+                               } while (0)
+#define        dax_perf_dbg(fmt, ...)  do {\
+                                       if (dax_debug & DAX_DBG_FLG_PERF)\
+                                               dax_info(fmt, ##__VA_ARGS__);\
+                               } while (0)
+#define        dax_nomap_dbg(fmt, ...) do {\
+                                       if (dax_debug & DAX_DBG_FLG_NOMAP)\
+                                               dax_info(fmt, ##__VA_ARGS__);\
+                               } while (0)
+
+#define DAX_VALIDATE_AT(hdr, type, label)                              \
+       do {                                                            \
+               if (!((hdr)->at_##type == CCB_AT_VA ||                  \
+                   (hdr)->at_##type == CCB_AT_IMM)) {                  \
+                       dax_err(                                        \
+                       "invalid at_" #type " address type (%d) in user CCB", \
+                               (hdr)->at_##type);                      \
+                       goto label;                                     \
+               }                                                       \
+       } while (0)
+
+#define        DAX_NAME                "dax"
+#define DAX_MINOR              1UL
+#define DAX_MAJOR              1UL
+
+#define DAX1_STR    "ORCL,sun4v-dax"
+#define DAX1_FC_STR "ORCL,sun4v-dax-fc"
+#define DAX2_STR    "ORCL,sun4v-dax2"
+
+#define CCB_BYTE_TO_NCCB(a)    ((a) / sizeof(union ccb))
+#define NCCB_TO_CCB_BYTE(a)    ((a) * sizeof(union ccb))
+#define CA_BYTE_TO_NCCB(a)     ((a) / sizeof(struct ccb_completion_area))
+#define NCCB_TO_CA_BYTE(a)     ((a) * sizeof(struct ccb_completion_area))
+
+#ifndef U16_MAX
+#define U16_MAX 65535
+#endif
+#define DAX_NOMAP_RETRIES      3
+#define DAX_DEFAULT_MAX_CCB    15
+#define DAX_SYN_LARGE_PAGE_SIZE        (4*1024*1024UL)
+#define        DAX_CCB_BUF_SZ          PAGE_SIZE
+#define        DAX_CCB_BUF_NELEMS      (DAX_CCB_BUF_SZ / sizeof(union ccb))
+
+#define        DAX_CA_BUF_SZ           (DAX_CCB_BUF_NELEMS * \
+                                sizeof(struct ccb_completion_area))
+
+#define        DAX_MMAP_SZ             DAX_CA_BUF_SZ
+#define        DAX_MMAP_OFF            (off_t)(0x0)
+
+#define        DWORDS_PER_CCB          8
+
+#define        CCB_HDR(ccb)            ((struct ccb_hdr *)(ccb))
+#define        IS_LONG_CCB(ccb)        ((CCB_HDR(ccb))->sync_flags & CCB_SYNC_LONGCCB)
+
+#define        DAX_CCB_WAIT_USEC               100
+#define        DAX_CCB_WAIT_RETRIES_MAX        10000
+
+#define DAX_OUT_SIZE_FROM_CCB(sz)      ((1 + (sz)) * 64)
+#define DAX_IN_SIZE_FROM_CCB(sz)               (1 + (sz))
+
+/* Dax PERF registers */
+#define DAX_PERF_CTR_CTL                       171
+#define DAX_PERF_CTR_0                         168
+#define DAX_PERF_CTR_1                         169
+#define DAX_PERF_CTR_2                         170
+#define DAX_PERF_REG_OFF(num, reg, node, dax) \
+               (((reg) + (num)) + ((node) * 196) + ((dax) * 4))
+#define DAX_PERF_CTR_CTL_OFFSET(node, dax) \
+               DAX_PERF_REG_OFF(0, DAX_PERF_CTR_CTL, (node), (dax))
+#define DAX_PERF_CTR_OFFSET(num, node, dax) \
+               DAX_PERF_REG_OFF(num, DAX_PERF_CTR_0, (node), (dax))
+
+/* dax flow control test constants */
+#define DAX_FLOW_LIMIT         64UL
+#define        DAX_INPUT_ELEMS         64
+#define        DAX_INPUT_ELEM_SZ       1
+#define        DAX_OUTPUT_ELEMS        64
+#define        DAX_OUTPUT_ELEM_SZ      2
+
+enum dax_types {
+       DAX1,
+       DAX2
+};
+
+/* dax address type */
+enum dax_at {
+       AT_DST,
+       AT_SRC0,
+       AT_SRC1,
+       AT_TBL,
+       AT_MAX
+};
+
+/*
+ * Per mm dax structure. Thread contexts related to a
+ * mm are added to the ctx_list. Each instance of these dax_mms
+ * are maintained in a global dax_mm_list
+ */
+struct dax_mm {
+       struct list_head        mm_list;
+       struct list_head        ctx_list;
+       struct mm_struct        *this_mm;
+       spinlock_t              lock;
+       int                     vma_count;
+       int                     ctx_count;
+};
+
+/*
+ * Per vma dax structure. This is stored in the vma
+ * private pointer.
+ */
+struct dax_vma {
+       struct dax_mm           *dax_mm;
+       struct vm_area_struct   *vma;
+       void                    *kva;   /* kernel virtual address */
+       unsigned long           pa;     /* physical address */
+       size_t                  length;
+};
+
+
+/*
+ * DAX per thread CCB context structure
+ *
+ * *owner : pointer to thread that owns this ctx
+ * ctx_list : to add this struct to a linked list
+ * *dax_mm : pointer to per process dax mm
+ * *ccb_buf : CCB buffer
+ * ccb_buf_ra : cached RA of CCB
+ * **pages : pages for CCBs
+ * *ca_buf : CCB completion area (CA) buffer
+ * ca_buf_ra : cached RA of completion area
+ * ccb_buflen : CCB buffer length in bytes
+ * ccb_submit_maxlen : max user ccb byte len per call
+ * ca_buflen : Completion area buffer length in bytes
+ * a_start : Start of region A of BIP buffer
+ * a_end : End of region A of BIP buffer
+ * b_end : End of region B of BIP buffer.
+ *          region B always starts at 0
+ * resv_start : Start of memory reserved in BIP buffer, set by
+ *     dax_ccb_buffer_reserve and cleared by dax_ccb_buffer_commit
+ * resv_end : End of memory reserved in BIP buffer, set by
+ *     dax_ccb_buffer_reserve and cleared by dax_ccb_buffer_commit
+ * bufcnt : Number of bytes currently used by the BIP buffer
+ * ccb_count : Number of ccbs submitted via dax_ioctl_ccb_exec
+ * fail_count : Number of ccbs that failed the submission via dax_ioctl_ccb_exec
+ */
+struct dax_ctx {
+       struct task_struct              *owner;
+       struct list_head                ctx_list;
+       struct dax_mm                   *dax_mm;
+       union ccb                       *ccb_buf;
+       u64                             ccb_buf_ra;
+       /*
+        * Holds a struct page pointer for each locked page. Each VA-type
+        * address in a ccb needs an entry, and the AT_MAX dimension holds
+        * this quad of entries (dst/src0/src1/tbl) for each ccb.
+        */
+       struct page                     **pages[AT_MAX];
+       struct ccb_completion_area      *ca_buf;
+       u64                             ca_buf_ra;
+       u32                             ccb_buflen;
+       u32                             ccb_submit_maxlen;
+       u32                             ca_buflen;
+       /* BIP related variables */
+       u32                             a_start;
+       u32                             a_end;
+       u32                             b_end;
+       u32                             resv_start;
+       u32                             resv_end;
+       u32                             bufcnt;
+       u32                             ccb_count;
+       u32                             fail_count;
+};
+
+int dax_alloc_page_arrays(struct dax_ctx *ctx);
+void dax_dealloc_page_arrays(struct dax_ctx *ctx);
+void dax_unlock_pages_ccb(struct dax_ctx *ctx, int ccb_num, union ccb *ccbp,
+                         bool warn);
+void dax_prt_ccbs(union ccb *ccb, u64 len);
+bool dax_has_flow_ctl_numa(void);
+long dax_perfcount_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
+union ccb *dax_ccb_buffer_reserve(struct dax_ctx *ctx, size_t len,
+                                 size_t *reserved);
+void dax_ccb_buffer_commit(struct dax_ctx *ctx, size_t len);
+int dax_ccb_buffer_get_contig_ccbs(struct dax_ctx *ctx, int *len_ccb);
+void dax_ccb_buffer_decommit(struct dax_ctx *ctx, int n_ccb);
+int dax_devmap(struct file *f, struct vm_area_struct *vma);
+void dax_vm_open(struct vm_area_struct *vma);
+void dax_vm_close(struct vm_area_struct *vma);
+void dax_overflow_check(struct dax_ctx *ctx, int idx);
+int dax_clean_dm(struct dax_mm *dm);
+void dax_ccbs_drain(struct dax_ctx *ctx, struct dax_vma *dv);
+void dax_map_segment(struct dax_ctx *dax_ctx, union ccb *ccb,
+                    size_t ccb_len);
+int dax_lock_pages(struct dax_ctx *dax_ctx, union ccb *ccb,
+                         size_t ccb_len);
+void dax_unlock_pages(struct dax_ctx *dax_ctx, union ccb *ccb,
+                            size_t ccb_len);
+int dax_address_in_use(struct dax_vma *dv, u32 addr_type,
+                             unsigned long addr);
+void dax_debugfs_init(void);
+void dax_debugfs_clean(void);
+#endif /* _DAX_IMPL_H */
diff --git a/arch/sparc/dax/dax_main.c b/arch/sparc/dax/dax_main.c
new file mode 100644 (file)
index 0000000..3f637bd
--- /dev/null
@@ -0,0 +1,1102 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "dax_impl.h"
+
+int dax_ccb_wait_usec = DAX_CCB_WAIT_USEC;
+int dax_ccb_wait_retries_max = DAX_CCB_WAIT_RETRIES_MAX;
+LIST_HEAD(dax_mm_list);
+DEFINE_SPINLOCK(dm_list_lock);
+
+atomic_t dax_alloc_counter = ATOMIC_INIT(0);
+atomic_t dax_requested_mem = ATOMIC_INIT(0);
+
+int dax_debug;
+bool dax_no_flow_ctl;
+
+/* driver public entry points */
+static long dax_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
+static int dax_close(struct inode *i, struct file *f);
+
+/* internal */
+static struct dax_ctx *dax_ctx_alloc(void);
+static int dax_ioctl_ccb_thr_init(void *, struct file *);
+static int dax_ioctl_ccb_thr_fini(struct file *f);
+static int dax_ioctl_ccb_exec(void *, struct file *);
+static int dax_ioctl_ca_dequeue(void *, struct file *f);
+static int dax_validate_ca_dequeue_args(struct dax_ctx *,
+                                       struct dax_ca_dequeue_arg *);
+static int dax_ccb_hv_submit(struct dax_ctx *, union ccb *, size_t,
+                            struct dax_ccb_exec_arg *);
+static int dax_validate_ccb(union ccb *);
+static int dax_preprocess_usr_ccbs(struct dax_ctx *, union ccb *, size_t);
+static void dax_ctx_fini(struct dax_ctx *);
+static void dax_ctx_flush_decommit_ccbs(struct dax_ctx *);
+static int dax_ccb_flush_contig(struct dax_ctx *, int, int, bool);
+static void dax_ccb_wait(struct dax_ctx *, int);
+static void dax_state_destroy(struct file *f);
+
+static int dax_type;
+static long dax_version = DAX_DRIVER_VERSION;
+static u32 dax_hv_ccb_submit_maxlen;
+static dev_t first;
+static struct cdev c_dev;
+static struct class *cl;
+static int force;
+module_param(force, int, 0644);
+MODULE_PARM_DESC(force, "Forces module loading if no device present");
+module_param(dax_debug, int, 0644);
+MODULE_PARM_DESC(dax_debug, "Debug flags");
+
+static const struct file_operations dax_fops = {
+       .owner =    THIS_MODULE,
+       .mmap =     dax_devmap,
+       .release =  dax_close,
+       .unlocked_ioctl = dax_ioctl
+};
+
+static int hv_get_hwqueue_size(unsigned long *qsize)
+{
+       long dummy;
+
+       /* ccb = NULL, length = 0, Q type = query, VQ token = 0 */
+       return sun4v_dax_ccb_submit(0, 0, HV_DAX_QUERY_CMD, 0, qsize, &dummy);
+}
+
+static int __init dax_attach(void)
+{
+       unsigned long minor = DAX_MINOR;
+       unsigned long max_ccbs;
+       int ret = 0, found_dax = 0;
+       struct mdesc_handle *hp = mdesc_grab();
+       u64 pn;
+       char *msg;
+
+       if (hp == NULL) {
+               dax_err("Unable to grab mdesc");
+               return -ENODEV;
+       }
+
+       mdesc_for_each_node_by_name(hp, pn, "virtual-device") {
+               int len;
+               char *prop;
+
+               prop = (char *) mdesc_get_property(hp, pn, "name", &len);
+               if (prop == NULL)
+                       continue;
+               if (strncmp(prop, "dax", strlen("dax")))
+                       continue;
+               dax_dbg("Found node 0x%llx = %s",  pn, prop);
+
+               prop = (char *) mdesc_get_property(hp, pn, "compatible", &len);
+               if (prop == NULL)
+                       continue;
+               if (strncmp(prop, DAX1_STR, strlen(DAX1_STR)))
+                       continue;
+               dax_dbg("Found node 0x%llx = %s",  pn, prop);
+
+               if (!strncmp(prop, DAX1_FC_STR, strlen(DAX1_FC_STR))) {
+                       msg = "dax1-flow-control";
+                       dax_type = DAX1;
+               } else if (!strncmp(prop, DAX2_STR, strlen(DAX2_STR))) {
+                       msg = "dax2";
+                       dax_type = DAX2;
+               } else if (!strncmp(prop, DAX1_STR, strlen(DAX1_STR))) {
+                       msg = "dax1-no-flow-control";
+                       dax_no_flow_ctl = true;
+                       dax_type = DAX1;
+               } else {
+                       break;
+               }
+               found_dax = 1;
+               dax_dbg("MD indicates %s chip",  msg);
+               break;
+       }
+
+       if (found_dax == 0) {
+               dax_err("No DAX device found");
+               if ((force & FORCE_LOAD_ON_ERROR) == 0) {
+                       ret = -ENODEV;
+                       goto done;
+               }
+       }
+
+       dax_dbg("Registering DAX HV api with minor %ld", minor);
+       if (sun4v_hvapi_register(HV_GRP_M7_DAX, DAX_MAJOR, &minor)) {
+               dax_err("hvapi_register failed");
+               if ((force & FORCE_LOAD_ON_ERROR) == 0) {
+                       ret = -ENODEV;
+                       goto done;
+               }
+       } else {
+               dax_dbg("Max minor supported by HV = %ld", minor);
+               minor = min(minor, DAX_MINOR);
+               dax_dbg("registered DAX major %ld minor %ld ",
+                                DAX_MAJOR, minor);
+       }
+
+       ret = hv_get_hwqueue_size(&max_ccbs);
+       if (ret != 0) {
+               dax_err("get_hwqueue_size failed with status=%d and max_ccbs=%ld",
+                       ret, max_ccbs);
+               if (force & FORCE_LOAD_ON_ERROR) {
+                       max_ccbs = DAX_DEFAULT_MAX_CCB;
+               } else {
+                       ret = -ENODEV;
+                       goto done;
+               }
+       }
+
+       dax_hv_ccb_submit_maxlen = (u32)NCCB_TO_CCB_BYTE(max_ccbs);
+       if (max_ccbs == 0 || max_ccbs > U16_MAX) {
+               dax_err("Hypervisor reports nonsensical max_ccbs");
+               if ((force & FORCE_LOAD_ON_ERROR) == 0) {
+                       ret = -ENODEV;
+                       goto done;
+               }
+       }
+
+       /* Older M7 CPUs (pre-3.0) have a bug in the flow control feature.
+        * Since the MD does not report it in old versions of the HV, we
+        * need to check explicitly for the flow control feature.
+        */
+       if ((dax_type == DAX1) && !dax_has_flow_ctl_numa()) {
+               dax_dbg("Flow control disabled, dax_alloc restricted to 4M");
+               dax_no_flow_ctl = true;
+       } else {
+               dax_dbg("Flow control enabled");
+               dax_no_flow_ctl = false;
+       }
+
+       if (force & FORCE_LOAD_ON_NO_FLOW_CTL) {
+               dax_no_flow_ctl = !dax_no_flow_ctl;
+               dax_info("Force option %d. dax_no_flow_ctl %s",
+                        force, dax_no_flow_ctl ? "true" : "false");
+       }
+
+       if (alloc_chrdev_region(&first, 0, 1, "dax") < 0) {
+               ret = -ENXIO;
+               goto done;
+       }
+
+       cl = class_create(THIS_MODULE, "dax");
+       if (IS_ERR(cl)) {
+               dax_err("class_create failed");
+               ret = -ENXIO;
+               goto class_error;
+       }
+
+       if (IS_ERR(device_create(cl, NULL, first, NULL, "dax"))) {
+               dax_err("device_create failed");
+               ret = -ENXIO;
+               goto device_error;
+       }
+
+       cdev_init(&c_dev, &dax_fops);
+       if (cdev_add(&c_dev, first, 1) < 0) {
+               dax_err("cdev_add failed");
+               ret = -ENXIO;
+               goto cdev_error;
+       }
+
+       dax_debugfs_init();
+       dax_info("Attached DAX module");
+       goto done;
+
+cdev_error:
+       device_destroy(cl, first);
+device_error:
+       class_destroy(cl);
+class_error:
+       unregister_chrdev_region(first, 1);
+done:
+       mdesc_release(hp);
+       return ret;
+}
+
+static void __exit dax_detach(void)
+{
+       dax_info("Cleaning up DAX module");
+       if (!list_empty(&dax_mm_list))
+               dax_warn("dax_mm_list is not empty");
+       dax_info("dax_alloc_counter = %d",  atomic_read(&dax_alloc_counter));
+       dax_info("dax_requested_mem = %dk",  atomic_read(&dax_requested_mem));
+       cdev_del(&c_dev);
+       device_destroy(cl, first);
+       class_destroy(cl);
+       unregister_chrdev_region(first, 1);
+       dax_debugfs_clean();
+}
+module_init(dax_attach);
+module_exit(dax_detach);
+MODULE_LICENSE("GPL");
+
+/*
+ * Logic of opens, closes, threads, contexts:
+ *
+ * open()/close()
+ *
+ * A thread may open the dax device as many times as it likes, but
+ * each open must be bound to a separate thread before it can be used
+ * to submit a transaction.
+ *
+ * The DAXIOC_CCB_THR_INIT ioctl is called to create a context for the
+ * calling thread and bind it to the file descriptor associated with
+ * the ioctl. A thread must always use the fd to which it is bound.
+ * A thread cannot bind to more than one fd, and one fd cannot be
+ * bound to more than one thread.
+ *
+ * When a thread is finished, it should call the DAXIOC_CCB_THR_FINI
+ * ioctl to inform us that its context is no longer needed. This is
+ * optional since close() will have the same effect for the context
+ * associated with the fd being closed. However, if the thread dies
+ * with its context still associated with the fd, then the fd cannot
+ * ever be used again by another thread.
+ *
+ * The DAXIOC_CA_DEQUEUE ioctl informs the driver that one or more
+ * (contiguous) chunks of completion area buffers are no longer needed
+ * and can be reused.
+ *
+ * The DAXIOC_CCB_EXEC ioctl submits a coprocessor transaction using the
+ * calling thread's context, which must be the context bound to the fd
+ * on which the ioctl is issued.
+ */
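+
+/*
+ * Illustrative only (not part of the driver): a minimal user-space sketch
+ * of the flow above.  It assumes the DAXIOC_* commands and argument
+ * structs from sys_dax.h, the /dev/dax node created by this driver, and
+ * CCBs already built by libdax; ccbs, ccb_bytes and ca_bytes are
+ * hypothetical, and error handling is omitted.
+ *
+ *     int fd = open("/dev/dax", O_RDWR);
+ *
+ *     struct dax_ccb_thr_init_arg init = { 0 };
+ *     ioctl(fd, DAXIOC_CCB_THR_INIT, &init);  // bind this thread to fd
+ *
+ *     // completion areas, read-only
+ *     void *ca = mmap(NULL, init.dcti_compl_maplen, PROT_READ,
+ *                     MAP_SHARED, fd, init.dcti_compl_mapoff);
+ *
+ *     struct dax_ccb_exec_arg exec = {
+ *             .dce_ccb_buf_addr = ccbs,       // built by libdax
+ *             .dce_ccb_buf_len  = ccb_bytes,
+ *     };
+ *     ioctl(fd, DAXIOC_CCB_EXEC, &exec);      // submit; poll ca for status
+ *
+ *     struct dax_ca_dequeue_arg dq = {
+ *             .dcd_len_requested = ca_bytes,  // multiple of the CA size
+ *     };
+ *     ioctl(fd, DAXIOC_CA_DEQUEUE, &dq);      // retire completed CAs
+ *
+ *     ioctl(fd, DAXIOC_CCB_THR_FINI);         // optional; close() suffices
+ *     close(fd);
+ */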
+
+static int dax_close(struct inode *i, struct file *f)
+{
+       dax_state_destroy(f);
+       return 0;
+}
+
+static long dax_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+       dax_dbg("cmd=0x%x, f=%p, priv=%p",  cmd, f, f->private_data);
+       switch (cmd) {
+       case DAXIOC_CCB_THR_INIT:
+               return dax_ioctl_ccb_thr_init((void *)arg, f);
+       case DAXIOC_CCB_THR_FINI:
+               return dax_ioctl_ccb_thr_fini(f);
+       case DAXIOC_CA_DEQUEUE:
+               return dax_ioctl_ca_dequeue((void *)arg, f);
+       case DAXIOC_CCB_EXEC:
+               return dax_ioctl_ccb_exec((void *)arg, f);
+       case DAXIOC_VERSION:
+               if (copy_to_user((void __user *)arg, &dax_version,
+                                sizeof(dax_version)))
+                       return -EFAULT;
+               return 0;
+       case DAXIOC_DEP_1:
+       case DAXIOC_DEP_3:
+       case DAXIOC_DEP_4:
+               dax_err("Old version of libdax in use. Please update");
+               return -ENOTTY;
+       default:
+               return dax_perfcount_ioctl(f, cmd, arg);
+       }
+}
+
+static void dax_state_destroy(struct file *f)
+{
+       struct dax_ctx *ctx = (struct dax_ctx *) f->private_data;
+
+       if (ctx != NULL) {
+               dax_ctx_flush_decommit_ccbs(ctx);
+               f->private_data = NULL;
+               dax_ctx_fini(ctx);
+       }
+}
+
+static int dax_ioctl_ccb_thr_init(void *arg, struct file *f)
+{
+       struct dax_ccb_thr_init_arg usr_args;
+       struct dax_ctx *ctx;
+
+       ctx = (struct dax_ctx *) f->private_data;
+
+       /* Only one thread per open can create a context */
+       if (ctx != NULL) {
+               if (ctx->owner != current) {
+                       dax_err("This open already has an associated thread");
+                       return -EUSERS;
+               }
+               dax_err("duplicate CCB_THR_INIT ioctl");
+               return -EINVAL;
+       }
+
+       if (copy_from_user(&usr_args, (void __user *)arg, sizeof(usr_args))) {
+               dax_err("invalid user args\n");
+               return -EFAULT;
+       }
+
+       dax_dbg("pid=%d, ccb_maxlen = %d",  current->pid,
+               usr_args.dcti_ccb_buf_maxlen);
+
+       usr_args.dcti_compl_maplen = DAX_MMAP_SZ;
+       usr_args.dcti_compl_mapoff = DAX_MMAP_OFF;
+       usr_args.dcti_ccb_buf_maxlen = dax_hv_ccb_submit_maxlen;
+
+       if (copy_to_user((void __user *)arg, &usr_args,
+                        sizeof(usr_args))) {
+               dax_err("copyout dax_ccb_thr_init_arg failed");
+               return -EFAULT;
+       }
+
+       ctx = dax_ctx_alloc();
+
+       if (ctx == NULL) {
+               dax_err("dax_ctx_alloc failed.");
+               return -ENOMEM;
+       }
+       ctx->owner = current;
+       f->private_data = ctx;
+       return 0;
+}
+
+static int dax_ioctl_ccb_thr_fini(struct file *f)
+{
+       struct dax_ctx *ctx = (struct dax_ctx *) f->private_data;
+
+       if (ctx == NULL) {
+               dax_err("CCB_THR_FINI ioctl called without previous CCB_THR_INIT ioctl");
+               return -EINVAL;
+       }
+
+       if (ctx->owner != current) {
+               dax_err("CCB_THR_FINI ioctl called from wrong thread");
+               return -EINVAL;
+       }
+
+       dax_state_destroy(f);
+
+       return 0;
+}
+
+static int dax_ioctl_ca_dequeue(void *arg, struct file *f)
+{
+       struct dax_ctx *dax_ctx = (struct dax_ctx *) f->private_data;
+       struct dax_ca_dequeue_arg usr_args;
+       int n_remain, n_avail, n_dq;
+       int start_idx, end_idx;
+       int rv = 0;
+       int i;
+
+       if (dax_ctx == NULL) {
+               dax_err("CCB_INIT ioctl not previously called");
+               rv = -ENOENT;
+               goto ca_dequeue_error;
+       }
+
+       if (dax_ctx->owner != current) {
+               dax_err("wrong thread");
+               rv = -EUSERS;
+               goto ca_dequeue_error;
+       }
+
+       if (copy_from_user(&usr_args, (void __user *)arg, sizeof(usr_args))) {
+               rv = -EFAULT;
+               goto ca_dequeue_error;
+       }
+
+       dax_dbg("dcd_len_requested=%d", usr_args.dcd_len_requested);
+
+       if (dax_validate_ca_dequeue_args(dax_ctx, &usr_args)) {
+               rv = -EINVAL;
+               goto ca_dequeue_end;
+       }
+
+       /* The user length has been validated.  If the kernel queue is empty,
+        * return EINVAL.  Else, check that each CCB CA has completed in HW.
+        * If any CCB CA has not completed, return EBUSY.
+        *
+        * The user expects the length to be dequeued in terms of CAs,
+        * starting from the last dequeued CA.  The driver itself tracks
+        * them in terms of CCBs.
+        */
+       n_remain = CA_BYTE_TO_NCCB(usr_args.dcd_len_requested);
+       dax_dbg("number of CCBs to dequeue = %d", n_remain);
+       usr_args.dcd_len_dequeued = 0;
+
+       for (i = 0; i < DAX_BIP_MAX_CONTIG_BLOCKS && n_remain > 0; i++) {
+               start_idx = dax_ccb_buffer_get_contig_ccbs(dax_ctx, &n_avail);
+
+               dax_dbg("%d number of contig CCBs available starting from idx = %d",
+                        n_avail, start_idx);
+               if (start_idx < 0 || n_avail == 0) {
+                       dax_err("cannot get contiguous buffer start = %d, n_avail = %d",
+                                start_idx, n_avail);
+                       rv = -EIO;
+                       goto ca_dequeue_end;
+               }
+
+               n_dq = min(n_remain, n_avail);
+               end_idx = start_idx + n_dq;
+
+               if (dax_ccb_flush_contig(dax_ctx, start_idx, end_idx, false)) {
+                       rv = -EBUSY;
+                       goto ca_dequeue_end;
+               }
+
+               /* Free buffer. Update accounting. */
+               dax_ccb_buffer_decommit(dax_ctx, n_dq);
+
+               usr_args.dcd_len_dequeued += NCCB_TO_CA_BYTE(n_dq);
+               n_remain -= n_dq;
+
+               if (n_remain > 0)
+                       dax_dbg("checking additional ccb_buffer contig block, n_remain=%d",
+                                n_remain);
+       }
+
+ca_dequeue_end:
+       dax_dbg("copyout CA's dequeued in bytes =%d",
+               usr_args.dcd_len_dequeued);
+
+       if (copy_to_user((void __user *)arg, &usr_args, sizeof(usr_args))) {
+               dax_err("copyout dax_ca_dequeue_arg failed");
+               rv = -EFAULT;
+               goto ca_dequeue_error;
+       }
+
+ca_dequeue_error:
+       return rv;
+}
+
+static int dax_validate_ca_dequeue_args(struct dax_ctx *dax_ctx,
+                            struct dax_ca_dequeue_arg *usr_args)
+{
+       /* requested len must be multiple of completion area size */
+       if ((usr_args->dcd_len_requested % sizeof(struct ccb_completion_area))
+           != 0) {
+               dax_err("dequeue len (%d) not a muliple of %ldB",
+                        usr_args->dcd_len_requested,
+                        sizeof(struct ccb_completion_area));
+               return -1;
+       }
+
+       /* and not more than current buffer entry count */
+       if (CA_BYTE_TO_NCCB(usr_args->dcd_len_requested) >
+                           CCB_BYTE_TO_NCCB(dax_ctx->bufcnt)) {
+               dax_err("dequeue len (%d bytes, %ld CAs) more than current CA buffer count (%ld CAs)",
+                       usr_args->dcd_len_requested,
+                       CA_BYTE_TO_NCCB(usr_args->dcd_len_requested),
+                       CCB_BYTE_TO_NCCB(dax_ctx->bufcnt));
+               return -1;
+       }
+
+       /* reject zero length */
+       if (usr_args->dcd_len_requested == 0)
+               return -1;
+
+       return 0;
+}
+
+static struct dax_ctx *
+dax_ctx_alloc(void)
+{
+       struct dax_ctx *dax_ctx;
+       struct dax_mm *dm = NULL;
+       struct list_head *p;
+
+       dax_ctx = kzalloc(sizeof(struct dax_ctx), GFP_KERNEL);
+       if (dax_ctx == NULL)
+               goto done;
+
+       BUILD_BUG_ON(((DAX_CCB_BUF_SZ) & ((DAX_CCB_BUF_SZ) - 1)) != 0);
+       /* allocate CCB buffer */
+       dax_ctx->ccb_buf = kmalloc(DAX_CCB_BUF_SZ, GFP_KERNEL);
+       if (dax_ctx->ccb_buf == NULL)
+               goto ccb_buf_error;
+
+       dax_ctx->ccb_buf_ra = virt_to_phys(dax_ctx->ccb_buf);
+       dax_ctx->ccb_buflen = DAX_CCB_BUF_SZ;
+       dax_ctx->ccb_submit_maxlen = dax_hv_ccb_submit_maxlen;
+
+       dax_dbg("dax_ctx->ccb_buf=0x%p, ccb_buf_ra=0x%llx, ccb_buflen=%d",
+               (void *)dax_ctx->ccb_buf, dax_ctx->ccb_buf_ra,
+               dax_ctx->ccb_buflen);
+
+       BUILD_BUG_ON(((DAX_CA_BUF_SZ) & ((DAX_CA_BUF_SZ) - 1)) != 0);
+       /* allocate CCB completion area buffer */
+       dax_ctx->ca_buf = kzalloc(DAX_CA_BUF_SZ, GFP_KERNEL);
+       if (dax_ctx->ca_buf == NULL)
+               goto ca_buf_error;
+
+       dax_ctx->ca_buflen = DAX_CA_BUF_SZ;
+       dax_ctx->ca_buf_ra = virt_to_phys(dax_ctx->ca_buf);
+       dax_dbg("allocated 0x%x bytes for ca_buf", dax_ctx->ca_buflen);
+
+       /* allocate page array */
+       if (dax_alloc_page_arrays(dax_ctx))
+               goto ctx_pages_error;
+
+       /* initialize buffer accounting */
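+       /* (bip buffer: region A is [a_start, a_end), region B wraps at
+        * [0, b_end); resv_* track space reserved but not yet committed)
+        */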
+       dax_ctx->a_start = 0;
+       dax_ctx->a_end = 0;
+       dax_ctx->b_end = 0;
+       dax_ctx->resv_start = 0;
+       dax_ctx->resv_end = 0;
+       dax_ctx->bufcnt = 0;
+       dax_ctx->ccb_count = 0;
+       dax_ctx->fail_count = 0;
+
+       dax_dbg("dax_ctx=0x%p, dax_ctx->ca_buf=0x%p, ca_buf_ra=0x%llx, ca_buflen=%d",
+               (void *)dax_ctx, (void *)dax_ctx->ca_buf,
+               dax_ctx->ca_buf_ra, dax_ctx->ca_buflen);
+
+       /* look for existing mm context */
+       spin_lock(&dm_list_lock);
+       list_for_each(p, &dax_mm_list) {
+               dm = list_entry(p, struct dax_mm, mm_list);
+               if (dm->this_mm == current->mm) {
+                       dax_ctx->dax_mm = dm;
+                       dax_map_dbg("existing dax_mm found: %p", dm);
+                       break;
+               }
+       }
+
+       /* did not find an existing one, must create it */
+       if (dax_ctx->dax_mm == NULL) {
+               dm = kmalloc(sizeof(*dm), GFP_KERNEL);
+               if (dm == NULL) {
+                       spin_unlock(&dm_list_lock);
+                       goto dm_error;
+               }
+
+               INIT_LIST_HEAD(&dm->mm_list);
+               INIT_LIST_HEAD(&dm->ctx_list);
+               spin_lock_init(&dm->lock);
+               dm->this_mm = current->mm;
+               dm->vma_count = 0;
+               dm->ctx_count = 0;
+               list_add(&dm->mm_list, &dax_mm_list);
+               dax_ctx->dax_mm = dm;
+               dax_map_dbg("no dax_mm found, creating and adding to dax_mm_list: %p",
+                           dm);
+       }
+       spin_unlock(&dm_list_lock);
+       /* now add this ctx to the list of threads for this mm context */
+       INIT_LIST_HEAD(&dax_ctx->ctx_list);
+       spin_lock(&dm->lock);
+       list_add(&dax_ctx->ctx_list, &dax_ctx->dax_mm->ctx_list);
+       dax_ctx->dax_mm->ctx_count++;
+       spin_unlock(&dm->lock);
+
+       dax_dbg("allocated ctx %p", dax_ctx);
+       goto done;
+
+dm_error:
+       dax_dealloc_page_arrays(dax_ctx);
+ctx_pages_error:
+       kfree(dax_ctx->ca_buf);
+ca_buf_error:
+       kfree(dax_ctx->ccb_buf);
+ccb_buf_error:
+       kfree(dax_ctx);
+       dax_ctx = NULL;
+done:
+       return dax_ctx;
+}
+
+static void dax_ctx_fini(struct dax_ctx *ctx)
+{
+       int i, j;
+       struct dax_mm *dm;
+
+       kfree(ctx->ccb_buf);
+       ctx->ccb_buf = NULL;
+
+       kfree(ctx->ca_buf);
+       ctx->ca_buf = NULL;
+
+       for (i = 0; i < DAX_CCB_BUF_NELEMS; i++)
+               for (j = 0; j < AT_MAX; j++)
+                       if (ctx->pages[j][i] != NULL)
+                               dax_err("still not freed pages[%d][%d] = %p",
+                                       j, i, ctx->pages[j][i]);
+
+       dax_dealloc_page_arrays(ctx);
+
+       dm = ctx->dax_mm;
+       if (dm == NULL) {
+               dax_err("dm is NULL");
+       } else {
+               spin_lock(&dm->lock);
+               list_del(&ctx->ctx_list);
+               /*
+                * dax_clean_dm() frees dm on success, in which case dm->lock
+                * must not be unlocked; only unlock when dm survives.
+                */
+               if (dax_clean_dm(dm))
+                       spin_unlock(&dm->lock);
+       }
+
+       dax_drv_dbg("CCB count: %d good, %d failed", ctx->ccb_count,
+                   ctx->fail_count);
+       kfree(ctx);
+}
+
+static int dax_validate_ccb(union ccb *ccb)
+{
+       struct ccb_hdr *hdr = CCB_HDR(ccb);
+       int ret = -EINVAL;
+
+       /*
+        * The user is not allowed to specify real address types
+        * in the CCB header.  This must be enforced by the kernel
+        * before submitting the CCBs to HV.
+        *
+        * The allowed values are:
+        *      hdr->at_dst     VA/IMM only
+        *      hdr->at_src0    VA/IMM only
+        *      hdr->at_src1    VA/IMM only
+        *      hdr->at_tbl     VA/IMM only
+        *
+        * Note: IMM is only valid for certain opcodes, but the kernel is not
+        * validating at this level of granularity.  The HW will flag invalid
+        * address types.  The required check is that the user must not be
+        * allowed to specify real address types.
+        */
+
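+       /* each DAX_VALIDATE_AT() jumps to the done label, leaving
+        * ret == -EINVAL, when the address type is not an allowed value
+        */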
+       DAX_VALIDATE_AT(hdr, dst, done);
+       DAX_VALIDATE_AT(hdr, src0, done);
+       DAX_VALIDATE_AT(hdr, src1, done);
+       DAX_VALIDATE_AT(hdr, tbl, done);
+       ret = 0;
+done:
+       return ret;
+}
+
+void dax_prt_ccbs(union ccb *ccb, u64 len)
+{
+       int nelem = CCB_BYTE_TO_NCCB(len);
+       int i, j;
+
+       dax_dbg("ccb buffer (processed):");
+       for (i = 0; i < nelem; i++) {
+               dax_dbg("%sccb[%d]", IS_LONG_CCB(&ccb[i]) ? "long " : "",  i);
+               for (j = 0; j < DWORDS_PER_CCB; j++)
+                       dax_dbg("\tccb[%d].dwords[%d]=0x%llx",
+                               i, j, ccb[i].dwords[j]);
+       }
+}
+
+static int dax_ioctl_ccb_exec(void *arg, struct file *f)
+{
+       struct dax_ccb_exec_arg usr_args;
+       struct dax_ctx *dax_ctx = (struct dax_ctx *) f->private_data;
+       union ccb *ccb_buf;
+       size_t nreserved;
+       int rv, hv_rv;
+
+       if (dax_ctx == NULL) {
+               dax_err("CCB_INIT ioctl not previously called");
+               return -ENOENT;
+       }
+
+       if (dax_ctx->owner != current) {
+               dax_err("wrong thread");
+               return -EUSERS;
+       }
+
+       if (dax_ctx->dax_mm == NULL) {
+               dax_err("dax_ctx initialized incorrectly");
+               return -ENOENT;
+       }
+
+       if (copy_from_user(&usr_args, (void __user *)arg, sizeof(usr_args))) {
+               dax_err("copyin of user args failed");
+               return -EFAULT;
+       }
+
+       if (usr_args.dce_ccb_buf_len > dax_hv_ccb_submit_maxlen ||
+           (usr_args.dce_ccb_buf_len % sizeof(union ccb)) != 0 ||
+           usr_args.dce_ccb_buf_len == 0) {
+               dax_err("invalid usr_args.dce_ccb_len(%d)",
+                       usr_args.dce_ccb_buf_len);
+               return -ERANGE;
+       }
+
+       dax_dbg("args: ccb_buf_len=%d, buf_addr=%p",
+               usr_args.dce_ccb_buf_len, usr_args.dce_ccb_buf_addr);
+
+       /* Check for available buffer space. */
+       ccb_buf = dax_ccb_buffer_reserve(dax_ctx, usr_args.dce_ccb_buf_len,
+                                        &nreserved);
+       dax_dbg("reserved address %p for ccb_buf", ccb_buf);
+
+       /*
+        * We don't attempt a partial submission since that would require extra
+        * logic to not split a long CCB at the end.  This would be an
+        * enhancement.
+        */
+       if (ccb_buf == NULL || nreserved != usr_args.dce_ccb_buf_len) {
+               dax_err("insufficient kernel CCB resources: user needs to free completion area space and retry");
+               return -ENOBUFS;
+       }
+
+       /*
+        * Copy user CCBs.  The entire user buffer is copied first; its
+        * contents are validated below in dax_preprocess_usr_ccbs().
+        */
+       if (copy_from_user(ccb_buf, (void __user *)usr_args.dce_ccb_buf_addr,
+                          usr_args.dce_ccb_buf_len)) {
+               dax_err("copyin of user CCB buffer failed");
+               return -EFAULT;
+       }
+
+       rv = dax_preprocess_usr_ccbs(dax_ctx, ccb_buf,
+                                    usr_args.dce_ccb_buf_len);
+
+       if (rv != 0)
+               return rv;
+
+       dax_map_segment(dax_ctx, ccb_buf, usr_args.dce_ccb_buf_len);
+
+       rv = dax_lock_pages(dax_ctx, ccb_buf, usr_args.dce_ccb_buf_len);
+       if (rv != 0)
+               return rv;
+
+       hv_rv = dax_ccb_hv_submit(dax_ctx, ccb_buf, usr_args.dce_ccb_buf_len,
+                                 &usr_args);
+
+       /* Update based on actual number of submitted CCBs. */
+       if (hv_rv == 0) {
+               dax_ccb_buffer_commit(dax_ctx,
+                                     usr_args.dce_submitted_ccb_buf_len);
+               dax_ctx->ccb_count++;
+       } else {
+               dax_ctx->fail_count++;
+               dax_dbg("submit failed, status=%d, nomap=0x%llx",
+                        usr_args.dce_ccb_status, usr_args.dce_nomap_va);
+               dax_unlock_pages(dax_ctx, ccb_buf, usr_args.dce_ccb_buf_len);
+       }
+
+       dax_dbg("copyout dce_submitted_ccb_buf_len=%d, dce_ca_region_off=%lld, dce_ccb_status=%d",
+               usr_args.dce_submitted_ccb_buf_len, usr_args.dce_ca_region_off,
+               usr_args.dce_ccb_status);
+
+       if (copy_to_user((void __user *)arg, &usr_args, sizeof(usr_args))) {
+               dax_err("copyout of dax_ccb_exec_arg failed");
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+/*
+ * Validates user CCB content.  Also sets completion address and address types
+ * for all addresses contained in CCB.
+ */
+static int dax_preprocess_usr_ccbs(struct dax_ctx *dax_ctx, union ccb *ccb,
+                                  size_t ccb_len)
+{
+       int i;
+       int nelem = CCB_BYTE_TO_NCCB(ccb_len);
+
+       for (i = 0; i < nelem; i++) {
+               struct ccb_hdr *hdr = CCB_HDR(&ccb[i]);
+               u32 idx;
+               ptrdiff_t ca_offset;
+
+               /* enforce validation checks */
+               if (dax_validate_ccb(&ccb[i])) {
+                       dax_dbg("ccb[%d] invalid ccb", i);
+                       return -ENOKEY;
+               }
+
+               /* change all virtual address types to virtual alternate */
+               if (hdr->at_src0 == CCB_AT_VA)
+                       hdr->at_src0 = CCB_AT_VA_ALT;
+               if (hdr->at_src1 == CCB_AT_VA)
+                       hdr->at_src1 = CCB_AT_VA_ALT;
+               if (hdr->at_dst == CCB_AT_VA)
+                       hdr->at_dst = CCB_AT_VA_ALT;
+               if (hdr->at_tbl == CCB_AT_VA)
+                       hdr->at_tbl = CCB_AT_VA_ALT;
+
+               /* set completion (real) address and address type */
+               hdr->at_cmpl = CCB_AT_RA;
+
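+               /* CCB slot i in ccb_buf pairs 1:1 with CA slot i in ca_buf */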
+               idx = &ccb[i] - dax_ctx->ccb_buf;
+               ca_offset = (uintptr_t)&dax_ctx->ca_buf[idx] -
+                               (uintptr_t)dax_ctx->ca_buf;
+
+               dax_dbg("ccb[%d]=0x%p, ccb_buf=0x%p, idx=%d, ca_offset=0x%lx, ca_buf_ra=0x%llx",
+                       i, (void *)&ccb[i], (void *)dax_ctx->ccb_buf, idx,
+                       ca_offset, dax_ctx->ca_buf_ra);
+
+               dax_dbg("ccb[%d] setting completion RA=0x%llx",
+                       i, dax_ctx->ca_buf_ra + ca_offset);
+
+               CCB_SET_COMPL_PA(dax_ctx->ca_buf_ra + ca_offset,
+                   ccb[i].dwords[CCB_DWORD_COMPL]);
+               memset((void *)((unsigned long)dax_ctx->ca_buf + ca_offset),
+                      0, sizeof(struct ccb_completion_area));
+
+               /* skip over 2nd 64 bytes of long CCB */
+               if (IS_LONG_CCB(&ccb[i]))
+                       i++;
+       }
+
+       return 0;
+}
+
+static int dax_ccb_hv_submit(struct dax_ctx *dax_ctx, union ccb *ccb_buf,
+                            size_t buflen, struct dax_ccb_exec_arg *exec_arg)
+{
+       unsigned long submitted_ccb_buf_len = 0;
+       unsigned long nomap_va = 0;
+       unsigned long hv_rv = HV_ENOMAP;
+       int rv = -EIO;
+       ptrdiff_t offset;
+
+       offset = (uintptr_t)ccb_buf - (uintptr_t)dax_ctx->ccb_buf;
+
+       dax_dbg("ccb_buf=0x%p, buflen=%ld, offset=0x%lx, ccb_buf_ra=0x%llx ",
+               (void *)ccb_buf, buflen, offset,
+               dax_ctx->ccb_buf_ra + offset);
+
+       if (dax_debug & DAX_DBG_FLG_BASIC)
+               dax_prt_ccbs(ccb_buf, buflen);
+
+       /* hypercall */
+       hv_rv = sun4v_dax_ccb_submit((void *) dax_ctx->ccb_buf_ra +
+                                    offset, buflen,
+                                    HV_DAX_QUERY_CMD |
+                                    HV_DAX_CCB_VA_SECONDARY, 0,
+                                    &submitted_ccb_buf_len, &nomap_va);
+
+       if (dax_debug & DAX_DBG_FLG_BASIC)
+               dax_prt_ccbs(ccb_buf, buflen);
+
+       exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_INTERNAL;
+       exec_arg->dce_submitted_ccb_buf_len = 0;
+       exec_arg->dce_ca_region_off = 0;
+
+       dax_dbg("hcall rv=%ld, submitted_ccb_buf_len=%ld, nomap_va=0x%lx",
+               hv_rv, submitted_ccb_buf_len, nomap_va);
+
+       if (submitted_ccb_buf_len % sizeof(union ccb) != 0) {
+               dax_err("submitted_ccb_buf_len %ld not multiple of ccb size %ld",
+                       submitted_ccb_buf_len, sizeof(union ccb));
+               return rv;
+       }
+
+       switch (hv_rv) {
+       case HV_EOK:
+               /*
+                * Hcall succeeded with no errors but the submitted length may
+                * be less than the requested length.  The only way the kernel
+                * can resubmit the remainder is to wait for completion of the
+                * submitted CCBs since there is no way to guarantee the
+                * ordering semantics required by the client applications.
+                * Therefore we let the user library deal with retransmissions.
+                */
+               rv = 0;
+               exec_arg->dce_ccb_status = DAX_SUBMIT_OK;
+               exec_arg->dce_submitted_ccb_buf_len = submitted_ccb_buf_len;
+               exec_arg->dce_ca_region_off =
+                       NCCB_TO_CA_BYTE(CCB_BYTE_TO_NCCB(offset));
+               break;
+       case HV_EWOULDBLOCK:
+               /*
+                * This is a transient HV API error that we may eventually want
+                * to hide from the user. For now return
+                * DAX_SUBMIT_ERR_WOULDBLOCK and let the user library retry.
+                */
+               dax_err("hcall returned HV_EWOULDBLOCK");
+               exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_WOULDBLOCK;
+               break;
+       case HV_ENOMAP:
+               /*
+                * HV was unable to translate a VA.  The VA it could not
+                * translate is returned in the nomap_va param.
+                */
+               dax_err("hcall returned HV_ENOMAP nomap_va=0x%lx with %d retries",
+                       nomap_va, DAX_NOMAP_RETRIES);
+               exec_arg->dce_nomap_va = nomap_va;
+               exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_NOMAP;
+               break;
+       case HV_EINVAL:
+               /*
+                * This is the result of an invalid user CCB as HV is validating
+                * some of the user CCB fields.  Pass this error back to the
+                * user. There is no supporting info to isolate the invalid
+                * field.
+                */
+               dax_err("hcall returned HV_EINVAL");
+               exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_CCB_INVAL;
+               break;
+       case HV_ENOACCESS:
+               /*
+                * HV found a VA that did not have the appropriate permissions
+                * (such as the w bit). The VA in question is returned in
+                * nomap_va param, but there is no specific indication which
+                * CCB had the error.  There is no remedy for the kernel to
+                * correct the failure, so return an appropriate error to the
+                * user.
+                */
+               dax_err("hcall returned HV_ENOACCESS");
+               exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_NOACCESS;
+               exec_arg->dce_nomap_va = nomap_va;
+               break;
+       case HV_EUNAVAILABLE:
+               /*
+                * The requested CCB operation could not be performed at this
+                * time. The restricted operation availability may apply only
+                * to the first unsuccessfully submitted CCB, or may apply to a
+                * larger scope.
+                */
+               dax_err("hcall returned HV_EUNAVAILABLE");
+               exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_UNAVAIL;
+               break;
+       default:
+               exec_arg->dce_ccb_status = DAX_SUBMIT_ERR_INTERNAL;
+               dax_err("unknown hcall return value (%ld)", hv_rv);
+               break;
+       }
+
+       return rv;
+}
+
+/*
+ * Wait for all CCBs to complete and remove from CCB buffer.
+ */
+static void dax_ctx_flush_decommit_ccbs(struct dax_ctx *dax_ctx)
+{
+       int n_contig_ccbs;
+
+       dax_dbg("");
+
+       /* Wait for all CCBs to complete.  Do not remove from CCB buffer */
+       dax_ccb_flush_contig(dax_ctx, CCB_BYTE_TO_NCCB(dax_ctx->a_start),
+                            CCB_BYTE_TO_NCCB(dax_ctx->a_end), true);
+
+       if (dax_ctx->b_end > 0)
+               dax_ccb_flush_contig(dax_ctx, 0,
+                                    CCB_BYTE_TO_NCCB(dax_ctx->b_end),
+                                    true);
+
+       /* decommit all */
+       while (dax_ccb_buffer_get_contig_ccbs(dax_ctx, &n_contig_ccbs) >= 0) {
+               if (n_contig_ccbs == 0)
+                       break;
+               dax_ccb_buffer_decommit(dax_ctx, n_contig_ccbs);
+       }
+}
+
+static int dax_ccb_flush_contig(struct dax_ctx *dax_ctx, int start_idx,
+                               int end_idx, bool wait)
+{
+       int i;
+
+       dax_dbg("start_idx=%d, end_idx=%d", start_idx, end_idx);
+
+       for (i = start_idx; i < end_idx; i++) {
+               u8 status;
+               union ccb *ccb = &dax_ctx->ccb_buf[i];
+
+               if (wait) {
+                       dax_ccb_wait(dax_ctx, i);
+               } else {
+                       status = dax_ctx->ca_buf[i].cmd_status;
+
+                       if (status == CCB_CMD_STAT_NOT_COMPLETED) {
+                               dax_err("CCB completion area status == CCB_CMD_STAT_NOT_COMPLETED: fail request to free completion index=%d",
+                                       i);
+                               return -EBUSY;
+                       }
+               }
+
+               dax_overflow_check(dax_ctx, i);
+               /* free any locked pages associated with this ccb */
+               dax_unlock_pages_ccb(dax_ctx, i, ccb, true);
+
+               if (IS_LONG_CCB(ccb)) {
+                       /*
+                        * Validate that the user must dequeue 2 CAs for a long
+                        * CCB.  In other words, the last entry in a contig
+                        * block cannot be a long CCB.
+                        */
+                       if (i == end_idx - 1) {
+                               dax_err("invalid attempt to dequeue single CA for long CCB, index=%d",
+                                       i);
+                               return -EINVAL;
+                       }
+                       /* skip over 64B data of long CCB */
+                       i++;
+               }
+       }
+       return 0;
+}
+
+static void dax_ccb_wait(struct dax_ctx *dax_ctx, int idx)
+{
+       int nretries = 0;
+
+       dax_dbg("idx=%d", idx);
+
+       while (dax_ctx->ca_buf[idx].cmd_status == CCB_CMD_STAT_NOT_COMPLETED) {
+               udelay(dax_ccb_wait_usec);
+
+               if (++nretries >= dax_ccb_wait_retries_max) {
+                       dax_alert("dax_ctx (0x%p): CCB[%d] did not complete (timed out, wait usec=%d retries=%d). CCB kill will be attempted in future version",
+                               (void *)dax_ctx, idx, dax_ccb_wait_usec,
+                                 dax_ccb_wait_retries_max);
+                       return;
+               }
+       }
+}
+
+static void dax_ccb_drain(struct dax_ctx *ctx, int idx, struct dax_vma *dv)
+{
+       union ccb *ccb;
+       struct ccb_hdr *hdr;
+
+       if (ctx->ca_buf[idx].cmd_status != CCB_CMD_STAT_NOT_COMPLETED)
+               return;
+
+       ccb = &ctx->ccb_buf[idx];
+       hdr = CCB_HDR(ccb);
+
+       if (dax_address_in_use(dv, hdr->at_dst,
+                              ccb->dwords[QUERY_DWORD_OUTPUT])
+               || dax_address_in_use(dv, hdr->at_src0,
+                                     ccb->dwords[QUERY_DWORD_INPUT])
+               || dax_address_in_use(dv, hdr->at_src1,
+                                     ccb->dwords[QUERY_DWORD_SEC_INPUT])
+               || dax_address_in_use(dv, hdr->at_tbl,
+                                     ccb->dwords[QUERY_DWORD_TBL])) {
+               dax_ccb_wait(ctx, idx);
+       }
+}
+
+static void dax_ccbs_drain_contig(struct dax_ctx *ctx, struct dax_vma *dv,
+                                 int start_bytes, int end_bytes)
+{
+       int start_idx = CCB_BYTE_TO_NCCB(start_bytes);
+       int end_idx = CCB_BYTE_TO_NCCB(end_bytes);
+       int i;
+
+       dax_dbg("start_idx=%d, end_idx=%d", start_idx, end_idx);
+
+       for (i = start_idx; i < end_idx; i++) {
+               dax_ccb_drain(ctx, i, dv);
+               if (IS_LONG_CCB(&ctx->ccb_buf[i])) {
+                       /* skip over 64B data of long CCB */
+                       i++;
+               }
+       }
+}
+
+void dax_ccbs_drain(struct dax_ctx *ctx, struct dax_vma *dv)
+{
+       dax_ccbs_drain_contig(ctx, dv, ctx->a_start, ctx->a_end);
+       if (ctx->b_end > 0)
+               dax_ccbs_drain_contig(ctx, dv, 0, ctx->b_end);
+}
diff --git a/arch/sparc/dax/dax_misc.c b/arch/sparc/dax/dax_misc.c
new file mode 100644 (file)
index 0000000..878c88f
--- /dev/null
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "dax_impl.h"
+
+static atomic_t has_flow_ctl = ATOMIC_INIT(0);
+static atomic_t response_count = ATOMIC_INIT(0);
+
+static int dax_has_flow_ctl_one_node(void)
+{
+       struct ccb_extract *ccb;
+       struct ccb_completion_area *ca;
+       char *mem, *dax_input, *dax_output;
+       unsigned long submitted_ccb_buf_len, nomap_va, hv_rv, ra, va;
+       long timeout;
+       int ret = 0;
+
+       mem = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+       if (mem == NULL)
+               return -ENOMEM;
+
+       va = ALIGN((unsigned long)mem, 128);
+       ccb = (struct ccb_extract *) va;
+       ca = (struct ccb_completion_area *)ALIGN(va + sizeof(*ccb),
+                                                sizeof(*ca));
+       dax_input = (char *)ca + sizeof(*ca);
+       dax_output = (char *)dax_input + (DAX_INPUT_ELEMS * DAX_INPUT_ELEM_SZ);
+
+       ccb->control.hdr.opcode  = CCB_QUERY_OPCODE_EXTRACT;
+
+       /* I/O formats and sizes */
+       ccb->control.src0_fmt = CCB_QUERY_IFMT_FIX_BYTE;
+       ccb->control.src0_sz = 0; /* 1 byte */
+       ccb->control.output_sz = DAX_OUTPUT_ELEM_SZ - 1;
+       ccb->control.output_fmt = CCB_QUERY_OFMT_BYTE_ALIGN;
+
+       /* addresses */
+       *(u64 *)&ccb->src0 = (u64) dax_input;
+       *(u64 *)&ccb->output = (u64) dax_output;
+       *(u64 *)&ccb->completion = (u64) ca;
+
+       /* address types */
+       ccb->control.hdr.at_src0 = CCB_AT_VA;
+       ccb->control.hdr.at_dst  = CCB_AT_VA;
+       ccb->control.hdr.at_cmpl = CCB_AT_VA;
+
+       /* input sizes and output flow control limit */
+       ccb->data_acc_ctl.input_len_fmt = CCB_QUERY_ILF_BYTE;
+       ccb->data_acc_ctl.input_cnt = (DAX_INPUT_ELEMS * DAX_INPUT_ELEM_SZ) - 1;
+       /* try to overflow; 0 means 64B output limit */
+       ccb->data_acc_ctl.output_buf_sz = DAX_FLOW_LIMIT / 64 - 1;
+       ccb->data_acc_ctl.flow_ctl = DAX_BUF_LIMIT_FLOW_CTL;
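+
+       /*
+        * If buffer-limit flow control works, the engine stops after
+        * producing exactly DAX_FLOW_LIMIT bytes of output (checked against
+        * ca->output_sz below); otherwise the output size differs and the
+        * node is treated as having no flow control.
+        */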
+
+       ra = virt_to_phys(ccb);
+
+       hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_QUERY_CMD, 0,
+                                    &submitted_ccb_buf_len, &nomap_va);
+       if (hv_rv != HV_EOK) {
+               dax_info("failed dax submit, ret=0x%lx", hv_rv);
+               if (dax_debug & DAX_DBG_FLG_BASIC)
+                       dax_prt_ccbs((union ccb *)ccb, 64);
+               goto done;
+       }
+
+       timeout = 10LL * 1000LL * 1000LL; /* 10ms in ns */
+       while (timeout > 0) {
+               unsigned long status;
+               unsigned long mwait_time = 8192;
+
+               /* monitored load */
+               __asm__ __volatile__("lduba [%1] 0x84, %0\n\t"
+                                    : "=r" (status) : "r" (&ca->cmd_status));
+               if (status == CCB_CMD_STAT_NOT_COMPLETED)
+                       __asm__ __volatile__("wr %0, %%asr28\n\t" /* mwait */
+                                            : : "r" (mwait_time));
+               else
+                       break;
+               timeout = timeout - mwait_time;
+       }
+       if (timeout <= 0) {
+               dax_alert("dax flow control test timed out");
+               ret = -EIO;
+               goto done;
+       }
+
+       if (ca->output_sz != DAX_FLOW_LIMIT) {
+               dax_dbg("0x%x bytes output, differs from flow limit 0x%lx",
+                       ca->output_sz, DAX_FLOW_LIMIT);
+               dax_dbg("mem=%p, va=0x%lx, ccb=%p, ca=%p, out=%p",
+                       mem, va, ccb, ca, dax_output);
+               goto done;
+       }
+
+       ret = 1;
+done:
+       kfree(mem);
+       return ret;
+}
+
+static void dax_has_flow_ctl_client(void *info)
+{
+       int cpu = smp_processor_id();
+       int node = cpu_to_node(cpu);
+       int ret = dax_has_flow_ctl_one_node();
+
+       if (ret > 0) {
+               dax_dbg("DAX on cpu %d node %d has flow control",
+                      cpu, node);
+               atomic_set(&has_flow_ctl, 1);
+       } else if (ret == 0) {
+               dax_dbg("DAX on cpu %d node %d has no flow control",
+                      cpu, node);
+       } else {
+               return;
+       }
+       atomic_inc(&response_count);
+}
+
+bool dax_has_flow_ctl_numa(void)
+{
+       unsigned int node;
+       int cnt = 10000;
+       int nr_nodes = 0;
+       cpumask_t numa_cpu_mask;
+
+       cpumask_clear(&numa_cpu_mask);
+       atomic_set(&has_flow_ctl, 0);
+       atomic_set(&response_count, 0);
+
+       /*
+        * On multi-socket M7 platforms, the processors on different sockets
+        * may be of different versions and thus carry different DAX
+        * versions, so flow control must be detected on every DAX in the
+        * platform.  Select the first cpu from each NUMA node and run the
+        * flow control detection code on those cpus; this ensures the
+        * detection runs on all the DAXs in the platform.
+        */
+       for_each_node_with_cpus(node) {
+               int dst_cpu = cpumask_first(&numa_cpumask_lookup_table[node]);
+
+               cpumask_set_cpu(dst_cpu, &numa_cpu_mask);
+               nr_nodes++;
+       }
+
+       smp_call_function_many(&numa_cpu_mask,
+                              dax_has_flow_ctl_client, NULL, 1);
+       while ((atomic_read(&response_count) != nr_nodes) && --cnt)
+               udelay(100);
+
+       if (cnt == 0) {
+               dax_err("Could not synchronize DAX flow control detector");
+               return false;
+       }
+
+       return !!atomic_read(&has_flow_ctl);
+}
+
+void dax_overflow_check(struct dax_ctx *ctx, int idx)
+{
+       unsigned long output_size, input_size, virtp;
+       unsigned long page_size = PAGE_SIZE;
+       struct ccb_hdr *hdr;
+       union ccb     *ccb;
+       struct ccb_data_acc_ctl *access;
+       struct vm_area_struct *vma;
+       struct ccb_completion_area *ca = &ctx->ca_buf[idx];
+
+       if (dax_debug == 0)
+               return;
+
+       if (ca->cmd_status != CCB_CMD_STAT_FAILED)
+               return;
+
+       if (ca->err_mask != CCB_CMD_ERR_POF)
+               return;
+
+       ccb = &ctx->ccb_buf[idx];
+       hdr = CCB_HDR(ccb);
+
+       access = (struct ccb_data_acc_ctl *) &ccb->dwords[QUERY_DWORD_DAC];
+       output_size = access->output_buf_sz * 64 + 64;
+       input_size  = access->input_cnt + 1;
+
+       dax_dbg("*************************");
+       dax_dbg("*DAX Page Overflow Report:");
+       dax_dbg("*  Output size requested = 0x%lx, output size produced = 0x%x",
+               output_size, ca->output_sz);
+       dax_dbg("*  Input size requested = 0x%lx, input size processed = 0x%x",
+               input_size, ca->n_processed);
+       dax_dbg("*  User virtual address analysis:");
+
+       virtp = ccb->dwords[QUERY_DWORD_OUTPUT];
+
+       if (hdr->at_dst == CCB_AT_RA) {
+               dax_dbg("*   Output address = 0x%lx physical, so no overflow possible",
+                       virtp);
+       } else {
+               /* output buffer was virtual, so page overflow is possible */
+               if (hdr->at_dst == CCB_AT_VA_ALT) {
+                       if (current->mm == NULL)
+                               return;
+
+                       vma = find_vma(current->mm, virtp);
+                       if (vma == NULL)
+                               dax_dbg("*   Output address = 0x%lx but is demapped, which precludes analysis",
+                                       virtp);
+                       else
+                               page_size = vma_kernel_pagesize(vma);
+               } else if (hdr->at_dst == CCB_AT_VA) {
+                       page_size = DAX_SYN_LARGE_PAGE_SIZE;
+               }
+
+               dax_dbg("*   Output address = 0x%lx, page size = 0x%lx; page overflow %s",
+                       virtp, page_size,
+                       (virtp + ca->output_sz >= ALIGN(virtp, page_size)) ?
+                                        "LIKELY" : "UNLIKELY");
+               dax_dbg("*   Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
+                       ca->output_sz,
+                       (virtp + ca->output_sz >= ALIGN(virtp, page_size)) ?
+                                        "OUTSIDE" : "WITHIN",
+                       virtp, ALIGN(virtp, page_size));
+       }
+
+       virtp = ccb->dwords[QUERY_DWORD_INPUT];
+       if (hdr->at_src0 == CCB_AT_RA) {
+               dax_dbg("*   Input address = 0x%lx physical, so no overflow possible",
+                       virtp);
+       } else {
+               if (hdr->at_src0 == CCB_AT_VA_ALT) {
+                       if (current->mm == NULL)
+                               return;
+
+                       vma = find_vma(current->mm, virtp);
+                       if (vma == NULL)
+                               dax_dbg("*   Input address = 0x%lx but is demapped, which precludes analysis",
+                                       virtp);
+                       else
+                               page_size = vma_kernel_pagesize(vma);
+               } else if (hdr->at_src0 == CCB_AT_VA) {
+                       page_size = DAX_SYN_LARGE_PAGE_SIZE;
+               }
+
+               dax_dbg("*   Input address = 0x%lx, page size = 0x%lx; page overflow %s",
+                       virtp, page_size,
+                       (virtp + input_size >=
+                        ALIGN(virtp, page_size)) ?
+                                       "LIKELY" : "UNLIKELY");
+               dax_dbg("*   Input size processed (0x%x) is %s the page bounds 0x%lx..0x%lx",
+                       ca->n_processed,
+                       (virtp + ca->n_processed >=
+                        ALIGN(virtp, page_size)) ?
+                                       "OUTSIDE" : "WITHIN",
+                       virtp, ALIGN(virtp, page_size));
+       }
+       dax_dbg("*************************");
+}
diff --git a/arch/sparc/dax/dax_mm.c b/arch/sparc/dax/dax_mm.c
new file mode 100644 (file)
index 0000000..32ccc14
--- /dev/null
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "dax_impl.h"
+
+const struct vm_operations_struct dax_vm_ops = {
+       .open  = dax_vm_open,
+       .close = dax_vm_close,
+};
+
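+/* indexed by enum dax_at: AT_DST, AT_SRC0, AT_SRC1, AT_TBL */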
+int dax_at_to_ccb_idx[AT_MAX] = {
+       QUERY_DWORD_OUTPUT,
+       QUERY_DWORD_INPUT,
+       QUERY_DWORD_SEC_INPUT,
+       QUERY_DWORD_TBL,
+};
+
+static void dax_vm_print(char *prefix, struct dax_vma *dv)
+{
+       dax_map_dbg("%s : vma %p, kva=%p, uva=0x%lx, pa=0x%lx",
+                 prefix, dv->vma, dv->kva,
+                 dv->vma ? dv->vma->vm_start : 0, dv->pa);
+       dax_map_dbg("%s: req length=0x%lx", prefix, dv->length);
+}
+
+static int dax_alloc_ram(struct file *filp, struct vm_area_struct *vma)
+{
+       unsigned long pa, pfn;
+       char *kva;
+       struct dax_vma *dv;
+       size_t len;
+       int ret = -ENOMEM;
+       struct dax_ctx *dax_ctx = (struct dax_ctx *) filp->private_data;
+
+       len = vma->vm_end - vma->vm_start;
+       if (len & (PAGE_SIZE - 1)) {
+               dax_err("request (0x%lx) not a multiple of page size", len);
+               goto done;
+       }
+
+       if (dax_no_flow_ctl && len != DAX_SYN_LARGE_PAGE_SIZE) {
+               dax_err("unsupported length 0x%lx != 0x%lx virtual page size",
+                       len, DAX_SYN_LARGE_PAGE_SIZE);
+               goto done;
+       }
+
+       dax_map_dbg("requested length=0x%lx", len);
+
+       if (dax_ctx->dax_mm == NULL) {
+               dax_err("no dax_mm for ctx %p!", dax_ctx);
+               goto done;
+       }
+
+       kva = kzalloc(len, GFP_KERNEL);
+       if (kva == NULL)
+               goto done;
+
+       if ((unsigned long)kva & (PAGE_SIZE - 1)) {
+               dax_err("kmalloc returned unaligned (%ld) addr %p",
+                       PAGE_SIZE, kva);
+               goto kva_error;
+       }
+
+       if (dax_no_flow_ctl && ((unsigned long)kva & (len - 1))) {
+               dax_err("kmalloc returned unaligned (%ldk) addr %p",
+                       len/1024, kva);
+               goto kva_error;
+       }
+
+       dv = kzalloc(sizeof(*dv), GFP_KERNEL);
+       if (dv == NULL)
+               goto kva_error;
+
+       pa = virt_to_phys((void *)kva);
+       pfn = pa >> PAGE_SHIFT;
+       ret = remap_pfn_range(vma, vma->vm_start, pfn, len,
+                             vma->vm_page_prot);
+       if (ret != 0) {
+               dax_err("remap failed with error %d for uva 0x%lx, len 0x%lx",
+                       ret, vma->vm_start, len);
+               goto dv_error;
+       }
+
+       dax_map_dbg("mapped kva 0x%lx = uva 0x%lx to pa 0x%lx",
+                   (unsigned long) kva, vma->vm_start, pa);
+
+       dv->vma = vma;
+       dv->kva = kva;
+       dv->pa = pa;
+       dv->length = len;
+       dv->dax_mm = dax_ctx->dax_mm;
+
+       spin_lock(&dax_ctx->dax_mm->lock);
+       dax_ctx->dax_mm->vma_count++;
+       spin_unlock(&dax_ctx->dax_mm->lock);
+       atomic_inc(&dax_alloc_counter);
+       atomic_add(dv->length / 1024, &dax_requested_mem);
+       vma->vm_ops = &dax_vm_ops;
+       vma->vm_private_data = dv;
+
+       dax_vm_print("mapped", dv);
+       ret = 0;
+
+       goto done;
+
+dv_error:
+       kfree(dv);
+kva_error:
+       kfree(kva);
+done:
+       return ret;
+}
+
+/*
+ * Maps one of two types of memory, based on the PROT_READ or PROT_WRITE
+ * flag set in the 'prot' argument of the user's mmap call:
+ *     1. When PROT_READ is set, this function maps the DAX completion
+ *        area.
+ *     2. When PROT_WRITE is set, this function allocates memory with
+ *        kmalloc and maps it at the userspace address.
+ */
+int dax_devmap(struct file *f, struct vm_area_struct *vma)
+{
+       unsigned long pfn;
+       struct dax_ctx *dax_ctx = (struct dax_ctx *) f->private_data;
+       size_t len = vma->vm_end - vma->vm_start;
+
+       dax_dbg("len=0x%lx, flags=0x%lx", len, vma->vm_flags);
+
+       if (dax_ctx == NULL) {
+               dax_err("CCB_INIT ioctl not previously called");
+               return -EINVAL;
+       }
+       if (dax_ctx->owner != current) {
+               dax_err("devmap called from wrong thread");
+               return -EINVAL;
+       }
+
+       if (vma->vm_flags & VM_WRITE)
+               return dax_alloc_ram(f, vma);
+
+       /* map completion area */
+
+       if (len != dax_ctx->ca_buflen) {
+               dax_err("len(%lu) != dax_ctx->ca_buflen(%u)",
+                       len, dax_ctx->ca_buflen);
+               return -EINVAL;
+       }
+
+       pfn = virt_to_phys(dax_ctx->ca_buf) >> PAGE_SHIFT;
+       if (remap_pfn_range(vma, vma->vm_start, pfn, len, vma->vm_page_prot))
+               return -EAGAIN;
+       dax_map_dbg("mmapped completion area at uva 0x%lx",  vma->vm_start);
+       return 0;
+}
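+
+/*
+ * Illustrative only: how user space is expected to drive dax_devmap(),
+ * assuming an fd already bound with DAXIOC_CCB_THR_INIT; buf_len is a
+ * hypothetical page-multiple chosen by the caller (see the length checks
+ * in dax_alloc_ram() above).
+ *
+ *     // completion area, read-only
+ *     void *ca = mmap(NULL, init.dcti_compl_maplen, PROT_READ,
+ *                     MAP_SHARED, fd, init.dcti_compl_mapoff);
+ *
+ *     // DAX-accessible buffer; PROT_WRITE selects dax_alloc_ram()
+ *     void *buf = mmap(NULL, buf_len, PROT_READ | PROT_WRITE,
+ *                      MAP_SHARED, fd, 0);
+ */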
+
+int dax_map_segment_common(unsigned long size,
+                          u32 *ccb_addr_type, char *name,
+                          u32 addr_sel, union ccb *ccbp,
+                          struct dax_ctx *dax_ctx)
+{
+       struct dax_vma *dv = NULL;
+       struct vm_area_struct *vma;
+       unsigned long virtp = ccbp->dwords[addr_sel];
+
+       dax_map_dbg("%s uva 0x%lx, size=0x%lx", name, virtp, size);
+       vma = find_vma(dax_ctx->dax_mm->this_mm, virtp);
+
+       if (vma == NULL)
+               return -1;
+
+       dv = vma->vm_private_data;
+
+       /* Only memory allocated by dax_alloc_ram has dax_vm_ops set */
+       if (dv == NULL || vma->vm_ops != &dax_vm_ops)
+               return -1;
+
+       /*
+        * Check that the user-provided size is within the vma bounds.
+        */
+       if ((virtp + size) > vma->vm_end) {
+               dax_err("%s buffer 0x%lx+0x%lx overflows page 0x%lx+0x%lx",
+                       name, virtp, size, dv->pa, dv->length);
+               return -1;
+       }
+
+       dax_vm_print("matched", dv);
+       if (dax_no_flow_ctl) {
+               *ccb_addr_type = CCB_AT_VA;
+               ccbp->dwords[addr_sel] = (unsigned long)dv->kva +
+                                       (virtp - vma->vm_start);
+               dax_map_dbg("changed %s to KVA 0x%llx", name,
+                           ccbp->dwords[addr_sel]);
+       } else {
+               *ccb_addr_type = CCB_AT_RA;
+               ccbp->dwords[addr_sel] = dv->pa +
+                       (virtp - vma->vm_start);
+               dax_map_dbg("changed %s to RA 0x%llx", name,
+                           ccbp->dwords[addr_sel]);
+       }
+
+       return 0;
+}
+
+/*
+ * Look for use of special dax contiguous segment and
+ * set it up for physical access
+ */
+void dax_map_segment(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
+{
+       int i;
+       int nelem = CCB_BYTE_TO_NCCB(ccb_len);
+       struct ccb_data_acc_ctl *access;
+       unsigned long size;
+       u32 ccb_addr_type;
+
+       for (i = 0; i < nelem; i++) {
+               union ccb *ccbp = &ccb[i];
+               struct ccb_hdr *hdr = CCB_HDR(ccbp);
+               u32 idx;
+
+               /* index into ccb_buf */
+               idx = &ccb[i] - dax_ctx->ccb_buf;
+
+               dax_dbg("ccb[%d]=0x%p, idx=%d, at_dst=%d",
+                       i, ccbp, idx, hdr->at_dst);
+               if (hdr->at_dst == CCB_AT_VA_ALT) {
+                       access = (struct ccb_data_acc_ctl *)
+                               &ccbp->dwords[QUERY_DWORD_DAC];
+                       /* size in bytes */
+                       size = DAX_OUT_SIZE_FROM_CCB(access->output_buf_sz);
+
+                       if (dax_map_segment_common(size, &ccb_addr_type, "dst",
+                                                  QUERY_DWORD_OUTPUT, ccbp,
+                                                  dax_ctx) == 0) {
+                               hdr->at_dst = ccb_addr_type;
+                               /* enforce flow limit */
+                               if (hdr->at_dst == CCB_AT_RA)
+                                       access->flow_ctl =
+                                               DAX_BUF_LIMIT_FLOW_CTL;
+                       }
+               }
+
+               if (hdr->at_src0 == CCB_AT_VA_ALT) {
+                       access = (struct ccb_data_acc_ctl *)
+                                 &ccbp->dwords[QUERY_DWORD_DAC];
+                       /* size in bytes */
+                       size = DAX_IN_SIZE_FROM_CCB(access->input_cnt);
+                       if (dax_map_segment_common(size, &ccb_addr_type, "src0",
+                                               QUERY_DWORD_INPUT, ccbp,
+                                               dax_ctx) == 0)
+                               hdr->at_src0 = ccb_addr_type;
+               }
+
+               if (hdr->at_src1 == CCB_AT_VA_ALT)
+                       if (dax_map_segment_common(0, &ccb_addr_type, "src1",
+                                                  QUERY_DWORD_SEC_INPUT, ccbp,
+                                                  dax_ctx) == 0)
+                               hdr->at_src1 = ccb_addr_type;
+
+               if (hdr->at_tbl == CCB_AT_VA_ALT)
+                       if (dax_map_segment_common(0, &ccb_addr_type, "tbl",
+                                                  QUERY_DWORD_TBL, ccbp,
+                                                  dax_ctx) == 0)
+                               hdr->at_tbl = ccb_addr_type;
+
+               /* skip over 2nd 64 bytes of long CCB */
+               if (IS_LONG_CCB(ccbp))
+                       i++;
+       }
+}
+
+int dax_alloc_page_arrays(struct dax_ctx *ctx)
+{
+       int i;
+
+       for (i = 0; i < AT_MAX; i++) {
+               ctx->pages[i] = vzalloc(DAX_CCB_BUF_NELEMS *
+                                       sizeof(struct page *));
+               if (ctx->pages[i] == NULL) {
+                       dax_dealloc_page_arrays(ctx);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+void dax_dealloc_page_arrays(struct dax_ctx *ctx)
+{
+       int i;
+
+       for (i = 0; i < AT_MAX; i++) {
+               vfree(ctx->pages[i]);   /* vfree(NULL) is a no-op */
+               ctx->pages[i] = NULL;
+       }
+}
+
+void dax_unlock_pages_ccb(struct dax_ctx *ctx, int ccb_num, union ccb *ccbp,
+                         bool warn)
+{
+       int i;
+
+       for (i = 0; i < AT_MAX; i++) {
+               if (ctx->pages[i][ccb_num]) {
+                       set_page_dirty(ctx->pages[i][ccb_num]);
+                       put_page(ctx->pages[i][ccb_num]);
+                       dax_dbg("freeing page %p", ctx->pages[i][ccb_num]);
+                       ctx->pages[i][ccb_num] = NULL;
+               } else if (warn) {
+                       struct ccb_hdr *hdr = CCB_HDR(ccbp);
+
+                       WARN((hdr->at_dst == CCB_AT_VA_ALT && i == AT_DST) ||
+                            (hdr->at_src0 == CCB_AT_VA_ALT && i == AT_SRC0) ||
+                            (hdr->at_src1 == CCB_AT_VA_ALT && i == AT_SRC1) ||
+                            (hdr->at_tbl == CCB_AT_VA_ALT && i == AT_TBL),
+                            "page[%d][%d] for 0x%llx not locked",
+                            i, ccb_num,
+                            ccbp->dwords[dax_at_to_ccb_idx[i]]);
+               }
+       }
+}
+
+static int dax_lock_pages_at(struct dax_ctx *ctx, int ccb_num,
+                            union ccb *ccbp, int addr_sel, enum dax_at at,
+                            int idx)
+{
+       int nr_pages = 1;
+       int res;
+       struct page *page;
+       unsigned long virtp = ccbp[ccb_num].dwords[addr_sel];
+
+       if (virtp == 0)
+               return 0;
+
+       /*
+        * get_user_pages_fast() must be called without mmap_sem held; it
+        * takes the semaphore itself if it needs to fall back to the slow
+        * path.
+        */
+       res = get_user_pages_fast(virtp, nr_pages, 1, &page);
+
+       if (res == nr_pages) {
+               ctx->pages[at][idx] = page;
+               dax_dbg("locked page %p, for VA 0x%lx",
+                       page, virtp);
+       } else {
+               dax_err("get_user_pages failed, virtp=0x%lx, nr_pages=%d, res=%d",
+                       virtp, nr_pages, res);
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+/*
+ * Lock user pages. They get released during the dequeue phase
+ * or upon device close.
+ */
+int dax_lock_pages(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
+{
+       int tmp, i;
+       int ret = 0;
+       int nelem = CCB_BYTE_TO_NCCB(ccb_len);
+
+       for (i = 0; i < nelem; i++) {
+               struct ccb_hdr *hdr = CCB_HDR(&ccb[i]);
+               u32 idx;
+
+               /* index into ccb_buf */
+               idx = &ccb[i] - dax_ctx->ccb_buf;
+
+               dax_dbg("ccb[%d]=0x%p, idx=%d, at_dst=%d, at_src0=%d, at_src1=%d, at_tbl=%d",
+                        i, &ccb[i], idx, hdr->at_dst, hdr->at_src0,
+                        hdr->at_src1, hdr->at_tbl);
+
+               /* look at all addresses in hdr */
+               if (hdr->at_dst == CCB_AT_VA_ALT) {
+                       ret = dax_lock_pages_at(dax_ctx, i, ccb,
+                                               dax_at_to_ccb_idx[AT_DST],
+                                               AT_DST,
+                                               idx);
+                       if (ret != 0)
+                               break;
+               }
+
+               if (hdr->at_src0 == CCB_AT_VA_ALT) {
+                       ret = dax_lock_pages_at(dax_ctx, i, ccb,
+                                               dax_at_to_ccb_idx[AT_SRC0],
+                                               AT_SRC0,
+                                               idx);
+                       if (ret != 0)
+                               break;
+               }
+
+               if (hdr->at_src1 == CCB_AT_VA_ALT) {
+                       ret = dax_lock_pages_at(dax_ctx, i, ccb,
+                                               dax_at_to_ccb_idx[AT_SRC1],
+                                               AT_SRC1,
+                                               idx);
+                       if (ret != 0)
+                               break;
+               }
+
+               if (hdr->at_tbl == CCB_AT_VA_ALT) {
+                       ret = dax_lock_pages_at(dax_ctx, i, ccb,
+                                               dax_at_to_ccb_idx[AT_TBL],
+                                               AT_TBL, idx);
+                       if (ret != 0)
+                               break;
+               }
+
+               /*
+                * The hypervisor does the TLB or TSB walk and expects
+                * the translation to be present in one of them, so the
+                * 1-byte copy_from_user() calls below touch each user
+                * buffer to fault the mapping in before submission.
+                */
+               if (hdr->at_dst == CCB_AT_VA_ALT &&
+                   copy_from_user(&tmp, (void __user *)
+                                  ccb[i].dwords[QUERY_DWORD_OUTPUT], 1)) {
+                       dax_dbg("ccb[%d]=0x%p, idx=%d", i, &ccb[i], idx);
+                       dax_dbg("bad OUTPUT address 0x%llx",
+                               ccb[i].dwords[QUERY_DWORD_OUTPUT]);
+               }
+
+               if (hdr->at_src0 == CCB_AT_VA_ALT &&
+                   copy_from_user(&tmp, (void __user *)
+                                  ccb[i].dwords[QUERY_DWORD_INPUT], 1)) {
+                       dax_dbg("ccb[%d]=0x%p, idx=%d", i, &ccb[i], idx);
+                       dax_dbg("bad INPUT address 0x%llx",
+                               ccb[i].dwords[QUERY_DWORD_INPUT]);
+               }
+
+               if (hdr->at_src1 == CCB_AT_VA_ALT &&
+                   copy_from_user(&tmp, (void __user *)
+                                  ccb[i].dwords[QUERY_DWORD_SEC_INPUT], 1)) {
+                       dax_dbg("ccb[%d]=0x%p, idx=%d", i, &ccb[i], idx);
+                       dax_dbg("bad SEC_INPUT address 0x%llx",
+                               ccb[i].dwords[QUERY_DWORD_SEC_INPUT]);
+               }
+
+               if (hdr->at_tbl == CCB_AT_VA_ALT &&
+                   copy_from_user(&tmp, (void __user *)
+                                  ccb[i].dwords[QUERY_DWORD_TBL], 1)) {
+                       dax_dbg("ccb[%d]=0x%p, idx=%d", i, &ccb[i], idx);
+                       dax_dbg("bad TBL address 0x%llx",
+                               ccb[i].dwords[QUERY_DWORD_TBL]);
+               }
+
+               /* skip over 2nd 64 bytes of long CCB */
+               if (IS_LONG_CCB(&ccb[i]))
+                       i++;
+       }
+       if (ret)
+               dax_unlock_pages(dax_ctx, ccb, ccb_len);
+
+       return ret;
+}
+
+/*
+ * Unlock user pages. Called during dequeue or device close.
+ */
+void dax_unlock_pages(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
+{
+       int i;
+       int nelem = CCB_BYTE_TO_NCCB(ccb_len);
+
+       for (i = 0; i < nelem; i++) {
+               u32 idx;
+
+               /* index into ccb_buf */
+               idx = &ccb[i] - dax_ctx->ccb_buf;
+               dax_unlock_pages_ccb(dax_ctx, idx, ccb, false);
+       }
+}
+
+int dax_address_in_use(struct dax_vma *dv, u32 addr_type,
+                             unsigned long addr)
+{
+       if (addr_type == CCB_AT_VA) {
+               unsigned long virtp = addr;
+
+               if (virtp >= (unsigned long)dv->kva &&
+                   virtp < (unsigned long)dv->kva + dv->length)
+                       return 1;
+       } else if (addr_type == CCB_AT_RA) {
+               unsigned long physp = addr;
+
+               if (physp >= dv->pa && physp < dv->pa + dv->length)
+                       return 1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * vm_ops->open() callback, invoked when the vma is split,
+ * usually in response to a partial munmap()
+ */
+void dax_vm_open(struct vm_area_struct *vma)
+{
+       dax_map_dbg("call with va=0x%lx, len=0x%lx",
+                   vma->vm_start, vma->vm_end - vma->vm_start);
+       dax_map_dbg("prot=0x%lx, flags=0x%lx",
+                   pgprot_val(vma->vm_page_prot), vma->vm_flags);
+}
+
+static void dax_vma_drain(struct dax_vma *dv)
+{
+       struct dax_mm *dax_mm;
+       struct dax_ctx *ctx;
+       struct list_head *p;
+
+       /* iterate over all threads in this process and drain all */
+       dax_mm = dv->dax_mm;
+       list_for_each(p, &dax_mm->ctx_list) {
+               ctx = list_entry(p, struct dax_ctx, ctx_list);
+               dax_ccbs_drain(ctx, dv);
+       }
+}
+
+void dax_vm_close(struct vm_area_struct *vma)
+{
+       struct dax_vma *dv;
+       struct dax_mm  *dm;
+
+       dv = vma->vm_private_data;
+       dax_map_dbg("vma=%p, dv=%p", vma, dv);
+       if (dv == NULL) {
+               dax_alert("dv NULL in dax_vm_close");
+               return;
+       }
+       if (dv->vma != vma) {
+               dax_map_dbg("munmap(0x%lx, 0x%lx) differs from mmap length 0x%lx",
+                            vma->vm_start, vma->vm_end - vma->vm_start,
+                            dv->length);
+               return;
+       }
+
+       dm = dv->dax_mm;
+       if (dm == NULL) {
+               dax_alert("dv->dax_mm NULL in dax_vm_close");
+               return;
+       }
+
+       dax_vm_print("freeing", dv);
+       spin_lock(&dm->lock);
+       vma->vm_private_data = NULL;
+
+       /* signifies no mapping exists and prevents new transactions */
+       dv->vma = NULL;
+       dax_vma_drain(dv);
+
+       kfree(dv->kva);
+       atomic_sub(dv->length / 1024, &dax_requested_mem);
+       kfree(dv);
+       dm->vma_count--;
+       atomic_dec(&dax_alloc_counter);
+
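+       /*
+        * dax_clean_dm() frees dm, and the lock embedded in it, when
+        * the ctx list is empty; only unlock if dm survived.
+        */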
+       if (dax_clean_dm(dm))
+               spin_unlock(&dm->lock);
+}
+
+int dax_clean_dm(struct dax_mm *dm)
+{
+       /* if ctx list is empty, clean up this struct dax_mm */
+       if (list_empty(&dm->ctx_list)) {
+               spin_lock(&dm_list_lock);
+               list_del(&dm->mm_list);
+               dax_list_dbg("freeing dm with vma_count=%d, ctx_count=%d",
+                             dm->vma_count, dm->ctx_count);
+               kfree(dm);
+               spin_unlock(&dm_list_lock);
+               return 0;
+       }
+
+       return -1;
+}
+
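The helpers above pin exactly one page per CCB virtual address with
get_user_pages_fast() and release it with set_page_dirty()/put_page() at
dequeue time or device close. A minimal sketch of that pin/use/unpin
pattern, with virtp standing in for one CCB address (illustrative only,
not the driver code itself):

    struct page *page;

    /* pin the single user page backing virtp, writable */
    if (get_user_pages_fast(virtp, 1, 1, &page) != 1)
            return -EFAULT;

    /* ... coprocessor reads and writes the page ... */

    set_page_dirty(page);   /* contents may have changed */
    put_page(page);         /* drop the reference taken above */

The pages stay pinned across the asynchronous coprocessor operation,
which is why dax_unlock_pages() must run on every dequeue path as well
as on close.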
diff --git a/arch/sparc/dax/dax_perf.c b/arch/sparc/dax/dax_perf.c
new file mode 100644 (file)
index 0000000..0b879c7
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "dax_impl.h"
+#include <asm/pcr.h>
+
+/*
+ * Performance Counter Code
+ *
+ * Author: Dave Aldridge (david.j.aldridge@oracle.com)
+ *
+ */
+
+/**
+ * write_pcr_reg() - Write to a performance counter register
+ * @reg:       The register to write to
+ * @value:     The value to write
+ */
+static void write_pcr_reg(unsigned long reg, u64 value)
+{
+       dax_perf_dbg("initial pcr%lu[%016llx]", reg, pcr_ops->read_pcr(reg));
+
+       pcr_ops->write_pcr(reg, value);
+       dax_perf_dbg("updated pcr%lu[%016llx]", reg, pcr_ops->read_pcr(reg));
+}
+
+
+/**
+ * dax_setup_counters() - Setup the DAX performance counters
+ * @node:      The node
+ * @dax:       The dax instance
+ * @setup:     The config value to write
+ */
+static void dax_setup_counters(unsigned int node, unsigned int dax, u64 setup)
+{
+       write_pcr_reg(DAX_PERF_CTR_CTL_OFFSET(node, dax), setup);
+}
+
+/**
+ * dax_get_counters() - Read the DAX performance counters
+ * @node:      The node
+ * @dax:       The dax instance
+ * @counts:    Somewhere to write the count values
+ */
+static void dax_get_counters(unsigned int node, unsigned int dax,
+               unsigned long (*counts)[DAX_PER_NODE][COUNTERS_PER_DAX])
+{
+       int i;
+       u64 pcr;
+       unsigned long reg;
+
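+       /*
+        * counts points at the whole [node][dax][counter] table, so
+        * counts[node][dax][i] selects node, DAX engine, then counter.
+        */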
+       for (i = 0; i < COUNTERS_PER_DAX; i++) {
+               reg = DAX_PERF_CTR_OFFSET(i, node, dax);
+               pcr = pcr_ops->read_pcr(reg);
+               dax_perf_dbg("pcr%lu[%016llx]", reg, pcr);
+               counts[node][dax][i] = pcr;
+       }
+}
+
+/**
+ * dax_clear_counters() - Clear the DAX performance counters
+ * @node:      The node
+ * @dax:       The dax instance
+ */
+static void dax_clear_counters(unsigned int node, unsigned int dax)
+{
+       int i;
+
+       for (i = 0; i < COUNTERS_PER_DAX; i++)
+               write_pcr_reg(DAX_PERF_CTR_OFFSET(i, node, dax), 0);
+}
+
+
+long dax_perfcount_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+       int ret = 0;
+       unsigned int node, dax;
+       unsigned int max_nodes = num_online_nodes();
+       unsigned long dax_config;
+       /* DAX performance counters are 48 bits wide */
+       unsigned long dax_count_bytes =
+               max_nodes * DAX_PER_NODE * COUNTERS_PER_DAX * sizeof(u64);
+
+       /* Somewhere to store away the dax performance counter 48 bit values */
+       unsigned long (*dax_counts)[DAX_PER_NODE][COUNTERS_PER_DAX];
+
+       switch (cmd) {
+       case DAXIOC_PERF_GET_NODE_COUNT:
+
+               dax_perf_dbg("DAXIOC_PERF_GET_NODE_COUNT: nodes = %u",
+                            max_nodes);
+
+               if (copy_to_user((void __user *)arg, &max_nodes,
+                                sizeof(max_nodes)))
+                       return -EFAULT;
+
+               return 0;
+
+       case DAXIOC_PERF_SET_COUNTERS:
+
+               dax_perf_dbg("DAXIOC_PERF_SET_COUNTERS");
+
+               /* Get the performance counter setup from user land */
+               if (copy_from_user(&dax_config, (void __user *)arg,
+                                  sizeof(unsigned long)))
+                       return -EFAULT;
+
+               /* Setup the dax performance counter configuration registers */
+               dax_perf_dbg("DAXIOC_PERF_SET_COUNTERS: dax_config = 0x%lx",
+                       dax_config);
+
+               for (node = 0; node < max_nodes; node++)
+                       for (dax = 0; dax < DAX_PER_NODE; dax++)
+                               dax_setup_counters(node, dax, dax_config);
+
+               return 0;
+
+       case DAXIOC_PERF_GET_COUNTERS:
+
+               dax_perf_dbg("DAXIOC_PERF_GET_COUNTERS");
+
+               /* Somewhere to store the count data */
+               dax_counts = kmalloc(dax_count_bytes, GFP_KERNEL);
+               if (!dax_counts)
+                       return -ENOMEM;
+
+               /* Read the counters */
+               for (node = 0; node < max_nodes; node++)
+                       for (dax = 0; dax < DAX_PER_NODE; dax++)
+                               dax_get_counters(node, dax, dax_counts);
+
+               dax_perf_dbg("DAXIOC_PERF_GET_COUNTERS: copying %lu bytes of perf counter data",
+                       dax_count_bytes);
+
+               if (copy_to_user((void __user *)arg, dax_counts,
+                                dax_count_bytes))
+                       ret = -EFAULT;
+
+               kfree(dax_counts);
+               return ret;
+
+       case DAXIOC_PERF_CLEAR_COUNTERS:
+
+               dax_perf_dbg("DAXIOC_PERF_CLEAR_COUNTERS");
+
+               /* Clear the counters */
+               for (node = 0; node < max_nodes; node++)
+                       for (dax = 0; dax < DAX_PER_NODE; dax++)
+                               dax_clear_counters(node, dax);
+
+               return 0;
+
+       default:
+               dax_dbg("Invalid command: 0x%x", cmd);
+               return -ENOTTY;
+       }
+}
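For reference, a user-space consumer of the performance counter ioctls
above might look roughly like this. This is a hedged sketch: it assumes
sys_dax.h compiles in user space (i.e. the u32/u64 types are available),
and the config value 0x1 is an illustrative event select, not a
documented encoding:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "sys_dax.h"

    int main(void)
    {
            unsigned int nodes;
            unsigned long config = 0x1;     /* illustrative event select */
            unsigned long *counts;
            int fd = open(DAX_DEV, O_RDWR);

            if (fd < 0 || ioctl(fd, DAXIOC_PERF_GET_NODE_COUNT, &nodes))
                    return 1;

            /* one 48-bit count per counter, per DAX engine, per node */
            counts = malloc(nodes * DAX_PER_NODE * COUNTERS_PER_DAX *
                            sizeof(*counts));
            if (!counts)
                    return 1;

            ioctl(fd, DAXIOC_PERF_SET_COUNTERS, &config);
            /* ... run the DAX work to be measured ... */
            ioctl(fd, DAXIOC_PERF_GET_COUNTERS, counts);
            printf("node0 dax0 counter0 = %lu\n", counts[0]);
            ioctl(fd, DAXIOC_PERF_CLEAR_COUNTERS, 0);

            free(counts);
            close(fd);
            return 0;
    }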
diff --git a/arch/sparc/dax/sys_dax.h b/arch/sparc/dax/sys_dax.h
new file mode 100644 (file)
index 0000000..8f18099
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#ifndef _SYS_DAX_H
+#define _SYS_DAX_H
+
+#ifdef __KERNEL__
+#include "ccb.h"
+#else
+#include <ccb.h>
+#endif
+#include <linux/types.h>
+
+/* DAXIOC_CCB_EXEC dce_ccb_status */
+#define        DAX_SUBMIT_OK                           0
+#define        DAX_SUBMIT_ERR_RETRY                    1
+#define        DAX_SUBMIT_ERR_WOULDBLOCK               2
+#define        DAX_SUBMIT_ERR_BUSY                     3
+#define        DAX_SUBMIT_ERR_THR_INIT                 4
+#define        DAX_SUBMIT_ERR_ARG_INVAL                5
+#define        DAX_SUBMIT_ERR_CCB_INVAL                6
+#define        DAX_SUBMIT_ERR_NO_CA_AVAIL              7
+#define        DAX_SUBMIT_ERR_CCB_ARR_MMU_MISS         8
+#define        DAX_SUBMIT_ERR_NOMAP                    9
+#define        DAX_SUBMIT_ERR_NOACCESS                 10
+#define        DAX_SUBMIT_ERR_TOOMANY                  11
+#define        DAX_SUBMIT_ERR_UNAVAIL                  12
+#define        DAX_SUBMIT_ERR_INTERNAL                 13
+
+
+#define        DAX_DEV "/dev/dax"
+#define DAX_DRIVER_VERSION 3
+
+/*
+ * dax device ioctl commands
+ */
+#define        DAXIOC  'D'
+
+/* Deprecated IOCTL numbers */
+#define        DAXIOC_DEP_1    _IOWR(DAXIOC, 1, struct dax_ccb_thr_init_arg)
+#define        DAXIOC_DEP_3    _IOWR(DAXIOC, 3, struct dax_ca_dequeue_arg)
+#define        DAXIOC_DEP_4    _IOWR(DAXIOC, 4, struct dax_ccb_exec_arg)
+
+/* CCB thread initialization */
+#define        DAXIOC_CCB_THR_INIT     _IOWR(DAXIOC, 6, struct dax_ccb_thr_init_arg)
+/* free CCB thread resources */
+#define        DAXIOC_CCB_THR_FINI     _IO(DAXIOC,   2)
+/* CCB CA dequeue */
+#define        DAXIOC_CA_DEQUEUE       _IOWR(DAXIOC, 7, struct dax_ca_dequeue_arg)
+/* CCB execution */
+#define        DAXIOC_CCB_EXEC         _IOWR(DAXIOC, 8, struct dax_ccb_exec_arg)
+/* get driver version */
+#define DAXIOC_VERSION          _IOWR(DAXIOC, 5, long)
+
+/*
+ * Perf Counter defines
+ */
+#define DAXIOC_PERF_GET_NODE_COUNT     _IOR(DAXIOC, 0xB0, void *)
+#define DAXIOC_PERF_SET_COUNTERS       _IOW(DAXIOC, 0xBA, void *)
+#define DAXIOC_PERF_GET_COUNTERS       _IOR(DAXIOC, 0xBB, void *)
+#define DAXIOC_PERF_CLEAR_COUNTERS     _IOW(DAXIOC, 0xBC, void *)
+
+/*
+ * DAXIOC_CCB_THR_INIT
+ * dcti_ccb_buf_maxlen - return u32 length
+ * dcti_compl_maplen - return u64 mmap length
+ * dcti_compl_mapoff - return u64 mmap offset
+ */
+struct dax_ccb_thr_init_arg {
+       u32 dcti_ccb_buf_maxlen;
+       u64 dcti_compl_maplen;
+       u64 dcti_compl_mapoff;
+};
+
+/*
+ * DAXIOC_CCB_EXEC
+ * dce_ccb_buf_len : user buffer length in bytes
+ * *dce_ccb_buf_addr : user buffer address
+ * dce_submitted_ccb_buf_len : CCBs in bytes submitted to the DAX HW
+ * dce_ca_region_off : return offset to the completion area of the first
+ *                     ccb submitted in DAXIOC_CCB_EXEC ioctl
+ * dce_ccb_status : return u32 CCB status defined above (see DAX_SUBMIT_*)
+ * dce_nomap_va : bad virtual address when ret is NOMAP or NOACCESS
+ */
+struct dax_ccb_exec_arg {
+       u32     dce_ccb_buf_len;
+       void    *dce_ccb_buf_addr;
+       u32     dce_submitted_ccb_buf_len;
+       u64     dce_ca_region_off;
+       u32     dce_ccb_status;
+       u64     dce_nomap_va;
+};
+
+/*
+ * DAXIOC_CA_DEQUEUE
+ * dcd_len_requested : byte len of CA to dequeue
+ * dcd_len_dequeued : byte len of CAs dequeued by the driver
+ */
+struct dax_ca_dequeue_arg {
+       u32 dcd_len_requested;
+       u32 dcd_len_dequeued;
+};
+
+
+/* The number of DAX engines per node */
+#define DAX_PER_NODE           (8)
+
+/* The number of performance counters
+ * per DAX engine
+ */
+#define COUNTERS_PER_DAX       (3)
+
+#endif /* _SYS_DAX_H */
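Putting the DAXIOC_CCB_* pieces together, the submission flow implied by
the struct comments above is: initialize the thread, mmap the completion
area, submit CCBs, poll, then dequeue. A hedged user-space sketch;
ccb_buf and ccb_bytes stand in for CCBs encoded by the user library, and
the completion-area polling protocol is library territory, so it is
elided:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include "sys_dax.h"

    static int dax_submit(void *ccb_buf, unsigned int ccb_bytes)
    {
            struct dax_ccb_thr_init_arg init = { 0 };
            struct dax_ccb_exec_arg exec = { 0 };
            struct dax_ca_dequeue_arg deq = { 0 };
            void *ca;
            int fd = open(DAX_DEV, O_RDWR);

            if (fd < 0 || ioctl(fd, DAXIOC_CCB_THR_INIT, &init))
                    return -1;

            /* map this thread's completion area where the driver says */
            ca = mmap(NULL, init.dcti_compl_maplen, PROT_READ, MAP_SHARED,
                      fd, init.dcti_compl_mapoff);
            if (ca == MAP_FAILED)
                    return -1;

            exec.dce_ccb_buf_len = ccb_bytes;
            exec.dce_ccb_buf_addr = ccb_buf;
            if (ioctl(fd, DAXIOC_CCB_EXEC, &exec) ||
                exec.dce_ccb_status != DAX_SUBMIT_OK)
                    return -1;

            /* ... poll the CA at ca + exec.dce_ca_region_off ... */

            deq.dcd_len_requested = exec.dce_submitted_ccb_buf_len;
            ioctl(fd, DAXIOC_CA_DEQUEUE, &deq);
            ioctl(fd, DAXIOC_CCB_THR_FINI);
            return 0;
    }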
diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h
index ed6d2f537036ea6943760db5986a8cfda5574a2c..b77770d1f2928c170ef5f43b988f9ea33f7b8c76 100644 (file)
 
 #define HV_EUNBOUND                    19 /* Resource is unbound          */
 
+#define HV_EUNAVAILABLE                        23 /* Resource or operation not
+                                           * currently available, but may
+                                           * become available in the future
+                                           */
+
+
 /* mach_exit()
  * TRAP:       HV_FAST_TRAP
  * FUNCTION:   HV_FAST_MACH_EXIT