www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
sparc64: DAX memory will use RA+PGSZ feature in HV
authorRob Gardner <rob.gardner@oracle.com>
Fri, 21 Apr 2017 23:10:34 +0000 (17:10 -0600)
committerChuck Anderson <chuck.anderson@oracle.com>
Fri, 5 May 2017 16:53:04 +0000 (09:53 -0700)
Orabug: 25911008
Orabug: 25931417

The reported kernel panics and "other oddities" are caused by
corruption of kernel memory by DAX output. This is happening due to an
apparent change between UEK2 and UEK4, whereby the underlying h/w page
size for memory acquired via kmalloc has changed. UEK2 used 4mb pages,
which the dax driver used to limit the output from the coprocessor,
which refuses to cross "page boundaries". But in UEK4 it appears that a
more intelligent approach to memory is used, and kernel memory may be
backed by a variety of huge h/w page sizes, ie, 256mb and 2gb. This
now allows DAX to produce output up to this much larger page size,
thus going beyond the actual allocation.  We do not have any way to
kmalloc memory with a certain backing page size, and we cannot feed
DAX a virtual address if we are not certain of its page size.

Recent hypervisor f/w has provided a powerful new feature: the ability
to convey page size bits along with a real address (RA). This gives us
the opportunity to avoid using the TLB/TSB as a parameter passing
mechanism and we can use this to avoid using virtual addresses at all
in a DAX ccb. We now use this mechanism to set the page size for
dax_alloc memory to 4mb even if the underlying h/w page size for the
memory is much larger. Memory allocated by the application via
conventional methods is not affected.

This HV feature is available on M7 f/w with minor number 1, so this is
used to determine if the driver can provide the memory allocation
service. If the feature is not available, DAX will still work, but all
the responsibility for memory allocation falls back to the
application.

The sudden ENOACCESS errors are a result of another hypervisor change.
Newest HV firmware has begun to enforce the privileged flag (bit 14)
in arg2 of the ccb_submit hypercall. This flag is described in the API
wiki as "CCB virtual addresses are treated as privileged" and in the
VM spec as "Virtual addresses within CCBs are translated in privileged
context".  The explanation given later in the VM spec is that if a CCB
contains any virtual address whose TTE has the priv bit set
(_PAGE_P_4V), then the priv flag in the ccb_submit api must also be
set, or else the hypervisor will refuse to perform the translation,
and an ENOACCESS error will be thrown. Since privileged virtual
addresses are no longer used as a result of this very commit, this
problem simply disappears.

Signed-off-by: Rob Gardner <rob.gardner@oracle.com>
Reviewed-by: Jonathan Helman <jonathan.helman@oracle.com>
Signed-off-by: Allen Pais <allen.pais@oracle.com>
arch/sparc/dax/dax_impl.h
arch/sparc/dax/dax_main.c
arch/sparc/dax/dax_misc.c
arch/sparc/dax/dax_mm.c

index 045cc92b30ff9ff7f9500abfc85a1e6fe184b848..fb310fd6d1c4ba3aa869b1f8d8d9fb25b40369fd 100644 (file)
@@ -34,7 +34,7 @@
 #include "ccb.h"
 #include "sys_dax.h"
 
-extern bool dax_no_flow_ctl;
+extern bool dax_no_flow_ctl, dax_no_ra_pgsz;
 extern int dax_debug;
 extern atomic_t dax_alloc_counter;
 extern atomic_t dax_actual_mem;
@@ -132,7 +132,10 @@ extern const struct vm_operations_struct dax_vm_ops;
 #define        CCB_HDR(ccb)            ((struct ccb_hdr *)(ccb))
 #define        IS_LONG_CCB(ccb)        ((CCB_HDR(ccb))->sync_flags & CCB_SYNC_LONGCCB)
 /* VM spec 36.2.1.1.8 & 36.2.1.2 / PRM 23.7.1 */
-#define NO_PAGE_RANGE_CHECK (0xfLL << 56)
+#define PAGE_CHECK_SHIFT  56
+#define NO_PAGE_RANGE 0xfLL
+#define NO_PAGE_RANGE_CHECK  (NO_PAGE_RANGE << PAGE_CHECK_SHIFT)
+#define CHECK_4MB_PAGE_RANGE (_PAGE_SZ4MB_4V << PAGE_CHECK_SHIFT)
 
 #define        DAX_CCB_WAIT_USEC               100
 #define        DAX_CCB_WAIT_RETRIES_MAX        10000
index 078ea1ce92653f24580b6635a187d8e9c5795184..3fe8ca88190d30dea2056c2099ad327dfc42e1bd 100644 (file)
@@ -15,7 +15,7 @@ atomic_t dax_alloc_counter = ATOMIC_INIT(0);
 atomic_t dax_requested_mem = ATOMIC_INIT(0);
 
 int dax_debug;
-bool dax_no_flow_ctl;
+bool dax_no_flow_ctl, dax_no_ra_pgsz;
 
 /* driver public entry points */
 static long dax_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
@@ -141,6 +141,9 @@ static int __init dax_attach(void)
                                 DAX_MAJOR, minor);
        }
 
+       dax_no_ra_pgsz = (DAX_MAJOR == 1) && (minor == 0);
+       dax_dbg("RA pagesize feature %spresent", dax_no_ra_pgsz ? "not " : "");
+
        ret = hv_get_hwqueue_size(&max_ccbs);
        if (ret != 0) {
                dax_err("get_hwqueue_size failed with status=%d and max_ccbs=%ld",
@@ -170,7 +173,7 @@ static int __init dax_attach(void)
                dax_dbg("Flow control disabled by software, dax_alloc restricted to 4M");
                dax_no_flow_ctl = true;
        } else if ((dax_type == DAX1) && !dax_has_flow_ctl_numa()) {
-               dax_dbg("Flow control disabled by hardware, dax_alloc restricted to 4M");
+               dax_dbg("Flow control disabled by hardware, dax_alloc (if available) restricted to 4M");
                dax_no_flow_ctl = true;
        } else {
                dax_dbg("Flow control enabled");
index 89b38988f092c5c1a432bda3aebae431510c7360..443b3018397620c9718df404eda6994e774b635b 100644 (file)
@@ -57,7 +57,7 @@ static int dax_has_flow_ctl_one_node(void)
 
        ra = virt_to_phys(ccb);
 
-       hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_QUERY_CMD, 0,
+       hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_CCB_VA_PRIVILEGED | HV_DAX_QUERY_CMD, 0,
                                     &submitted_ccb_buf_len, &nomap_va);
        if (hv_rv != HV_EOK) {
                dax_info("failed dax submit, ret=0x%lx", hv_rv);
@@ -161,11 +161,9 @@ bool dax_has_flow_ctl_numa(void)
 
 void dax_overflow_check(struct dax_ctx *ctx, int idx)
 {
-       unsigned long output_size, input_size, virtp;
-       unsigned long page_size = PAGE_SIZE;
+       unsigned long virtp, page_size = PAGE_SIZE;
        struct ccb_hdr *hdr;
        union ccb     *ccb;
-       struct ccb_data_acc_ctl *access;
        struct vm_area_struct *vma;
        struct ccb_completion_area *ca = &ctx->ca_buf[idx];
 
@@ -181,80 +179,36 @@ void dax_overflow_check(struct dax_ctx *ctx, int idx)
        ccb = &ctx->ccb_buf[idx];
        hdr = CCB_HDR(ccb);
 
-       access = (struct ccb_data_acc_ctl *) &ccb->dwords[QUERY_DWORD_DAC];
-       output_size = access->output_buf_sz * 64 + 64;
-       input_size  = access->input_cnt + 1;
-
        dax_dbg("*************************");
        dax_dbg("*DAX Page Overflow Report:");
-       dax_dbg("*  Output size requested = 0x%lx, output size produced = 0x%x",
-               output_size, ca->output_sz);
-       dax_dbg("*  Input size requested = 0x%lx, input size processed = 0x%x",
-               input_size, ca->n_processed);
-       dax_dbg("*  User virtual address analysis:");
+       dax_dbg("*  Output size produced = 0x%x", ca->output_sz);
+       dax_dbg("*  Input size processed = 0x%x", ca->n_processed);
+       dax_dbg("*  Address analysis:");
 
        virtp = ccb->dwords[QUERY_DWORD_OUTPUT];
 
        if (hdr->at_dst == CCB_AT_RA) {
-               dax_dbg("*   Output address = 0x%lx physical, so no overflow possible",
-                       virtp);
-       } else {
-               /* output buffer was virtual, so page overflow is possible */
-               if (hdr->at_dst == CCB_AT_VA_ALT) {
-                       if (current->mm == NULL)
-                               return;
-
-                       vma = find_vma(current->mm, virtp);
-                       if (vma == NULL)
-                               dax_dbg("*   Output address = 0x%lx but is demapped, which precludes analysis",
-                                       virtp);
-                       else
-                               page_size = vma_kernel_pagesize(vma);
-               } else if (hdr->at_dst == CCB_AT_VA) {
-                       page_size = DAX_SYN_LARGE_PAGE_SIZE;
+               page_size = DAX_SYN_LARGE_PAGE_SIZE;
+       } else if (hdr->at_dst == CCB_AT_VA_ALT) {
+               if (current->mm == NULL)
+                       return;
+
+               vma = find_vma(current->mm, virtp);
+               if (vma == NULL) {
+                       dax_dbg("*   Output address = 0x%lx but is demapped, which precludes analysis",
+                               virtp);
+                       goto done;
+               } else {
+                       page_size = vma_kernel_pagesize(vma);
                }
+       } 
 
-               dax_dbg("*   Output address = 0x%lx, page size = 0x%lx; page overflow %s",
-                       virtp, page_size,
-                       (virtp + ca->output_sz >= ALIGN(virtp + 1, page_size)) ?
-                                        "LIKELY" : "UNLIKELY");
-               dax_dbg("*   Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
-                       ca->output_sz,
-                       (virtp + ca->output_sz >= ALIGN(virtp + 1, page_size)) ?
+       dax_dbg("*   Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
+               ca->output_sz,
+               (virtp + ca->output_sz > ALIGN(virtp + 1, page_size)) ?
                                         "OUTSIDE" : "WITHIN",
                        virtp, ALIGN(virtp + 1, page_size));
-       }
 
-       virtp = ccb->dwords[QUERY_DWORD_INPUT];
-       if (hdr->at_src0 == CCB_AT_RA) {
-               dax_dbg("*   Input address = 0x%lx physical, so no overflow possible",
-                       virtp);
-       } else {
-               if (hdr->at_src0 == CCB_AT_VA_ALT) {
-                       if (current->mm == NULL)
-                               return;
-
-                       vma = find_vma(current->mm, virtp);
-                       if (vma == NULL)
-                               dax_dbg("*   Input address = 0x%lx but is demapped, which precludes analysis",
-                                       virtp);
-                       else
-                               page_size = vma_kernel_pagesize(vma);
-               } else if (hdr->at_src0 == CCB_AT_VA) {
-                       page_size = DAX_SYN_LARGE_PAGE_SIZE;
-               }
-
-               dax_dbg("*   Input address = 0x%lx, page size = 0x%lx; page overflow %s",
-                       virtp, page_size,
-                       (virtp + input_size >=
-                        ALIGN(virtp + 1, page_size)) ?
-                                       "LIKELY" : "UNLIKELY");
-               dax_dbg("*   Input size processed (0x%x) is %s the page bounds 0x%lx..0x%lx",
-                       ca->n_processed,
-                       (virtp + ca->n_processed >=
-                        ALIGN(virtp + 1, page_size)) ?
-                                       "OUTSIDE" : "WITHIN",
-                       virtp, ALIGN(virtp + 1, page_size));
-       }
+done:
        dax_dbg("*************************");
 }
index 890e2c4d8e52352d8574fd62d181c2f646a75afc..1badae6d36e49f39dc945ae3584c9a66acab1f50 100644 (file)
@@ -35,6 +35,11 @@ static int dax_alloc_ram(struct file *filp, struct vm_area_struct *vma)
        int ret = -ENOMEM;
        struct dax_ctx *dax_ctx = (struct dax_ctx *) filp->private_data;
 
+       if (dax_no_ra_pgsz) {
+               ret = -ENODEV;
+               goto done;
+       }
+
        len = vma->vm_end - vma->vm_start;
        if (len & (PAGE_SIZE - 1)) {
                dax_err("request (0x%lx) not a multiple of page size", len);
@@ -158,8 +163,7 @@ int dax_devmap(struct file *f, struct vm_area_struct *vma)
        return 0;
 }
 
-int dax_map_segment_common(unsigned long size,
-                          u32 *ccb_addr_type, char *name,
+int dax_map_segment_common(u32 *ccb_addr_type, char *name,
                           u32 addr_sel, union ccb *ccbp,
                           struct dax_ctx *dax_ctx)
 {
@@ -167,7 +171,7 @@ int dax_map_segment_common(unsigned long size,
        struct vm_area_struct *vma;
        unsigned long virtp = ccbp->dwords[addr_sel];
 
-       dax_map_dbg("%s uva 0x%lx, size=0x%lx", name, virtp, size);
+       dax_map_dbg("%s uva 0x%lx", name, virtp);
        vma = find_vma(dax_ctx->dax_mm->this_mm, virtp);
 
        if (vma == NULL)
@@ -179,24 +183,12 @@ int dax_map_segment_common(unsigned long size,
        if (dv == NULL || vma->vm_ops != &dax_vm_ops)
                return -1;
 
-       /*
-        * check if user provided size is within the vma bounds.
-        */
-       if ((virtp + size) > vma->vm_end) {
-               dax_err("%s buffer 0x%lx+0x%lx overflows page 0x%lx+0x%lx",
-                       name, virtp, size, dv->pa, dv->length);
-               return -1;
-       }
-
        dax_vm_print("matched", dv);
        if (dax_no_flow_ctl) {
-               *ccb_addr_type = CCB_AT_VA;
-               ccbp->dwords[addr_sel] = (unsigned long)dv->kva +
-                                       (virtp - vma->vm_start);
-               /* touch va to fault translation into tlb/tsb */
-               READ_ONCE(*(u8 *)ccbp->dwords[addr_sel]);
-
-               dax_map_dbg("changed %s to KVA 0x%llx", name,
+               *ccb_addr_type = CCB_AT_RA;
+               ccbp->dwords[addr_sel] = CHECK_4MB_PAGE_RANGE |
+                       (dv->pa + (virtp - vma->vm_start));
+               dax_map_dbg("changed %s to RA 0x%llx", name,
                            ccbp->dwords[addr_sel]);
        } else {
                *ccb_addr_type = CCB_AT_RA;
@@ -217,8 +209,6 @@ void dax_map_segment(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
 {
        int i;
        int nelem = CCB_BYTE_TO_NCCB(ccb_len);
-       struct ccb_data_acc_ctl *access;
-       unsigned long size;
        u32 ccb_addr_type;
 
        for (i = 0; i < nelem; i++) {
@@ -232,37 +222,27 @@ void dax_map_segment(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
                dax_dbg("ccb[%d]=0x%p, idx=%d, at_dst=%d",
                        i, ccbp, idx, hdr->at_dst);
                if (hdr->at_dst == CCB_AT_VA_ALT) {
-                       access = (struct ccb_data_acc_ctl *)
-                               &ccbp->dwords[QUERY_DWORD_DAC];
-                       /* size in bytes */
-                       size = DAX_OUT_SIZE_FROM_CCB(access->output_buf_sz);
-
-                       if (dax_map_segment_common(size, &ccb_addr_type, "dst",
+                       if (dax_map_segment_common(&ccb_addr_type, "dst",
                                                   QUERY_DWORD_OUTPUT, ccbp,
-                                                  dax_ctx) == 0) {
+                                                  dax_ctx) == 0)
                                hdr->at_dst = ccb_addr_type;
-                               /* enforce flow limit */
-                               if (hdr->at_dst == CCB_AT_RA)
-                                       access->flow_ctl =
-                                               DAX_BUF_LIMIT_FLOW_CTL;
-                       }
                }
 
                if (hdr->at_src0 == CCB_AT_VA_ALT) {
-                       if (dax_map_segment_common(0, &ccb_addr_type, "src0",
-                                               QUERY_DWORD_INPUT, ccbp,
-                                               dax_ctx) == 0)
+                       if (dax_map_segment_common(&ccb_addr_type, "src0",
+                                                  QUERY_DWORD_INPUT, ccbp,
+                                                  dax_ctx) == 0)
                                hdr->at_src0 = ccb_addr_type;
                }
 
                if (hdr->at_src1 == CCB_AT_VA_ALT)
-                       if (dax_map_segment_common(0, &ccb_addr_type, "src1",
+                       if (dax_map_segment_common(&ccb_addr_type, "src1",
                                                   QUERY_DWORD_SEC_INPUT, ccbp,
                                                   dax_ctx) == 0)
                                hdr->at_src1 = ccb_addr_type;
 
                if (hdr->at_tbl == CCB_AT_VA_ALT)
-                       if (dax_map_segment_common(0, &ccb_addr_type, "tbl",
+                       if (dax_map_segment_common(&ccb_addr_type, "tbl",
                                                   QUERY_DWORD_TBL, ccbp,
                                                   dax_ctx) == 0)
                                hdr->at_tbl = ccb_addr_type;