From 3926ce77988ad3024493eae003c48970b768cf20 Mon Sep 17 00:00:00 2001
From: Rob Gardner
Date: Fri, 21 Apr 2017 17:10:34 -0600
Subject: [PATCH] sparc64: DAX memory will use RA+PGSZ feature in HV

Orabug: 25911008
Orabug: 25931417

The reported kernel panics and "other oddities" are caused by
corruption of kernel memory by DAX output. This happens due to an
apparent change between UEK2 and UEK4, whereby the underlying h/w page
size for memory acquired via kmalloc has changed. UEK2 used 4MB pages,
which the dax driver used to limit the output from the coprocessor,
which refuses to cross "page boundaries". But in UEK4 it appears that
a more intelligent approach to memory is used, and kernel memory may
be backed by a variety of huge h/w page sizes, e.g., 256MB and 2GB.
This now allows DAX to produce output up to this much larger page
size, thus writing beyond the actual allocation.

We do not have any way to kmalloc memory with a specific backing page
size, and we cannot feed DAX a virtual address if we are not certain
of its page size. Recent hypervisor f/w provides a powerful new
feature: the ability to convey page size bits along with a real
address (RA). This gives us the opportunity to avoid using the TLB/TSB
as a parameter passing mechanism, and we use it to avoid using virtual
addresses at all in a DAX ccb. We now use this mechanism to set the
page size for dax_alloc memory to 4MB even if the underlying h/w page
size for the memory is much larger. Memory allocated by the
application via conventional methods is not affected.

This HV feature is available on M7 f/w with minor number 1, so this is
used to determine whether the driver can provide the memory allocation
service. If the feature is not available, DAX still works, but all
responsibility for memory allocation falls back to the application.

The sudden ENOACCESS errors are the result of another hypervisor
change. The newest HV firmware has begun to enforce the privileged
flag (bit 14) in arg2 of the ccb_submit hypercall. This flag is
described in the API wiki as "CCB virtual addresses are treated as
privileged" and in the VM spec as "Virtual addresses within CCBs are
translated in privileged context". The explanation given later in the
VM spec is that if a CCB contains any virtual address whose TTE has
the priv bit set (_PAGE_P_4V), then the priv flag in the ccb_submit
API must also be set, or else the hypervisor will refuse to perform
the translation and an ENOACCESS error will be returned. Since
privileged virtual addresses are no longer used as a result of this
very commit, this problem simply disappears.
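As a minimal sketch (illustration only, not part of the diff below),
the address rewrite that dax_map_segment_common() now performs amounts
to packing a page size code into bits 56..59 of the real address that
is placed in the CCB, using the new macros from dax_impl.h:

	/* translate the user VA to an RA within the dax_alloc region */
	unsigned long ra = dv->pa + (virtp - vma->vm_start);

	/*
	 * Hand DAX the RA tagged as a 4MB page, so the coprocessor
	 * bounds its output to 4MB no matter how large the backing
	 * h/w page really is.  A value of 0xf in the same field means
	 * "no page range check" (NO_PAGE_RANGE_CHECK).
	 */
	ccbp->dwords[addr_sel] = CHECK_4MB_PAGE_RANGE | ra;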
Signed-off-by: Rob Gardner
Reviewed-by: Jonathan Helman
Signed-off-by: Allen Pais
---
 arch/sparc/dax/dax_impl.h |  7 ++-
 arch/sparc/dax/dax_main.c |  7 ++-
 arch/sparc/dax/dax_misc.c | 90 ++++++++++-----------------------------
 arch/sparc/dax/dax_mm.c   | 56 ++++++++----------------
 4 files changed, 50 insertions(+), 110 deletions(-)

diff --git a/arch/sparc/dax/dax_impl.h b/arch/sparc/dax/dax_impl.h
index 045cc92b30ff..fb310fd6d1c4 100644
--- a/arch/sparc/dax/dax_impl.h
+++ b/arch/sparc/dax/dax_impl.h
@@ -34,7 +34,7 @@
 #include "ccb.h"
 #include "sys_dax.h"
 
-extern bool dax_no_flow_ctl;
+extern bool dax_no_flow_ctl, dax_no_ra_pgsz;
 extern int dax_debug;
 extern atomic_t dax_alloc_counter;
 extern atomic_t dax_actual_mem;
@@ -132,7 +132,10 @@ extern const struct vm_operations_struct dax_vm_ops;
 #define CCB_HDR(ccb) ((struct ccb_hdr *)(ccb))
 #define IS_LONG_CCB(ccb) ((CCB_HDR(ccb))->sync_flags & CCB_SYNC_LONGCCB)
 /* VM spec 36.2.1.1.8 & 36.2.1.2 / PRM 23.7.1 */
-#define NO_PAGE_RANGE_CHECK (0xfLL << 56)
+#define PAGE_CHECK_SHIFT 56
+#define NO_PAGE_RANGE 0xfLL
+#define NO_PAGE_RANGE_CHECK (NO_PAGE_RANGE << PAGE_CHECK_SHIFT)
+#define CHECK_4MB_PAGE_RANGE (_PAGE_SZ4MB_4V << PAGE_CHECK_SHIFT)
 
 #define DAX_CCB_WAIT_USEC 100
 #define DAX_CCB_WAIT_RETRIES_MAX 10000
diff --git a/arch/sparc/dax/dax_main.c b/arch/sparc/dax/dax_main.c
index 078ea1ce9265..3fe8ca88190d 100644
--- a/arch/sparc/dax/dax_main.c
+++ b/arch/sparc/dax/dax_main.c
@@ -15,7 +15,7 @@
 atomic_t dax_alloc_counter = ATOMIC_INIT(0);
 atomic_t dax_requested_mem = ATOMIC_INIT(0);
 int dax_debug;
-bool dax_no_flow_ctl;
+bool dax_no_flow_ctl, dax_no_ra_pgsz;
 
 /* driver public entry points */
 static long dax_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
@@ -141,6 +141,9 @@ static int __init dax_attach(void)
 			DAX_MAJOR, minor);
 	}
 
+	dax_no_ra_pgsz = (DAX_MAJOR == 1) && (minor == 0);
+	dax_dbg("RA pagesize feature %spresent", dax_no_ra_pgsz ? "not " : "");
+
 	ret = hv_get_hwqueue_size(&max_ccbs);
 	if (ret != 0) {
 		dax_err("get_hwqueue_size failed with status=%d and max_ccbs=%ld",
@@ -170,7 +173,7 @@ static int __init dax_attach(void)
 		dax_dbg("Flow control disabled by software, dax_alloc restricted to 4M");
 		dax_no_flow_ctl = true;
 	} else if ((dax_type == DAX1) && !dax_has_flow_ctl_numa()) {
-		dax_dbg("Flow control disabled by hardware, dax_alloc restricted to 4M");
+		dax_dbg("Flow control disabled by hardware, dax_alloc (if available) restricted to 4M");
 		dax_no_flow_ctl = true;
 	} else {
 		dax_dbg("Flow control enabled");
diff --git a/arch/sparc/dax/dax_misc.c b/arch/sparc/dax/dax_misc.c
index 89b38988f092..443b30183976 100644
--- a/arch/sparc/dax/dax_misc.c
+++ b/arch/sparc/dax/dax_misc.c
@@ -57,7 +57,7 @@ static int dax_has_flow_ctl_one_node(void)
 
 	ra = virt_to_phys(ccb);
 
-	hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_QUERY_CMD, 0,
+	hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_CCB_VA_PRIVILEGED | HV_DAX_QUERY_CMD, 0,
				     &submitted_ccb_buf_len, &nomap_va);
 	if (hv_rv != HV_EOK) {
 		dax_info("failed dax submit, ret=0x%lx", hv_rv);
@@ -161,11 +161,9 @@ bool dax_has_flow_ctl_numa(void)
 
 void dax_overflow_check(struct dax_ctx *ctx, int idx)
 {
-	unsigned long output_size, input_size, virtp;
-	unsigned long page_size = PAGE_SIZE;
+	unsigned long virtp, page_size = PAGE_SIZE;
 	struct ccb_hdr *hdr;
 	union ccb *ccb;
-	struct ccb_data_acc_ctl *access;
 	struct vm_area_struct *vma;
 	struct ccb_completion_area *ca = &ctx->ca_buf[idx];
 
@@ -181,80 +179,36 @@ void dax_overflow_check(struct dax_ctx *ctx, int idx)
 	ccb = &ctx->ccb_buf[idx];
 	hdr = CCB_HDR(ccb);
 
-	access = (struct ccb_data_acc_ctl *) &ccb->dwords[QUERY_DWORD_DAC];
-	output_size = access->output_buf_sz * 64 + 64;
-	input_size = access->input_cnt + 1;
-
 	dax_dbg("*************************");
 	dax_dbg("*DAX Page Overflow Report:");
-	dax_dbg("* Output size requested = 0x%lx, output size produced = 0x%x",
-		output_size, ca->output_sz);
-	dax_dbg("* Input size requested = 0x%lx, input size processed = 0x%x",
-		input_size, ca->n_processed);
-	dax_dbg("* User virtual address analysis:");
+	dax_dbg("* Output size produced = 0x%x", ca->output_sz);
+	dax_dbg("* Input size processed = 0x%x", ca->n_processed);
+	dax_dbg("* Address analysis:");
 	virtp = ccb->dwords[QUERY_DWORD_OUTPUT];
 
 	if (hdr->at_dst == CCB_AT_RA) {
-		dax_dbg("* Output address = 0x%lx physical, so no overflow possible",
-			virtp);
-	} else {
-		/* output buffer was virtual, so page overflow is possible */
-		if (hdr->at_dst == CCB_AT_VA_ALT) {
-			if (current->mm == NULL)
-				return;
-
-			vma = find_vma(current->mm, virtp);
-			if (vma == NULL)
-				dax_dbg("* Output address = 0x%lx but is demapped, which precludes analysis",
-					virtp);
-			else
-				page_size = vma_kernel_pagesize(vma);
-		} else if (hdr->at_dst == CCB_AT_VA) {
-			page_size = DAX_SYN_LARGE_PAGE_SIZE;
+		page_size = DAX_SYN_LARGE_PAGE_SIZE;
+	} else if (hdr->at_dst == CCB_AT_VA_ALT) {
+		if (current->mm == NULL)
+			return;
+
+		vma = find_vma(current->mm, virtp);
+		if (vma == NULL) {
+			dax_dbg("* Output address = 0x%lx but is demapped, which precludes analysis",
+				virtp);
+			goto done;
+		} else {
+			page_size = vma_kernel_pagesize(vma);
 		}
+	}
 
-		dax_dbg("* Output address = 0x%lx, page size = 0x%lx; page overflow %s",
-			virtp, page_size,
-			(virtp + ca->output_sz >= ALIGN(virtp + 1, page_size)) ?
-			"LIKELY" : "UNLIKELY");
-	dax_dbg("* Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
-		ca->output_sz,
-		(virtp + ca->output_sz >= ALIGN(virtp + 1, page_size)) ?
+	dax_dbg("* Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
+		ca->output_sz,
+		(virtp + ca->output_sz > ALIGN(virtp + 1, page_size)) ?
 		"OUTSIDE" : "WITHIN",
 		virtp, ALIGN(virtp + 1, page_size));
-	}
 
-	virtp = ccb->dwords[QUERY_DWORD_INPUT];
-	if (hdr->at_src0 == CCB_AT_RA) {
-		dax_dbg("* Input address = 0x%lx physical, so no overflow possible",
-			virtp);
-	} else {
-		if (hdr->at_src0 == CCB_AT_VA_ALT) {
-			if (current->mm == NULL)
-				return;
-
-			vma = find_vma(current->mm, virtp);
-			if (vma == NULL)
-				dax_dbg("* Input address = 0x%lx but is demapped, which precludes analysis",
-					virtp);
-			else
-				page_size = vma_kernel_pagesize(vma);
-		} else if (hdr->at_src0 == CCB_AT_VA) {
-			page_size = DAX_SYN_LARGE_PAGE_SIZE;
-		}
-
-		dax_dbg("* Input address = 0x%lx, page size = 0x%lx; page overflow %s",
-			virtp, page_size,
-			(virtp + input_size >=
-			 ALIGN(virtp + 1, page_size)) ?
-			"LIKELY" : "UNLIKELY");
-		dax_dbg("* Input size processed (0x%x) is %s the page bounds 0x%lx..0x%lx",
-			ca->n_processed,
-			(virtp + ca->n_processed >=
-			 ALIGN(virtp + 1, page_size)) ?
-			"OUTSIDE" : "WITHIN",
-			virtp, ALIGN(virtp + 1, page_size));
-	}
+done:
 	dax_dbg("*************************");
 }
diff --git a/arch/sparc/dax/dax_mm.c b/arch/sparc/dax/dax_mm.c
index 890e2c4d8e52..1badae6d36e4 100644
--- a/arch/sparc/dax/dax_mm.c
+++ b/arch/sparc/dax/dax_mm.c
@@ -35,6 +35,11 @@ static int dax_alloc_ram(struct file *filp, struct vm_area_struct *vma)
 	int ret = -ENOMEM;
 	struct dax_ctx *dax_ctx = (struct dax_ctx *) filp->private_data;
 
+	if (dax_no_ra_pgsz) {
+		ret = -ENODEV;
+		goto done;
+	}
+
 	len = vma->vm_end - vma->vm_start;
 	if (len & (PAGE_SIZE - 1)) {
 		dax_err("request (0x%lx) not a multiple of page size", len);
@@ -158,8 +163,7 @@ int dax_devmap(struct file *f, struct vm_area_struct *vma)
 	return 0;
 }
 
-int dax_map_segment_common(unsigned long size,
-			   u32 *ccb_addr_type, char *name,
+int dax_map_segment_common(u32 *ccb_addr_type, char *name,
 			   u32 addr_sel, union ccb *ccbp,
 			   struct dax_ctx *dax_ctx)
 {
@@ -167,7 +171,7 @@ int dax_map_segment_common(unsigned long size,
 	struct vm_area_struct *vma;
 	unsigned long virtp = ccbp->dwords[addr_sel];
 
-	dax_map_dbg("%s uva 0x%lx, size=0x%lx", name, virtp, size);
+	dax_map_dbg("%s uva 0x%lx", name, virtp);
 	vma = find_vma(dax_ctx->dax_mm->this_mm, virtp);
 	if (vma == NULL)
 		return -1;
@@ -179,24 +183,12 @@ int dax_map_segment_common(unsigned long size,
 	if (dv == NULL || vma->vm_ops != &dax_vm_ops)
 		return -1;
 
-	/*
-	 * check if user provided size is within the vma bounds.
-	 */
-	if ((virtp + size) > vma->vm_end) {
-		dax_err("%s buffer 0x%lx+0x%lx overflows page 0x%lx+0x%lx",
-			name, virtp, size, dv->pa, dv->length);
-		return -1;
-	}
-
 	dax_vm_print("matched", dv);
 	if (dax_no_flow_ctl) {
-		*ccb_addr_type = CCB_AT_VA;
-		ccbp->dwords[addr_sel] = (unsigned long)dv->kva +
-			(virtp - vma->vm_start);
-		/* touch va to fault translation into tlb/tsb */
-		READ_ONCE(*(u8 *)ccbp->dwords[addr_sel]);
-
-		dax_map_dbg("changed %s to KVA 0x%llx", name,
+		*ccb_addr_type = CCB_AT_RA;
+		ccbp->dwords[addr_sel] = CHECK_4MB_PAGE_RANGE |
+			(dv->pa + (virtp - vma->vm_start));
+		dax_map_dbg("changed %s to RA 0x%llx", name,
 			    ccbp->dwords[addr_sel]);
 	} else {
 		*ccb_addr_type = CCB_AT_RA;
@@ -217,8 +209,6 @@ void dax_map_segment(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
 {
 	int i;
 	int nelem = CCB_BYTE_TO_NCCB(ccb_len);
-	struct ccb_data_acc_ctl *access;
-	unsigned long size;
 	u32 ccb_addr_type;
 
 	for (i = 0; i < nelem; i++) {
@@ -232,37 +222,27 @@ void dax_map_segment(struct dax_ctx *dax_ctx, union ccb *ccb, size_t ccb_len)
 		dax_dbg("ccb[%d]=0x%p, idx=%d, at_dst=%d",
 			i, ccbp, idx, hdr->at_dst);
 		if (hdr->at_dst == CCB_AT_VA_ALT) {
-			access = (struct ccb_data_acc_ctl *)
-				 &ccbp->dwords[QUERY_DWORD_DAC];
-			/* size in bytes */
-			size = DAX_OUT_SIZE_FROM_CCB(access->output_buf_sz);
-
-			if (dax_map_segment_common(size, &ccb_addr_type, "dst",
+			if (dax_map_segment_common(&ccb_addr_type, "dst",
						   QUERY_DWORD_OUTPUT, ccbp,
-						   dax_ctx) == 0) {
+						   dax_ctx) == 0)
 				hdr->at_dst = ccb_addr_type;
-				/* enforce flow limit */
-				if (hdr->at_dst == CCB_AT_RA)
-					access->flow_ctl =
-						DAX_BUF_LIMIT_FLOW_CTL;
-			}
 		}
 
 		if (hdr->at_src0 == CCB_AT_VA_ALT) {
-			if (dax_map_segment_common(0, &ccb_addr_type, "src0",
-						   QUERY_DWORD_INPUT, ccbp,
-						   dax_ctx) == 0)
+			if (dax_map_segment_common(&ccb_addr_type, "src0",
+						   QUERY_DWORD_INPUT, ccbp,
+						   dax_ctx) == 0)
 				hdr->at_src0 = ccb_addr_type;
 		}
 
 		if (hdr->at_src1 == CCB_AT_VA_ALT)
-			if (dax_map_segment_common(0, &ccb_addr_type, "src1",
+			if (dax_map_segment_common(&ccb_addr_type, "src1",
						   QUERY_DWORD_SEC_INPUT, ccbp,
						   dax_ctx) == 0)
				hdr->at_src1 = ccb_addr_type;
 
 		if (hdr->at_tbl == CCB_AT_VA_ALT)
-			if (dax_map_segment_common(0, &ccb_addr_type, "tbl",
+			if (dax_map_segment_common(&ccb_addr_type, "tbl",
						   QUERY_DWORD_TBL, ccbp,
						   dax_ctx) == 0)
				hdr->at_tbl = ccb_addr_type;
-- 
2.50.1