Orabug: 25911008
Orabug: 25931417
The reported kernel panics and "other oddities" are caused by
corruption of kernel memory by DAX output. This is due to an apparent
change between UEK2 and UEK4 in the underlying h/w page size for
memory acquired via kmalloc. UEK2 used 4MB pages, which the dax
driver relied on to limit the output from the coprocessor, since the
coprocessor refuses to cross page boundaries. UEK4 appears to take a
more intelligent approach to memory, and kernel memory may be backed
by a variety of huge h/w page sizes, e.g., 256MB and 2GB. This allows
DAX to produce output up to this much larger page boundary, writing
beyond the actual allocation. We have no way to kmalloc memory with a
particular backing page size, and we cannot feed DAX a virtual
address if we are not certain of its page size.
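To make the failure mode concrete, here is an illustrative sketch
(not code from this patch; max_dax_write() is a hypothetical helper):
the coprocessor clips its output at the boundary of the h/w page
backing the buffer, not at the end of the allocation.

	#include <linux/kernel.h>	/* ALIGN() */

	/* Bytes DAX may legally write starting at va before hitting
	 * the end of the backing h/w page.
	 */
	static unsigned long max_dax_write(unsigned long va,
					   unsigned long hw_page_size)
	{
		return ALIGN(va + 1, hw_page_size) - va;
	}

For a small kmalloc buffer, a 4MB backing page bounds the damage at
4MB; a 2GB backing page lets the same CCB write up to 2GB past the
allocation.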
Recent hypervisor f/w provides a powerful new feature: the ability to
convey page size bits along with a real address (RA). This lets us
avoid using the TLB/TSB as a parameter passing mechanism, and indeed
avoid using virtual addresses at all in a DAX ccb. We now use this
mechanism to set the page size for dax_alloc memory to 4MB even when
the underlying h/w page size for the memory is much larger. Memory
allocated by the application via conventional methods is not
affected.
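A condensed sketch of the encoding (dax_encode_ra() is a hypothetical
helper; the shift and page size values are the ones added by this
patch, per the VM spec sections cited in the code):

	/* The page size bits occupy the top byte of the RA dword in
	 * a ccb.  0xf (NO_PAGE_RANGE) requests no page range check,
	 * the old behavior; _PAGE_SZ4MB_4V asks the HV to bound the
	 * transfer to a 4MB page even when the backing h/w page is
	 * 256MB or 2GB.
	 */
	static u64 dax_encode_ra(u64 ra, u64 pgsz_bits)
	{
		return (pgsz_bits << PAGE_CHECK_SHIFT) | ra;
	}

This is exactly how dax_map_segment_common() below now builds the
output dword for dax_alloc memory: CHECK_4MB_PAGE_RANGE | (dv->pa +
offset).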
This HV feature is available on M7 f/w with API minor number 1, so
the negotiated minor number is used to determine whether the driver
can provide the memory allocation service. If the feature is not
available, DAX still works, but all responsibility for memory
allocation falls back to the application.
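In code the test is a single comparison against the negotiated API
version, mirroring the hunk below:

	/* RA page size bits are present from DAX API version 1.1 on,
	 * so major 1 with minor 0 means no memory allocation service:
	 * the dax_alloc mmap entry point returns -ENODEV and the
	 * application must supply its own buffers.
	 */
	dax_no_ra_pgsz = (DAX_MAJOR == 1) && (minor == 0);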
The sudden ENOACCESS errors are the result of another hypervisor
change: the newest HV firmware has begun to enforce the privileged
flag (bit 14) in arg2 of the ccb_submit hypercall. This flag is
described in the API
wiki as "CCB virtual addresses are treated as privileged" and in the
VM spec as "Virtual addresses within CCBs are translated in privileged
context". The explanation given later in the VM spec is that if a CCB
contains any virtual address whose TTE has the priv bit set
(_PAGE_P_4V), then the priv flag in the ccb_submit API must also be
set, or else the hypervisor will refuse to perform the translation,
and an ENOACCESS error will be thrown. Since privileged virtual
addresses are no longer used as a result of this very commit, this
problem simply disappears.
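For reference, a sketch of the flag usage (the literal value of
HV_DAX_CCB_VA_PRIVILEGED is an assumption based on the "bit 14"
description above; the call itself matches the hunk below):

	/* Assumed definition: bit 14 of ccb_submit arg2 */
	#define HV_DAX_CCB_VA_PRIVILEGED	(1 << 14)

	/* Must be set whenever any VA inside the submitted CCB
	 * translates through a privileged (priv-bit) mapping.
	 */
	hv_rv = sun4v_dax_ccb_submit((void *) ra, 64,
				     HV_DAX_CCB_VA_PRIVILEGED |
				     HV_DAX_QUERY_CMD, 0,
				     &submitted_ccb_buf_len, &nomap_va);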
Signed-off-by: Rob Gardner <rob.gardner@oracle.com>
Reviewed-by: Jonathan Helman <jonathan.helman@oracle.com>
Signed-off-by: Allen Pais <allen.pais@oracle.com>
#include "ccb.h"
#include "sys_dax.h"
-extern bool dax_no_flow_ctl;
+extern bool dax_no_flow_ctl, dax_no_ra_pgsz;
extern int dax_debug;
extern atomic_t dax_alloc_counter;
extern atomic_t dax_actual_mem;
#define CCB_HDR(ccb) ((struct ccb_hdr *)(ccb))
#define IS_LONG_CCB(ccb) ((CCB_HDR(ccb))->sync_flags & CCB_SYNC_LONGCCB)
/* VM spec 36.2.1.1.8 & 36.2.1.2 / PRM 23.7.1 */
-#define NO_PAGE_RANGE_CHECK (0xfLL << 56)
+#define PAGE_CHECK_SHIFT 56
+#define NO_PAGE_RANGE 0xfLL
+#define NO_PAGE_RANGE_CHECK (NO_PAGE_RANGE << PAGE_CHECK_SHIFT)
+#define CHECK_4MB_PAGE_RANGE (_PAGE_SZ4MB_4V << PAGE_CHECK_SHIFT)
#define DAX_CCB_WAIT_USEC 100
#define DAX_CCB_WAIT_RETRIES_MAX 10000
atomic_t dax_requested_mem = ATOMIC_INIT(0);
int dax_debug;
-bool dax_no_flow_ctl;
+bool dax_no_flow_ctl, dax_no_ra_pgsz;
/* driver public entry points */
static long dax_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
DAX_MAJOR, minor);
}
+ dax_no_ra_pgsz = (DAX_MAJOR == 1) && (minor == 0);
+ dax_dbg("RA pagesize feature %spresent", dax_no_ra_pgsz ? "not " : "");
+
ret = hv_get_hwqueue_size(&max_ccbs);
if (ret != 0) {
dax_err("get_hwqueue_size failed with status=%d and max_ccbs=%ld",
dax_dbg("Flow control disabled by software, dax_alloc restricted to 4M");
dax_no_flow_ctl = true;
} else if ((dax_type == DAX1) && !dax_has_flow_ctl_numa()) {
- dax_dbg("Flow control disabled by hardware, dax_alloc restricted to 4M");
+ dax_dbg("Flow control disabled by hardware, dax_alloc (if available) restricted to 4M");
dax_no_flow_ctl = true;
} else {
dax_dbg("Flow control enabled");
ra = virt_to_phys(ccb);
- hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_QUERY_CMD, 0,
+ hv_rv = sun4v_dax_ccb_submit((void *) ra, 64, HV_DAX_CCB_VA_PRIVILEGED | HV_DAX_QUERY_CMD, 0,
&submitted_ccb_buf_len, &nomap_va);
if (hv_rv != HV_EOK) {
dax_info("failed dax submit, ret=0x%lx", hv_rv);
void dax_overflow_check(struct dax_ctx *ctx, int idx)
{
- unsigned long output_size, input_size, virtp;
- unsigned long page_size = PAGE_SIZE;
+ unsigned long virtp, page_size = PAGE_SIZE;
struct ccb_hdr *hdr;
union ccb *ccb;
- struct ccb_data_acc_ctl *access;
struct vm_area_struct *vma;
struct ccb_completion_area *ca = &ctx->ca_buf[idx];
ccb = &ctx->ccb_buf[idx];
hdr = CCB_HDR(ccb);
- access = (struct ccb_data_acc_ctl *) &ccb->dwords[QUERY_DWORD_DAC];
- output_size = access->output_buf_sz * 64 + 64;
- input_size = access->input_cnt + 1;
-
dax_dbg("*************************");
dax_dbg("*DAX Page Overflow Report:");
- dax_dbg("* Output size requested = 0x%lx, output size produced = 0x%x",
- output_size, ca->output_sz);
- dax_dbg("* Input size requested = 0x%lx, input size processed = 0x%x",
- input_size, ca->n_processed);
- dax_dbg("* User virtual address analysis:");
+ dax_dbg("* Output size produced = 0x%x", ca->output_sz);
+ dax_dbg("* Input size processed = 0x%x", ca->n_processed);
+ dax_dbg("* Address analysis:");
virtp = ccb->dwords[QUERY_DWORD_OUTPUT];
if (hdr->at_dst == CCB_AT_RA) {
- dax_dbg("* Output address = 0x%lx physical, so no overflow possible",
- virtp);
- } else {
- /* output buffer was virtual, so page overflow is possible */
- if (hdr->at_dst == CCB_AT_VA_ALT) {
- if (current->mm == NULL)
- return;
-
- vma = find_vma(current->mm, virtp);
- if (vma == NULL)
- dax_dbg("* Output address = 0x%lx but is demapped, which precludes analysis",
- virtp);
- else
- page_size = vma_kernel_pagesize(vma);
- } else if (hdr->at_dst == CCB_AT_VA) {
- page_size = DAX_SYN_LARGE_PAGE_SIZE;
+ page_size = DAX_SYN_LARGE_PAGE_SIZE;
+ } else if (hdr->at_dst == CCB_AT_VA_ALT) {
+ if (current->mm == NULL)
+ return;
+
+ vma = find_vma(current->mm, virtp);
+ if (vma == NULL) {
+ dax_dbg("* Output address = 0x%lx but is demapped, which precludes analysis",
+ virtp);
+ goto done;
+ } else {
+ page_size = vma_kernel_pagesize(vma);
}
+ }
- dax_dbg("* Output address = 0x%lx, page size = 0x%lx; page overflow %s",
- virtp, page_size,
- (virtp + ca->output_sz >= ALIGN(virtp + 1, page_size)) ?
- "LIKELY" : "UNLIKELY");
- dax_dbg("* Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
- ca->output_sz,
- (virtp + ca->output_sz >= ALIGN(virtp + 1, page_size)) ?
+ dax_dbg("* Output size produced (0x%x) is %s the page bounds 0x%lx..0x%lx",
+ ca->output_sz,
+ (virtp + ca->output_sz > ALIGN(virtp + 1, page_size)) ?
"OUTSIDE" : "WITHIN",
virtp, ALIGN(virtp + 1, page_size));
- }
- virtp = ccb->dwords[QUERY_DWORD_INPUT];
- if (hdr->at_src0 == CCB_AT_RA) {
- dax_dbg("* Input address = 0x%lx physical, so no overflow possible",
- virtp);
- } else {
- if (hdr->at_src0 == CCB_AT_VA_ALT) {
- if (current->mm == NULL)
- return;
-
- vma = find_vma(current->mm, virtp);
- if (vma == NULL)
- dax_dbg("* Input address = 0x%lx but is demapped, which precludes analysis",
- virtp);
- else
- page_size = vma_kernel_pagesize(vma);
- } else if (hdr->at_src0 == CCB_AT_VA) {
- page_size = DAX_SYN_LARGE_PAGE_SIZE;
- }
-
- dax_dbg("* Input address = 0x%lx, page size = 0x%lx; page overflow %s",
- virtp, page_size,
- (virtp + input_size >=
- ALIGN(virtp + 1, page_size)) ?
- "LIKELY" : "UNLIKELY");
- dax_dbg("* Input size processed (0x%x) is %s the page bounds 0x%lx..0x%lx",
- ca->n_processed,
- (virtp + ca->n_processed >=
- ALIGN(virtp + 1, page_size)) ?
- "OUTSIDE" : "WITHIN",
- virtp, ALIGN(virtp + 1, page_size));
- }
+done:
dax_dbg("*************************");
}
int ret = -ENOMEM;
struct dax_ctx *dax_ctx = (struct dax_ctx *) filp->private_data;
+ if (dax_no_ra_pgsz) {
+ ret = -ENODEV;
+ goto done;
+ }
+
len = vma->vm_end - vma->vm_start;
if (len & (PAGE_SIZE - 1)) {
dax_err("request (0x%lx) not a multiple of page size", len);
return 0;
}
-int dax_map_segment_common(unsigned long size,
- u32 *ccb_addr_type, char *name,
+int dax_map_segment_common(u32 *ccb_addr_type, char *name,
u32 addr_sel, union ccb *ccbp,
struct dax_ctx *dax_ctx)
{
struct vm_area_struct *vma;
unsigned long virtp = ccbp->dwords[addr_sel];
- dax_map_dbg("%s uva 0x%lx, size=0x%lx", name, virtp, size);
+ dax_map_dbg("%s uva 0x%lx", name, virtp);
vma = find_vma(dax_ctx->dax_mm->this_mm, virtp);
if (vma == NULL)
if (dv == NULL || vma->vm_ops != &dax_vm_ops)
return -1;
- /*
- * check if user provided size is within the vma bounds.
- */
- if ((virtp + size) > vma->vm_end) {
- dax_err("%s buffer 0x%lx+0x%lx overflows page 0x%lx+0x%lx",
- name, virtp, size, dv->pa, dv->length);
- return -1;
- }
-
dax_vm_print("matched", dv);
if (dax_no_flow_ctl) {
- *ccb_addr_type = CCB_AT_VA;
- ccbp->dwords[addr_sel] = (unsigned long)dv->kva +
- (virtp - vma->vm_start);
- /* touch va to fault translation into tlb/tsb */
- READ_ONCE(*(u8 *)ccbp->dwords[addr_sel]);
-
- dax_map_dbg("changed %s to KVA 0x%llx", name,
+ *ccb_addr_type = CCB_AT_RA;
+ ccbp->dwords[addr_sel] = CHECK_4MB_PAGE_RANGE |
+ (dv->pa + (virtp - vma->vm_start));
+ dax_map_dbg("changed %s to RA 0x%llx", name,
ccbp->dwords[addr_sel]);
} else {
*ccb_addr_type = CCB_AT_RA;
{
int i;
int nelem = CCB_BYTE_TO_NCCB(ccb_len);
- struct ccb_data_acc_ctl *access;
- unsigned long size;
u32 ccb_addr_type;
for (i = 0; i < nelem; i++) {
dax_dbg("ccb[%d]=0x%p, idx=%d, at_dst=%d",
i, ccbp, idx, hdr->at_dst);
if (hdr->at_dst == CCB_AT_VA_ALT) {
- access = (struct ccb_data_acc_ctl *)
- &ccbp->dwords[QUERY_DWORD_DAC];
- /* size in bytes */
- size = DAX_OUT_SIZE_FROM_CCB(access->output_buf_sz);
-
- if (dax_map_segment_common(size, &ccb_addr_type, "dst",
+ if (dax_map_segment_common(&ccb_addr_type, "dst",
QUERY_DWORD_OUTPUT, ccbp,
- dax_ctx) == 0) {
+ dax_ctx) == 0)
hdr->at_dst = ccb_addr_type;
- /* enforce flow limit */
- if (hdr->at_dst == CCB_AT_RA)
- access->flow_ctl =
- DAX_BUF_LIMIT_FLOW_CTL;
- }
}
if (hdr->at_src0 == CCB_AT_VA_ALT) {
- if (dax_map_segment_common(0, &ccb_addr_type, "src0",
- QUERY_DWORD_INPUT, ccbp,
- dax_ctx) == 0)
+ if (dax_map_segment_common(&ccb_addr_type, "src0",
+ QUERY_DWORD_INPUT, ccbp,
+ dax_ctx) == 0)
hdr->at_src0 = ccb_addr_type;
}
if (hdr->at_src1 == CCB_AT_VA_ALT)
- if (dax_map_segment_common(0, &ccb_addr_type, "src1",
+ if (dax_map_segment_common(&ccb_addr_type, "src1",
QUERY_DWORD_SEC_INPUT, ccbp,
dax_ctx) == 0)
hdr->at_src1 = ccb_addr_type;
if (hdr->at_tbl == CCB_AT_VA_ALT)
- if (dax_map_segment_common(0, &ccb_addr_type, "tbl",
+ if (dax_map_segment_common(&ccb_addr_type, "tbl",
QUERY_DWORD_TBL, ccbp,
dax_ctx) == 0)
hdr->at_tbl = ccb_addr_type;