TRACE_EVENT(iwlwifi_dev_hcmd,
        TP_PROTO(const struct device *dev,
                 struct iwl_host_cmd *cmd, u16 total_size,
-                const void *hdr, size_t hdr_len),
-       TP_ARGS(dev, cmd, total_size, hdr, hdr_len),
+                struct iwl_cmd_header *hdr),
+       TP_ARGS(dev, cmd, total_size, hdr),
        TP_STRUCT__entry(
                DEV_ENTRY
                __dynamic_array(u8, hcmd, total_size)
                __field(u32, flags)
        ),
        TP_fast_assign(
-               int i, offset = hdr_len;
+               int i, offset = sizeof(*hdr);
 
                DEV_ASSIGN;
                __entry->flags = cmd->flags;
-               memcpy(__get_dynamic_array(hcmd), hdr, hdr_len);
+               memcpy(__get_dynamic_array(hcmd), hdr, sizeof(*hdr));
 
                for (i = 0; i < IWL_MAX_CMD_TFDS; i++) {
                        if (!cmd->len[i])
                                continue;
-                       if (!(cmd->dataflags[i] & IWL_HCMD_DFL_NOCOPY))
-                               continue;
                        memcpy((u8 *)__get_dynamic_array(hcmd) + offset,
                               cmd->data[i], cmd->len[i]);
                        offset += cmd->len[i];
 
        void *dup_buf = NULL;
        dma_addr_t phys_addr;
        int idx;
-       u16 copy_size, cmd_size;
+       u16 copy_size, cmd_size, dma_size;
        bool had_nocopy = false;
        int i;
        u32 cmd_pos;
+       const u8 *cmddata[IWL_MAX_CMD_TFDS];
+       u16 cmdlen[IWL_MAX_CMD_TFDS];
 
        copy_size = sizeof(out_cmd->hdr);
        cmd_size = sizeof(out_cmd->hdr);
        BUILD_BUG_ON(IWL_MAX_CMD_TFDS > IWL_NUM_OF_TBS - 1);
 
        for (i = 0; i < IWL_MAX_CMD_TFDS; i++) {
+               cmddata[i] = cmd->data[i];
+               cmdlen[i] = cmd->len[i];
+
                if (!cmd->len[i])
                        continue;
+
+               /* need at least IWL_HCMD_MIN_COPY_SIZE copied */
+               if (copy_size < IWL_HCMD_MIN_COPY_SIZE) {
+                       int copy = IWL_HCMD_MIN_COPY_SIZE - copy_size;
+
+                       if (copy > cmdlen[i])
+                               copy = cmdlen[i];
+                       cmdlen[i] -= copy;
+                       cmddata[i] += copy;
+                       copy_size += copy;
+               }
+
                if (cmd->dataflags[i] & IWL_HCMD_DFL_NOCOPY) {
                        had_nocopy = true;
                        if (WARN_ON(cmd->dataflags[i] & IWL_HCMD_DFL_DUP)) {
                                goto free_dup_buf;
                        }
 
-                       dup_buf = kmemdup(cmd->data[i], cmd->len[i],
+                       dup_buf = kmemdup(cmddata[i], cmdlen[i],
                                          GFP_ATOMIC);
                        if (!dup_buf)
                                return -ENOMEM;
                                idx = -EINVAL;
                                goto free_dup_buf;
                        }
-                       copy_size += cmd->len[i];
+                       copy_size += cmdlen[i];
                }
                cmd_size += cmd->len[i];
        }
 
        /* and copy the data that needs to be copied */
        cmd_pos = offsetof(struct iwl_device_cmd, payload);
+       copy_size = sizeof(out_cmd->hdr);
        for (i = 0; i < IWL_MAX_CMD_TFDS; i++) {
-               if (!cmd->len[i])
+               int copy = 0;
+
+               if (!cmd->len)
                        continue;
-               if (cmd->dataflags[i] & (IWL_HCMD_DFL_NOCOPY |
-                                        IWL_HCMD_DFL_DUP))
-                       break;
-               memcpy((u8 *)out_cmd + cmd_pos, cmd->data[i], cmd->len[i]);
-               cmd_pos += cmd->len[i];
+
+               /* need at least IWL_HCMD_MIN_COPY_SIZE copied */
+               if (copy_size < IWL_HCMD_MIN_COPY_SIZE) {
+                       copy = IWL_HCMD_MIN_COPY_SIZE - copy_size;
+
+                       if (copy > cmd->len[i])
+                               copy = cmd->len[i];
+               }
+
+               /* copy everything if not nocopy/dup */
+               if (!(cmd->dataflags[i] & (IWL_HCMD_DFL_NOCOPY |
+                                          IWL_HCMD_DFL_DUP)))
+                       copy = cmd->len[i];
+
+               if (copy) {
+                       memcpy((u8 *)out_cmd + cmd_pos, cmd->data[i], copy);
+                       cmd_pos += copy;
+                       copy_size += copy;
+               }
        }
 
        WARN_ON_ONCE(txq->entries[idx].copy_cmd);
                     out_cmd->hdr.cmd, le16_to_cpu(out_cmd->hdr.sequence),
                     cmd_size, q->write_ptr, idx, trans_pcie->cmd_queue);
 
-       phys_addr = dma_map_single(trans->dev, &out_cmd->hdr, copy_size,
+       /*
+        * If the entire command is smaller than IWL_HCMD_MIN_COPY_SIZE, we must
+        * still map at least that many bytes for the hardware to write back to.
+        * We have enough space, so that's not a problem.
+        */
+       dma_size = max_t(u16, copy_size, IWL_HCMD_MIN_COPY_SIZE);
+
+       phys_addr = dma_map_single(trans->dev, &out_cmd->hdr, dma_size,
                                   DMA_BIDIRECTIONAL);
        if (unlikely(dma_mapping_error(trans->dev, phys_addr))) {
                idx = -ENOMEM;
        }
 
        dma_unmap_addr_set(out_meta, mapping, phys_addr);
-       dma_unmap_len_set(out_meta, len, copy_size);
+       dma_unmap_len_set(out_meta, len, dma_size);
 
        iwl_pcie_txq_build_tfd(trans, txq, phys_addr, copy_size, 1);
 
+       /* map the remaining (adjusted) nocopy/dup fragments */
        for (i = 0; i < IWL_MAX_CMD_TFDS; i++) {
-               const void *data = cmd->data[i];
+               const void *data = cmddata[i];
 
-               if (!cmd->len[i])
+               if (!cmdlen[i])
                        continue;
                if (!(cmd->dataflags[i] & (IWL_HCMD_DFL_NOCOPY |
                                           IWL_HCMD_DFL_DUP)))
                if (cmd->dataflags[i] & IWL_HCMD_DFL_DUP)
                        data = dup_buf;
                phys_addr = dma_map_single(trans->dev, (void *)data,
-                                          cmd->len[i], DMA_BIDIRECTIONAL);
+                                          cmdlen[i], DMA_BIDIRECTIONAL);
                if (dma_mapping_error(trans->dev, phys_addr)) {
                        iwl_pcie_tfd_unmap(trans, out_meta,
                                           &txq->tfds[q->write_ptr],
                        goto out;
                }
 
-               iwl_pcie_txq_build_tfd(trans, txq, phys_addr, cmd->len[i], 0);
+               iwl_pcie_txq_build_tfd(trans, txq, phys_addr, cmdlen[i], 0);
        }
 
        out_meta->flags = cmd->flags;
 
        txq->need_update = 1;
 
-       trace_iwlwifi_dev_hcmd(trans->dev, cmd, cmd_size,
-                              &out_cmd->hdr, copy_size);
+       trace_iwlwifi_dev_hcmd(trans->dev, cmd, cmd_size, &out_cmd->hdr);
 
        /* start timer if queue currently empty */
        if (q->read_ptr == q->write_ptr && trans_pcie->wd_timeout)