spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
 }
 
+/*
+ * Used to force an update of pioavailshadow if we can't get a pio buffer.
+ * Needed primarily due to exiting freeze mode after recovering
+ * from errors.  Done lazily, because it's safer (known to not
+ * be writing pio buffers).
+ */
+static void ipath_reset_availshadow(struct ipath_devdata *dd)
+{
+       int i, im;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       for (i = 0; i < dd->ipath_pioavregs; i++) {
+               u64 val, oldval;
+               /* deal with 6110 chip bug on high register #s */
+               im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+                       i ^ 1 : i;
+               val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
+               /*
+                * busy out the buffers not in the kernel avail list,
+                * without changing the generation bits.
+                */
+               oldval = dd->ipath_pioavailshadow[i];
+               dd->ipath_pioavailshadow[i] = val |
+                       ((~dd->ipath_pioavailkernel[i] <<
+                       INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
+                       0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
+               if (oldval != dd->ipath_pioavailshadow[i])
+                       ipath_dbg("shadow[%d] was %llx, now %lx\n",
+                               i, (unsigned long long) oldval,
+                               dd->ipath_pioavailshadow[i]);
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
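
For reference, the busy-out step in ipath_reset_availshadow() reduces to a
pure bit manipulation per qword.  A minimal sketch (not driver code),
assuming INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT == 1, i.e. the odd bit of each
two-bit pair is the BUSY bit and the even bit is the generation bit:

	/* force BUSY for every buffer the kernel does not own; the
	 * generation (even) bits from the DMA copy pass through */
	static inline u64 busy_out_nonkernel(u64 dma_qword, u64 kernel_map)
	{
		return dma_qword |
			((~kernel_map << 1) & 0xaaaaaaaaaaaaaaaaULL);
	}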
 /**
  * ipath_setrcvhdrsize - set the receive header size
  * @dd: the infinipath device
         */
        ipath_stats.sps_nopiobufs++;
        if (!(++dd->ipath_consec_nopiobuf % 100000)) {
-               ipath_dbg("%u pio sends with no bufavail; dmacopy: "
-                       "%llx %llx %llx %llx; shadow:  %lx %lx %lx %lx\n",
+               ipath_force_pio_avail_update(dd); /* at start */
+               ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
+                       "%llx %llx %llx %llx\n"
+                       "ipath  shadow:  %lx %lx %lx %lx\n",
                        dd->ipath_consec_nopiobuf,
+                       (unsigned long)get_cycles(),
                        (unsigned long long) le64_to_cpu(dma[0]),
                        (unsigned long long) le64_to_cpu(dma[1]),
                        (unsigned long long) le64_to_cpu(dma[2]),
                 */
                if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
                    (sizeof(shadow[0]) * 4 * 4))
-                       ipath_dbg("2nd group: dmacopy: %llx %llx "
-                                 "%llx %llx; shadow: %lx %lx %lx %lx\n",
+                       ipath_dbg("2nd group: dmacopy: "
+                                 "%llx %llx %llx %llx\n"
+                                 "ipath  shadow:  %lx %lx %lx %lx\n",
                                  (unsigned long long)le64_to_cpu(dma[4]),
                                  (unsigned long long)le64_to_cpu(dma[5]),
                                  (unsigned long long)le64_to_cpu(dma[6]),
                                  (unsigned long long)le64_to_cpu(dma[7]),
-                                 shadow[4], shadow[5], shadow[6],
-                                 shadow[7]);
+                                 shadow[4], shadow[5], shadow[6], shadow[7]);
+
+               /* at end, so the forced update above has likely completed */
+               ipath_reset_availshadow(dd);
        }
 }
 
                              unsigned len, int avail)
 {
        unsigned long flags;
-       unsigned end;
+       unsigned end, cnt = 0, next;
 
        /* There are two bits per send buffer (busy and generation) */
        start *= 2;
-       len *= 2;
-       end = start + len;
+       end = start + len * 2;
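+       /* e.g. start = 10, len = 4 covers bit positions 20..27 */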
 
-       /* Set or clear the generation bits. */
        spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       /* Set or clear the busy and generation bits in the shadow. */
        while (start < end) {
                if (avail) {
-                       __clear_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
-                               dd->ipath_pioavailshadow);
+                       unsigned long dma;
+                       int i, im;
+                       /*
+                        * The BUSY bit will never be set, because we disarm
+                        * the user buffers before we hand them back to the
+                        * kernel.  We do have to make sure the generation
+                        * bit is set correctly in the shadow, since it could
+                        * have changed many times while allocated to user.
+                        * We can't use the bitmap functions on the full
+                        * dma array because it is always little-endian, so
+                        * we have to flip to host order first.
+                        * BITS_PER_LONG is slightly wrong here, since it's
+                        * always 64 bits per register in the chip; that's
+                        * OK because we only support 64 bit kernels.
+                        */
+                       /* deal with 6110 chip bug on high register #s */
+                       i = start / BITS_PER_LONG;
+                       im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+                               i ^ 1 : i;
+                       __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
+                               + start, dd->ipath_pioavailshadow);
+                       dma = (unsigned long) le64_to_cpu(
+                               dd->ipath_pioavailregs_dma[im]);
+                       if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+                               + start) % BITS_PER_LONG, &dma))
+                               __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+                                       + start, dd->ipath_pioavailshadow);
+                       else
+                               __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+                                       + start, dd->ipath_pioavailshadow);
                        __set_bit(start, dd->ipath_pioavailkernel);
                } else {
                        __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
                }
                start += 2;
        }
+
+       if (dd->ipath_pioupd_thresh) {
+               end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+               next = find_first_bit(dd->ipath_pioavailkernel, end);
+               while (next < end) {
+                       cnt++;
+                       next = find_next_bit(dd->ipath_pioavailkernel, end,
+                                       next + 1);
+               }
+       }
        spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+       /*
+        * When moving buffers from kernel to user, if the number assigned
+        * to the user is less than the pio update threshold, and the
+        * threshold is supported (cnt was computed > 0), drop the update
+        * threshold so we update at least once per allocated number of
+        * buffers.  In any case, if there are fewer kernel buffers than
+        * the threshold, drop the threshold.  We don't bother increasing
+        * it, having once decreased it, since it would typically just
+        * cycle back and forth.  If we don't decrease it below the number
+        * of buffers in use, we can wait a long time for an update, until
+        * some other context uses PIO buffers.
+        */
+       if (!avail && len < cnt)
+               cnt = len;
+       if (cnt < dd->ipath_pioupd_thresh) {
+               dd->ipath_pioupd_thresh = cnt;
+               ipath_dbg("Decreased pio update threshold to %u\n",
+                       dd->ipath_pioupd_thresh);
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
+                       << INFINIPATH_S_UPDTHRESH_SHIFT);
+               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+                       << INFINIPATH_S_UPDTHRESH_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                       dd->ipath_sendctrl);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+       }
 }
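
The find_first_bit()/find_next_bit() loop above is a straight population
count of the kernel-owned map; under the same lock it could equivalently
be written with the kernel bitmap API (a sketch, not part of this patch):

	/* count buffers currently owned by the kernel; pioavailkernel
	 * carries one set bit per kernel-owned buffer */
	if (dd->ipath_pioupd_thresh)
		cnt = bitmap_weight(dd->ipath_pioavailkernel,
			2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k));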
 
 /**
 
                (void *) dd->ipath_statusp -
                (void *) dd->ipath_pioavailregs_dma;
        if (!shared) {
-               kinfo->spi_piocnt = dd->ipath_pbufsport;
+               kinfo->spi_piocnt = pd->port_piocnt;
                kinfo->spi_piobufbase = (u64) pd->port_piobufs;
                kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
                        dd->ipath_ureg_align * pd->port_port;
        } else if (master) {
-               kinfo->spi_piocnt = (dd->ipath_pbufsport / subport_cnt) +
-                                   (dd->ipath_pbufsport % subport_cnt);
+               kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
+                                   (pd->port_piocnt % subport_cnt);
                /* Master's PIO buffers are after all the slave's */
                kinfo->spi_piobufbase = (u64) pd->port_piobufs +
                        dd->ipath_palign *
-                       (dd->ipath_pbufsport - kinfo->spi_piocnt);
+                       (pd->port_piocnt - kinfo->spi_piocnt);
        } else {
                unsigned slave = subport_fp(fp) - 1;
 
-               kinfo->spi_piocnt = dd->ipath_pbufsport / subport_cnt;
+               kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
                kinfo->spi_piobufbase = (u64) pd->port_piobufs +
                        dd->ipath_palign * kinfo->spi_piocnt * slave;
        }
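+       /* e.g. port_piocnt = 40 shared by subport_cnt = 3: each slave
+        * gets 13 buffers, the master gets 14 (13 plus the remainder
+        * of 1), placed after the slaves' buffers */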
 
-       /*
-        * Set the PIO avail update threshold to no larger
-        * than the number of buffers per process. Note that
-        * we decrease it here, but won't ever increase it.
-        */
-       if (dd->ipath_pioupd_thresh &&
-           kinfo->spi_piocnt < dd->ipath_pioupd_thresh) {
-               unsigned long flags;
-
-               dd->ipath_pioupd_thresh = kinfo->spi_piocnt;
-               ipath_dbg("Decreased pio update threshold to %u\n",
-                       dd->ipath_pioupd_thresh);
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
-                       << INFINIPATH_S_UPDTHRESH_SHIFT);
-               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-                       << INFINIPATH_S_UPDTHRESH_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-
        if (shared) {
                kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
                        dd->ipath_ureg_align * pd->port_port;
        ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
        if (!pd->port_subport_cnt) {
                /* port is not shared */
-               piocnt = dd->ipath_pbufsport;
+               piocnt = pd->port_piocnt;
                piobufs = pd->port_piobufs;
        } else if (!subport_fp(fp)) {
                /* caller is the master */
-               piocnt = (dd->ipath_pbufsport / pd->port_subport_cnt) +
-                        (dd->ipath_pbufsport % pd->port_subport_cnt);
+               piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
+                        (pd->port_piocnt % pd->port_subport_cnt);
                piobufs = pd->port_piobufs +
-                       dd->ipath_palign * (dd->ipath_pbufsport - piocnt);
+                       dd->ipath_palign * (pd->port_piocnt - piocnt);
        } else {
                unsigned slave = subport_fp(fp) - 1;
 
                /* caller is a slave */
-               piocnt = dd->ipath_pbufsport / pd->port_subport_cnt;
+               piocnt = pd->port_piocnt / pd->port_subport_cnt;
                piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
        }
 
                port_fp(fp) = pd;
                pd->port_pid = current->pid;
                strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
-               ipath_chg_pioavailkernel(dd,
-                       dd->ipath_pbufsport * (pd->port_port - 1),
-                       dd->ipath_pbufsport, 0);
                ipath_stats.sps_ports++;
                ret = 0;
        } else
 
        /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
 
+       /* some ports may get extra buffers; calculate that here */
+       if (pd->port_port <= dd->ipath_ports_extrabuf)
+               pd->port_piocnt = dd->ipath_pbufsport + 1;
+       else
+               pd->port_piocnt = dd->ipath_pbufsport;
+
        /* for right now, kernel piobufs are at end, so port 1 is at 0 */
+       if (pd->port_port <= dd->ipath_ports_extrabuf)
+               pd->port_pio_base = (dd->ipath_pbufsport + 1)
+                       * (pd->port_port - 1);
+       else
+               pd->port_pio_base = dd->ipath_ports_extrabuf +
+                       dd->ipath_pbufsport * (pd->port_port - 1);
        pd->port_piobufs = dd->ipath_piobufbase +
-               dd->ipath_pbufsport * (pd->port_port - 1) * dd->ipath_palign;
-       ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n",
-                  pd->port_port, pd->port_piobufs);
+               pd->port_pio_base * dd->ipath_palign;
+       ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
+               " first pio %u\n", pd->port_port, pd->port_piobufs,
+               pd->port_piocnt, pd->port_pio_base);
+       ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
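
The base/count rule above can be summarized by a small helper (illustration
only; port_pio_base_of() is hypothetical and not part of the driver):

	/* ports 1..extrabuf get pbufsport + 1 buffers each; later ports
	 * get pbufsport, offset past the enlarged leading ports */
	static u32 port_pio_base_of(u32 port, u32 pbufsport, u32 extrabuf)
	{
		return port <= extrabuf ?
			(pbufsport + 1) * (port - 1) :
			extrabuf + pbufsport * (port - 1);
	}

With pbufsport = 39 and extrabuf = 1, port 1 starts at buffer 0 with 40
buffers, and port 2 starts at buffer 40 with 39.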
 
        /*
         * Now allocate the rcvhdr Q and eager TIDs; skip the TID
        }
 
        if (dd->ipath_kregbase) {
-               int i;
                /* atomically clear receive enable port and intr avail. */
                clear_bit(dd->ipath_r_portenable_shift + port,
                          &dd->ipath_rcvctrl);
                ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
                        pd->port_port, dd->ipath_dummy_hdrq_phys);
 
-               i = dd->ipath_pbufsport * (port - 1);
-               ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport);
-               ipath_chg_pioavailkernel(dd, i, dd->ipath_pbufsport, 1);
+               ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
+               ipath_chg_pioavailkernel(dd, pd->port_pio_base,
+                       pd->port_piocnt, 1);
 
                dd->ipath_f_clear_tids(dd, pd->port_port);
 
 
 
        dev_info(&dd->pcidev->dev,
                "Recovering from TXE PIO parity error\n");
-       ipath_disarm_senderrbufs(dd, 1);
+       ipath_disarm_senderrbufs(dd);
 }
 
 
        ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
        if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
                /*
-                * Parity errors in send memory are recoverable,
-                * just cancel the send (if indicated in * sendbuffererror),
-                * count the occurrence, unfreeze (if no other handled
-                * hardware error bits are set), and continue.
+                * Parity errors in send memory are recoverable by h/w;
+                * just do housekeeping, exit freeze mode and continue.
                 */
                if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
                               INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
                        hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
                                     INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
                                    << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
-                       if (!hwerrs) {
-                               /* else leave in freeze mode */
-                               ipath_write_kreg(dd,
-                                                dd->ipath_kregs->kr_control,
-                                                dd->ipath_control);
-                               goto bail;
-                       }
                }
                if (hwerrs) {
                        /*
                        *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
                        dd->ipath_flags &= ~IPATH_INITTED;
                } else {
-                       ipath_dbg("Clearing freezemode on ignored hardware "
-                                 "error\n");
+                       ipath_dbg("Clearing freezemode on ignored or "
+                               "recovered hardware error\n");
                        ipath_clear_freeze(dd);
                }
        }
                         dd->ipath_rcvctrl);
        dd->ipath_p0_rcvegrcnt = 2048; /* always */
        if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               dd->ipath_pioreserved = 1; /* reserve a buffer */
+               dd->ipath_pioreserved = 3; /* kpiobufs used for PIO */
 }
 
 
 
 /*
  * min buffers we want to have per port, after driver
  */
-#define IPATH_MIN_USER_PORT_BUFCNT 8
+#define IPATH_MIN_USER_PORT_BUFCNT 7
 
 /*
  * Number of ports we are configured to use (to allow for more pio
 
 /*
  * Number of buffers reserved for driver (verbs and layered drivers).
- * Reserved at end of buffer list.   Initialized based on
- * number of PIO buffers if not set via module interface.
+ * Initialized based on number of PIO buffers if not set via module interface.
  * The problem with this is that it's global, but we'll use different
- * numbers for different chip types.  So the default value is not
- * very useful.  I've redefined it for the 1.3 release so that it's
- * zero unless set by the user to something else, in which case we
- * try to respect it.
+ * numbers for different chip types.
  */
 static ushort ipath_kpiobufs;
 
                        pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
                else
                        pioavail = dd->ipath_pioavailregs_dma[i];
-               dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail) |
-                       (~dd->ipath_pioavailkernel[i] <<
-                       INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT);
+               /*
+                * don't need to worry about ipath_pioavailkernel here
+                * because we will call ipath_chg_pioavailkernel() later
+                * in initialization, to busy out buffers as needed
+                */
+               dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
        }
        /* can get counters, stats, etc. */
        dd->ipath_flags |= IPATH_PRESENT;
 int ipath_init_chip(struct ipath_devdata *dd, int reinit)
 {
        int ret = 0;
-       u32 val32, kpiobufs;
+       u32 kpiobufs, defkbufs;
        u32 piobufs, uports;
        u64 val;
        struct ipath_portdata *pd;
        gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-       unsigned long flags;
 
        ret = init_housekeeping(dd, reinit);
        if (ret)
        dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
                / (sizeof(u64) * BITS_PER_BYTE / 2);
        uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
-       if (ipath_kpiobufs == 0) {
-               /* not set by user (this is default) */
-               if (piobufs > 144)
-                       kpiobufs = 32;
-               else
-                       kpiobufs = 16;
-       }
+       if (piobufs > 144)
+               defkbufs = 32 + dd->ipath_pioreserved;
        else
-               kpiobufs = ipath_kpiobufs;
+               defkbufs = 16 + dd->ipath_pioreserved;
 
-       if (kpiobufs + (uports * IPATH_MIN_USER_PORT_BUFCNT) > piobufs) {
+       if (ipath_kpiobufs && (ipath_kpiobufs +
+               (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
                int i = (int) piobufs -
                        (int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
                if (i < 1)
                        i = 1;
                dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
                         "%d for kernel leaves too few for %d user ports "
-                        "(%d each); using %u\n", kpiobufs,
+                        "(%d each); using %u\n", ipath_kpiobufs,
                         piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
                /*
                 * shouldn't change ipath_kpiobufs, because could be
                 * different for different devices...
                 */
                kpiobufs = i;
-       }
+       } else if (ipath_kpiobufs)
+               kpiobufs = ipath_kpiobufs;
+       else
+               kpiobufs = defkbufs;
        dd->ipath_lastport_piobuf = piobufs - kpiobufs;
        dd->ipath_pbufsport =
                uports ? dd->ipath_lastport_piobuf / uports : 0;
-       val32 = dd->ipath_lastport_piobuf - (dd->ipath_pbufsport * uports);
-       if (val32 > 0) {
-               ipath_dbg("allocating %u pbufs/port leaves %u unused, "
-                         "add to kernel\n", dd->ipath_pbufsport, val32);
-               dd->ipath_lastport_piobuf -= val32;
-               kpiobufs += val32;
-               ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n",
-                         dd->ipath_pbufsport, val32);
-       }
+       /* if not an even divisor, some user ports get extra buffers */
+       dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
+               (dd->ipath_pbufsport * uports);
+       if (dd->ipath_ports_extrabuf)
+               ipath_dbg("%u pbufs/port leaves some unused; adding 1 "
+                       "buffer to ports <= %u\n", dd->ipath_pbufsport,
+                       dd->ipath_ports_extrabuf);
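+       /*
+        * Example with hypothetical sizes: piobufs = 192, uports = 4 and
+        * ipath_pioreserved = 3 give defkbufs = 35, lastport_piobuf = 157,
+        * pbufsport = 39 and ports_extrabuf = 1, so port 1 gets 40 buffers.
+        */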
        dd->ipath_lastpioindex = 0;
        dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
-       ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+       /* ipath_pioavailshadow initialized earlier */
        ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
                   "each for %u user ports\n", kpiobufs,
                   piobufs, dd->ipath_pbufsport, uports);
-       if (dd->ipath_pioupd_thresh) {
-               if (dd->ipath_pbufsport < dd->ipath_pioupd_thresh)
-                       dd->ipath_pioupd_thresh = dd->ipath_pbufsport;
-               if (kpiobufs < dd->ipath_pioupd_thresh)
-                       dd->ipath_pioupd_thresh = kpiobufs;
-       }
-
        ret = dd->ipath_f_early_init(dd);
        if (ret) {
                ipath_dev_err(dd, "Early initialization failure\n");
                goto done;
        }
 
-       /*
-        * Cancel any possible active sends from early driver load.
-        * Follows early_init because some chips have to initialize
-        * PIO buffers in early_init to avoid false parity errors.
-        */
-       ipath_cancel_sends(dd, 0);
-
        /*
         * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
         * done after early_init.
 
        ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
                         dd->ipath_pioavailregs_phys);
+
        /*
         * this is to detect s/w errors, which the h/w works around by
         * ignoring the low 6 bits of address, if it wasn't aligned.
                         ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
        ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
 
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
        /*
         * before error clears, since we expect serdes pll errors during
         * this, the first time after reset
        else
                enable_chip(dd, reinit);
 
+       /* after enable_chip, so pioavailshadow is set up */
+       ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+
+       /*
+        * Cancel any possible active sends from early driver load.
+        * Follows early_init because some chips have to initialize
+        * PIO buffers in early_init to avoid false parity errors.
+        * After enable and ipath_chg_pioavailkernel so we can safely
+        * enable pioavail updates and PIOENABLE; packets are now
+        * ready to go out.
+        */
+       ipath_cancel_sends(dd, 1);
+
        if (!reinit) {
                /*
                 * Used when we close a port, for DMA already in flight
 
 #include "ipath_verbs.h"
 #include "ipath_common.h"
 
-/*
- * clear (write) a pio buffer, to clear a parity error.   This routine
- * should only be called when in freeze mode, and the buffer should be
- * canceled afterwards.
- */
-static void ipath_clrpiobuf(struct ipath_devdata *dd, u32 pnum)
-{
-       u32 __iomem *pbuf;
-       u32 dwcnt; /* dword count to write */
-       if (pnum < dd->ipath_piobcnt2k) {
-               pbuf = (u32 __iomem *) (dd->ipath_pio2kbase + pnum *
-                       dd->ipath_palign);
-               dwcnt = dd->ipath_piosize2k >> 2;
-       }
-       else {
-               pbuf = (u32 __iomem *) (dd->ipath_pio4kbase +
-                       (pnum - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
-               dwcnt = dd->ipath_piosize4k >> 2;
-       }
-       dev_info(&dd->pcidev->dev,
-               "Rewrite PIO buffer %u, to recover from parity error\n",
-               pnum);
-
-       /* no flush required, since already in freeze */
-       writel(dwcnt + 1, pbuf);
-       while (--dwcnt)
-               writel(0, pbuf++);
-}
 
 /*
  * Called when we might have an error that is specific to a particular
  * PIO buffer, and may need to cancel that buffer, so it can be re-used.
- * If rewrite is true, and bits are set in the sendbufferror registers,
- * we'll write to the buffer, for error recovery on parity errors.
  */
-void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite)
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
 {
        u32 piobcnt;
        unsigned long sbuf[4];
                }
 
                for (i = 0; i < piobcnt; i++)
-                       if (test_bit(i, sbuf)) {
-                               if (rewrite)
-                                       ipath_clrpiobuf(dd, i);
+                       if (test_bit(i, sbuf))
                                ipath_disarm_piobufs(dd, i, 1);
-                       }
                /* ignore armlaunch errs for a bit */
                dd->ipath_lastcancel = jiffies+3;
        }
 {
        u64 ignore_this_time = 0;
 
-       ipath_disarm_senderrbufs(dd, 0);
+       ipath_disarm_senderrbufs(dd);
        if ((errs & E_SUM_LINK_PKTERRS) &&
            !(dd->ipath_flags & IPATH_LINKACTIVE)) {
                /*
  * processes (causing armlaunch), send errors due to going into freeze mode,
  * etc., and try to avoid causing extra interrupts while doing so.
  * Forcibly update the in-memory pioavail register copies after cleanup
- * because the chip won't do it for anything changing while in freeze mode
- * (we don't want to wait for the next pio buffer state change).
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
  * Make sure that we don't lose any important interrupts by using the chip
  * feature that says that writing 0 to a bit in *clear that is set in
  * *status will cause an interrupt to be generated again (if allowed by
  */
 void ipath_clear_freeze(struct ipath_devdata *dd)
 {
-       int i, im;
-       u64 val;
-
        /* disable error interrupts, to avoid confusion */
        ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
 
        /* also disable interrupts; errormask is sometimes overwritten */
        ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
 
-       /*
-        * clear all sends, because they have may been
-        * completed by usercode while in freeze mode, and
-        * therefore would not be sent, and eventually
-        * might cause the process to run out of bufs
-        */
        ipath_cancel_sends(dd, 1);
+
+       /* clear the freeze, and make sure the chip saw it */
        ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
                         dd->ipath_control);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
 
-       /*
-        * ensure pio avail updates continue (because the update
-        * won't have happened from cancel_sends because we were
-        * still in freeze
-        */
+       /* force in-memory update now that we are out of freeze */
        ipath_force_pio_avail_update(dd);
 
-       /*
-        * We just enabled pioavailupdate, so dma copy is almost certainly
-        * not yet right, so read the registers directly.  Similar to init
-        */
-       for (i = 0; i < dd->ipath_pioavregs; i++) {
-               /* deal with 6110 chip bug */
-               im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-                       i ^ 1 : i;
-               val = ipath_read_kreg64(dd, (0x1000 / sizeof(u64)) + im);
-               dd->ipath_pioavailregs_dma[i] = cpu_to_le64(val);
-               dd->ipath_pioavailshadow[i] = val |
-                       (~dd->ipath_pioavailkernel[i] <<
-                       INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT);
-       }
-
        /*
         * force new interrupt if any hwerr, error or interrupt bits are
         * still set, and clear "safe" send packet errors related to freeze
                ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
                spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 
-               if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-                       handle_layer_pioavail(dd);
-               else
-                       ipath_dbg("unexpected BUFAVAIL intr\n");
+               /* always process; SDMA verbs still use PIO for ACKs and VL15 */
+               handle_layer_pioavail(dd);
        }
 
        ret = IRQ_HANDLED;
 
        u16 port_subport_cnt;
        /* non-zero if port is being shared. */
        u16 port_subport_id;
+       /* number of pio bufs for this port (all procs, if shared) */
+       u32 port_piocnt;
+       /* first pio buffer for this port */
+       u32 port_pio_base;
        /* chip offset of PIO buffers for this port */
        u32 port_piobufs;
        /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
        u32 ipath_lastrpkts;
        /* pio bufs allocated per port */
        u32 ipath_pbufsport;
+       /* if remainder on bufs/port, ports <= extrabuf get 1 extra */
+       u32 ipath_ports_extrabuf;
        u32 ipath_pioupd_thresh; /* update threshold, some chips */
        /*
         * number of ports configured as max; zero is set to number chip
 int ipath_update_eeprom_log(struct ipath_devdata *dd);
 void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
 u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
-void ipath_disarm_senderrbufs(struct ipath_devdata *, int);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
 void ipath_force_pio_avail_update(struct ipath_devdata *);
 void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
 
 
                wake_up(&qp->wait);
 }
 
-static void want_buffer(struct ipath_devdata *dd)
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
 {
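+       /*
+        * SMI packets (QP0, VL15) are always sent by PIO even when SDMA
+        * is available (see the interrupt handler), so they too need the
+        * piobufavail interrupt.
+        */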
-       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) {
+       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+               qp->ibqp.qp_type == IB_QPT_SMI) {
                unsigned long flags;
 
                spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
        spin_lock_irqsave(&dev->pending_lock, flags);
        list_add_tail(&qp->piowait, &dev->piowait);
        spin_unlock_irqrestore(&dev->pending_lock, flags);
-       want_buffer(dev->dd);
+       want_buffer(dev->dd, qp);
        dev->n_piowait++;
 }
 
 
        ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
                         dd->ipath_sdma_head_phys);
 
-       /* Reserve all the former "kernel" piobufs */
-       n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - dd->ipath_pioreserved;
-       for (i = dd->ipath_lastport_piobuf; i < n; ++i) {
+       /*
+        * Reserve all the former "kernel" piobufs, using the high number
+        * range so we get as many 4K buffers as possible
+        */
+       n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
+       ipath_chg_pioavailkernel(dd, i, n - i, 0);
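+       /*
+        * Illustration with the hypothetical init-time sizes above:
+        * lastport_piobuf = 157 and pioreserved = 3 make i = 160, so
+        * buffers 160..n-1 are busied out of the PIO map and handed to
+        * the SDMA engine via the senddmabufmask registers below.
+        */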
+       for (; i < n; ++i) {
                unsigned word = i / 64;
                unsigned bit = i & 63;
                BUG_ON(word >= 3);
                senddmabufmask[word] |= 1ULL << bit;
        }
-       ipath_chg_pioavailkernel(dd, dd->ipath_lastport_piobuf,
-               n - dd->ipath_lastport_piobuf, 0);
        ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
                         senddmabufmask[0]);
        ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,