{
        struct inode            *inode = VFS_I(ip);
 
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and is still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;
 
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               return ENOENT;
-
        /* If we can't grab the inode, it must be on its way to reclaim. */
        if (!igrab(inode))
                return ENOENT;
 
        /* inode is valid */
        return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
 }
 
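The grab path above is the template this patch applies to every cache
lookup: find the inode under rcu_read_lock(), then revalidate it under
ip->i_flags_lock before using it, since RCU only guarantees the memory
has not been freed, not that the inode is still the one the radix tree
lookup found. A minimal sketch of the pattern, using a hypothetical
helper name and only the fields this patch relies on:

	/*
	 * Hypothetical helper illustrating the RCU lookup/validate
	 * pattern: the lookup runs under rcu_read_lock(), so the inode
	 * we find may have been freed (i_ino zeroed by xfs_inode_free())
	 * or reallocated. Only the i_flags_lock makes the check stable.
	 */
	STATIC struct xfs_inode *
	xfs_ici_lookup_grab(
		struct xfs_mount	*mp,
		struct xfs_perag	*pag,
		xfs_ino_t		ino)
	{
		struct xfs_inode	*ip;

		rcu_read_lock();
		ip = radix_tree_lookup(&pag->pag_ici_root,
					XFS_INO_TO_AGINO(mp, ino));
		if (!ip)
			goto out_null;

		spin_lock(&ip->i_flags_lock);
		if (ip->i_ino != ino) {
			/* freed or reallocated since the lookup */
			spin_unlock(&ip->i_flags_lock);
			goto out_null;
		}
		spin_unlock(&ip->i_flags_lock);

		/* a failed igrab() means the VFS is tearing it down */
		if (!igrab(VFS_I(ip)))
			goto out_null;
		rcu_read_unlock();
		return ip;

	out_null:
		rcu_read_unlock();
		return NULL;
	}
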
 STATIC int
                int             error = 0;
                int             i;
 
-               read_lock(&pag->pag_ici_lock);
+               rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
                        break;
                }
 
                                batch[i] = NULL;
 
                        /*
-                        * Update the index for the next lookup. Catch overflows
-                        * into the next AG range which can occur if we have inodes
-                        * in the last block of the AG and we are currently
-                        * pointing to the last inode.
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that led
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
                         */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = 1;
                }
 
                /* unlock now we've grabbed the inodes. */
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
        struct xfs_inode        *ip,
        int                     flags)
 {
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
 
        /*
-        * do some unlocked checks first to avoid unnecceary lock traffic.
+        * do some unlocked checks first to avoid unnecessary lock traffic.
         * The first is a flush lock check, the second is an already in reclaim
         * check. Only do these checks if we are not going to block on locks.
         */
         * The radix tree lock here protects a thread in xfs_iget from racing
         * with us starting reclaim on the inode.  Once we have the
         * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check that
+        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
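+        *
+        * For example (one possible interleaving): a walker can find the
+        * inode in the radix tree just before xfs_reclaim_inode() removes
+        * it and xfs_inode_free() sets i_flags to XFS_IRECLAIM and zeroes
+        * i_ino under the i_flags_lock. When the walker then takes the
+        * i_flags_lock it sees XFS_IRECLAIM without XFS_IRECLAIMABLE and
+        * skips the inode.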
         */
        spin_lock(&ip->i_flags_lock);
-       ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* ignore as it is already under reclaim */
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
                spin_unlock(&ip->i_flags_lock);
                return 1;
        }
                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                        int     i;
 
-                       write_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH,
                                        XFS_ICI_RECLAIM_TAG);
                        if (!nr_found) {
-                               write_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                break;
                        }
 
                                 * occur if we have inodes in the last block of
                                 * the AG and we are currently pointing to the
                                 * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that led us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
                                 */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                        done = 1;
                        }
 
                        /* unlock now we've grabbed the inodes. */
-                       write_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
 
                        for (i = 0; i < nr_found; i++) {
                                if (!batch[i])
 
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
+       ASSERT(ip->i_ino == 0);
 
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
        lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
        ip->i_size = 0;
        ip->i_new_size = 0;
 
-       /* prevent anyone from using this yet */
-       VFS_I(ip)->i_state = I_NEW;
-
        return ip;
 }
 
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
 
+       /*
+        * Because we use RCU freeing we need to ensure the inode always
+        * appears to be reclaimed with an invalid inode number when in the
+        * free state. The ip->i_flags_lock provides the barrier against lookup
+        * races.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
        call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
 }
 
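For context, the call_rcu() above overlays the rcu_head on the VFS
inode's i_dentry list head, which is no longer in use once the inode is
on its way back to the slab cache; the callback has to recover the inode
from the same overlay. A sketch of what the matching callback looks like
under that assumption (details of the real __xfs_inode_free() may
differ):

	STATIC void
	__xfs_inode_free(
		struct rcu_head		*head)
	{
		/* recover the inode from the rcu_head overlaid on i_dentry */
		struct inode		*inode = container_of((void *)head,
						      struct inode, i_dentry);

		ASSERT(atomic_read(&XFS_I(inode)->i_pincount) == 0);
		kmem_zone_free(xfs_inode_zone, XFS_I(inode));
	}
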
 xfs_iget_cache_hit(
        struct xfs_perag        *pag,
        struct xfs_inode        *ip,
+       xfs_ino_t               ino,
        int                     flags,
-       int                     lock_flags) __releases(pag->pag_ici_lock)
+       int                     lock_flags) __releases(RCU)
 {
        struct inode            *inode = VFS_I(ip);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;
 
+       /*
+        * check for re-use of an inode within an RCU grace period due to the
+        * radix tree nodes not being updated yet. We monitor for this by
+        * setting the inode number to zero before freeing the inode structure.
+        * If the inode has been reallocated and set up, then the inode number
+        * will not match, so check for that, too.
+        */
        spin_lock(&ip->i_flags_lock);
+       if (ip->i_ino != ino) {
+               trace_xfs_iget_skip(ip);
+               XFS_STATS_INC(xs_ig_frecycle);
+               error = EAGAIN;
+               goto out_error;
+       }
 
        /*
         * If we are racing with another cache hit that is currently
                ip->i_flags |= XFS_IRECLAIM;
 
                spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                error = -inode_init_always(mp->m_super, inode);
                if (error) {
                         * Re-initializing the inode failed, and we are in deep
                         * trouble.  Try to re-add it to the reclaim list.
                         */
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        spin_lock(&ip->i_flags_lock);
 
                        ip->i_flags &= ~XFS_INEW;
 
                /* We've got a live one. */
                spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                trace_xfs_iget_hit(ip);
        }
 
 
 out_error:
        spin_unlock(&ip->i_flags_lock);
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        return error;
 }
 
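The EAGAIN returned on an i_ino mismatch leans on xfs_iget()'s existing
retry path to back off until the RCU grace period expires and the radix
tree catches up with the reallocation. Roughly, the tail of xfs_iget()
(elided from this hunk; positive error returns, as used throughout this
file) does:

	out_error_or_again:
		if (error == EAGAIN) {
			delay(1);	/* back off, then retry the lookup */
			goto again;
		}
		xfs_perag_put(pag);
		return error;
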
        xfs_agino_t     agino;
 
        /* reject inode numbers outside existing AGs */
-       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+       if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return EINVAL;
 
        /* get the perag structure and ensure that it's inode capable */
 
 again:
        error = 0;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
        if (ip) {
-               error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+               error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
                if (error)
                        goto out_error_or_again;
        } else {
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                XFS_STATS_INC(xs_ig_missed);
 
                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 
                 */
                for (i = 0; i < ninodes; i++) {
 retry:
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
 
-                       /* Inode not in memory or stale, nothing to do */
-                       if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-                               read_unlock(&pag->pag_ici_lock);
+                       /* Inode not in memory, nothing to do */
+                       if (!ip) {
+                               rcu_read_unlock();
                                continue;
                        }
 
+                       /*
+                        * because this is an RCU protected lookup, we could
+                        * find a recently freed or even reallocated inode
+                        * during the lookup. We need to check under the
+                        * i_flags_lock for a valid inode here. Skip it if it
+                        * is not valid, the wrong inode or stale.
+                        */
+                       spin_lock(&ip->i_flags_lock);
+                       if (ip->i_ino != inum + i ||
+                           __xfs_iflags_test(ip, XFS_ISTALE)) {
+                               spin_unlock(&ip->i_flags_lock);
+                               rcu_read_unlock();
+                               continue;
+                       }
+                       spin_unlock(&ip->i_flags_lock);
+
                        /*
                         * Don't try to lock/unlock the current inode, but we
                         * _cannot_ skip the other inodes that we did not find
                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                               read_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                delay(1);
                                goto retry;
                        }
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
 
                        xfs_iflock(ip);
                        xfs_iflags_set(ip, XFS_ISTALE);
 
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
        /* really need a gang lookup range call here */
        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                        first_index, inodes_per_cluster);
                iq = ilist[i];
                if (iq == ip)
                        continue;
-               /* if the inode lies outside this cluster, we're done. */
-               if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-                       break;
+
+               /*
+                * because this is an RCU protected lookup, we could find a
+                * recently freed or even reallocated inode during the lookup.
+                * We need to check under the i_flags_lock for a valid inode
+                * here. Skip it if it is not valid or the wrong inode.
+                */
+               spin_lock(&iq->i_flags_lock);
+               if (!iq->i_ino ||
+                   (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                       spin_unlock(&iq->i_flags_lock);
+                       continue;
+               }
+               spin_unlock(&iq->i_flags_lock);
+
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
        }
 
 out_free:
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        kmem_free(ilist);
 out_put:
        xfs_perag_put(pag);
         * Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the