JFS: Add lockdep annotations

Yeah, it's about time.

JFS nests locks that share a single lockdep class: the per-inode rdwrlock
is taken on normal inodes, the inode map (imap) inode and the block map
(dmap) inode, and commit_mutex is taken on both a parent and a child
inode (up to four inodes during rename).  Annotate the intended nesting
by switching to down_read_nested(), down_write_nested() and
mutex_lock_nested(), with enums naming the subclasses.

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
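
For background: lockdep validates ordering per lock class, so taking two
locks of the same class looks like a recursive deadlock unless each
acquisition is tagged with a distinct subclass via the _nested variants
(which compile down to the plain lock calls when CONFIG_DEBUG_LOCK_ALLOC
is off).  A minimal sketch of the pattern, with hypothetical names
(example_class, lock_parent_child) that are not part of this patch:

#include <linux/mutex.h>

/* Hypothetical subclasses, analogous to enum commit_mutex_class below. */
enum example_class {
	EXAMPLE_PARENT,		/* subclass 0 */
	EXAMPLE_CHILD,		/* subclass 1 */
};

static void lock_parent_child(struct mutex *parent, struct mutex *child)
{
	/*
	 * Both mutexes belong to the same lock class.  A plain
	 * mutex_lock() on the second would make lockdep report a
	 * possible recursive deadlock; distinct subclasses mark the
	 * nesting as intentional.
	 */
	mutex_lock_nested(parent, EXAMPLE_PARENT);
	mutex_lock_nested(child, EXAMPLE_CHILD);

	/* ... operate on both objects ... */

	mutex_unlock(child);
	mutex_unlock(parent);
}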
         * Take appropriate lock on inode
         */
        if (create)
-               IWRITE_LOCK(ip);
+               IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
        else
-               IREAD_LOCK(ip);
+               IREAD_LOCK(ip, RDWRLOCK_NORMAL);
 
        if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
            (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
 
        nobh_truncate_page(ip->i_mapping, ip->i_size);
 
-       IWRITE_LOCK(ip);
+       IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
        jfs_truncate_nolock(ip, ip->i_size);
        IWRITE_UNLOCK(ip);
 }
 
        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
        struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 
-       IREAD_LOCK(ipbmap);
+       IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
        /* block to be freed better be within the mapsize. */
        if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
         * allocation group size, try to allocate anywhere.
         */
        if (l2nb > bmp->db_agl2size) {
-               IWRITE_LOCK(ipbmap);
+               IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 
                rc = dbAllocAny(bmp, nblocks, l2nb, results);
 
         * the hint using a tiered strategy.
         */
        if (nblocks <= BPERDMAP) {
-               IREAD_LOCK(ipbmap);
+               IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
                /* get the buffer for the dmap containing the hint.
                 */
        /* try to satisfy the allocation request with blocks within
         * the same allocation group as the hint.
         */
-       IWRITE_LOCK(ipbmap);
+       IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
        if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
                goto write_unlock;
 
         * Let dbNextAG recommend a preferred allocation group
         */
        agno = dbNextAG(ipbmap);
-       IWRITE_LOCK(ipbmap);
+       IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 
        /* Try to allocate within this allocation group.  if that fails, try to
         * allocate anywhere in the map.
        s64 lblkno;
        struct metapage *mp;
 
-       IREAD_LOCK(ipbmap);
+       IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
        /*
         * validate extent request:
         */
        extblkno = lastblkno + 1;
 
-       IREAD_LOCK(ipbmap);
+       IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
        /* better be within the file system */
        bmp = sbi->bmap;
        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
        struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 
-       IREAD_LOCK(ipbmap);
+       IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
        /* block to be allocated better be within the mapsize. */
        ASSERT(nblocks <= bmp->db_mapsize - blkno);
 
 
        /* read the iag */
        imap = JFS_IP(ipimap)->i_imap;
-       IREAD_LOCK(ipimap);
+       IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        rc = diIAGRead(imap, iagno, &mp);
        IREAD_UNLOCK(ipimap);
        if (rc) {
        /* Obtain read lock in imap inode.  Don't release it until we have
         * read all of the IAG's that we are going to.
         */
-       IREAD_LOCK(ipimap);
+       IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
        /* read the iag.
         */
        AG_LOCK(imap, agno);
 
        /* Get read lock on imap inode */
-       IREAD_LOCK(ipimap);
+       IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
        /* get the iag number and read the iag */
        iagno = INOTOIAG(inum);
                return -ENOSPC;
 
        /* obtain read lock on imap inode */
-       IREAD_LOCK(imap->im_ipimap);
+       IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
 
        /* read the iag at the head of the list.
         */
        } else {
                /* read the iag.
                 */
-               IREAD_LOCK(imap->im_ipimap);
+               IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
                if ((rc = diIAGRead(imap, iagno, &mp))) {
                        IREAD_UNLOCK(imap->im_ipimap);
                        jfs_error(ip->i_sb, "diAllocExt: error reading iag");
                 */
 
                /* acquire inode map lock */
-               IWRITE_LOCK(ipimap);
+               IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
 
                if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
                        IWRITE_UNLOCK(ipimap);
        }
 
        /* obtain read lock on map */
-       IREAD_LOCK(ipimap);
+       IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
        /* read the iag */
        if ((rc = diIAGRead(imap, iagno, &mp))) {
                return -EIO;
        }
        /* read the iag */
-       IREAD_LOCK(ipimap);
+       IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        rc = diIAGRead(imap, iagno, &mp);
        IREAD_UNLOCK(ipimap);
        if (rc)
 
 
 #define JFS_ACL_NOT_CACHED ((void *)-1)
 
-#define IREAD_LOCK(ip)         down_read(&JFS_IP(ip)->rdwrlock)
+#define IREAD_LOCK(ip, subclass) \
+       down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip)       up_read(&JFS_IP(ip)->rdwrlock)
-#define IWRITE_LOCK(ip)                down_write(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip, subclass) \
+       down_write_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IWRITE_UNLOCK(ip)      up_write(&JFS_IP(ip)->rdwrlock)
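
The macro change is visible at every call site: IREAD_LOCK() and
IWRITE_LOCK() now take a subclass argument, which is why the rest of the
patch touches each caller.  For example,

	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);

now expands to

	down_read_nested(&JFS_IP(ipbmap)->rdwrlock, RDWRLOCK_DMAP);

The unlock macros are unchanged, since lockdep needs the subclass only
at acquisition time.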
 
 /*
        COMMIT_Synclist,        /* metadata pages on group commit synclist */
 };
 
+/*
+ * commit_mutex nesting subclasses:
+ */
+enum commit_mutex_class
+{
+       COMMIT_MUTEX_PARENT,
+       COMMIT_MUTEX_CHILD,
+       COMMIT_MUTEX_SECOND_PARENT,     /* Renaming */
+       COMMIT_MUTEX_VICTIM             /* Inode being unlinked due to rename */
+};
+
+/*
+ * rdwrlock subclasses:
+ * The dmap inode may be locked while a normal inode or the imap inode is
+ * locked.
+ */
+enum rdwrlock_class
+{
+       RDWRLOCK_NORMAL,
+       RDWRLOCK_IMAP,
+       RDWRLOCK_DMAP
+};
+
 #define set_cflag(flag, ip)    set_bit(flag, &(JFS_IP(ip)->cflag))
 #define clear_cflag(flag, ip)  clear_bit(flag, &(JFS_IP(ip)->cflag))
 #define test_cflag(flag, ip)   test_bit(flag, &(JFS_IP(ip)->cflag))
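
Putting the rdwrlock subclasses to work: per the comment above, the dmap
inode's rdwrlock may be taken while holding either a normal inode's or
the imap inode's rdwrlock.  A sketch of the nestings these annotations
describe, where ip, ipimap and ipbmap stand for a regular inode, the
inode map inode and the block map inode (assumed shapes, drawn from the
hunks in this patch):

	/* A normal inode's lock held while locking the dmap inode,
	 * e.g. a write that must allocate blocks:
	 */
	IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
	IREAD_UNLOCK(ipbmap);
	IWRITE_UNLOCK(ip);

	/* The imap inode's lock held while locking the dmap inode,
	 * e.g. growing the inode map:
	 */
	IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
	IREAD_UNLOCK(ipbmap);
	IWRITE_UNLOCK(ipimap);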
 
 
        tid = txBegin(dip->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dip)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        rc = jfs_init_acl(tid, ip, dip);
        if (rc)
 
        tid = txBegin(dip->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dip)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        rc = jfs_init_acl(tid, ip, dip);
        if (rc)
 
        tid = txBegin(dip->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dip)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        iplist[0] = dip;
        iplist[1] = ip;
        if ((rc = get_UCSname(&dname, dentry)))
                goto out;
 
-       IWRITE_LOCK(ip);
+       IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
 
        tid = txBegin(dip->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dip)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        iplist[0] = dip;
        iplist[1] = ip;
 
        tid = txBegin(ip->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dir)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        /*
         * scan parent directory for entry/freespace
 
        tid = txBegin(dip->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dip)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        rc = jfs_init_security(tid, ip, dip);
        if (rc)
                        goto out3;
                }
        } else if (new_ip) {
-               IWRITE_LOCK(new_ip);
+               IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
                /* Init inode for quota operations. */
                DQUOT_INIT(new_ip);
        }
         */
        tid = txBegin(new_dir->i_sb, 0);
 
-       mutex_lock(&JFS_IP(new_dir)->commit_mutex);
-       mutex_lock(&JFS_IP(old_ip)->commit_mutex);
+       /*
+        * How do we know the locking is safe from deadlocks?
+        * The vfs does the hard part for us.  Any time we are taking nested
+        * commit_mutexes, the vfs already has i_mutex held on the parent.
+        * Here, the vfs has already taken i_mutex on both old_dir and new_dir.
+        */
+       mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        if (old_dir != new_dir)
-               mutex_lock(&JFS_IP(old_dir)->commit_mutex);
+               mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex,
+                                 COMMIT_MUTEX_SECOND_PARENT);
 
        if (new_ip) {
-               mutex_lock(&JFS_IP(new_ip)->commit_mutex);
+               mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex,
+                                 COMMIT_MUTEX_VICTIM);
                /*
                 * Change existing directory entry to new inode number
                 */
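
The rename path takes up to four commit_mutexes, hence the extra
SECOND_PARENT and VICTIM subclasses.  A sketch of the scenario the
subclasses disambiguate (hypothetical tasks, not JFS code):

	/*
	 * Two cross-directory renames in opposite directions would, in
	 * isolation, acquire same-class commit_mutexes in opposite order:
	 *
	 *   Task A: rename(dirX/a, dirY/b)   Task B: rename(dirY/c, dirX/d)
	 *     lock dirY (PARENT)               lock dirX (PARENT)
	 *     lock dirX (SECOND_PARENT)        lock dirY (SECOND_PARENT)
	 *
	 * Each task's order is PARENT then SECOND_PARENT, so lockdep sees
	 * consistent subclass ordering rather than an ABBA report.  Real
	 * deadlock is prevented because, as the comment above notes, the
	 * VFS already holds i_mutex on both directories before ->rename()
	 * runs, so the two tasks cannot be in here concurrently.
	 */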
 
        tid = txBegin(dir->i_sb, 0);
 
-       mutex_lock(&JFS_IP(dir)->commit_mutex);
-       mutex_lock(&JFS_IP(ip)->commit_mutex);
+       mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+       mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
        rc = jfs_init_acl(tid, ip, dir);
        if (rc)