]> www.infradead.org Git - users/dwmw2/linux.git/commitdiff
ceph: choose readdir frag based on previous readdir reply
authorYan, Zheng <zyan@redhat.com>
Mon, 24 Apr 2017 03:56:50 +0000 (11:56 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 12 Jul 2017 13:01:02 +0000 (15:01 +0200)
commit b50c2de51e611da90cf3cf04c058f7e9bbe79e93 upstream.

The dirfragtree is lazily updated, it's not always accurate. Infinite
loops happens in following circumstance.

- client send request to read frag A
- frag A has been fragmented into frag B and C. So mds fills the reply
  with contents of frag B
- client wants to read next frag C. ceph_choose_frag(frag value of C)
  return frag A.

The fix is using previous readdir reply to calculate next readdir frag
when possible.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
fs/ceph/dir.c

index 1afa111910007f648022c44e30f4b6db8f1bcef1..aca0d884de73af6e463e78e5bf5d564df39bf1ac 100644 (file)
@@ -315,7 +315,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        struct ceph_mds_client *mdsc = fsc->mdsc;
        int i;
        int err;
-       u32 ftype;
+       unsigned frag = -1;
        struct ceph_mds_reply_info_parsed *rinfo;
 
        dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
@@ -362,7 +362,6 @@ more:
        /* do we have the correct frag content buffered? */
        if (need_send_readdir(fi, ctx->pos)) {
                struct ceph_mds_request *req;
-               unsigned frag;
                int op = ceph_snap(inode) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
@@ -373,8 +372,11 @@ more:
                }
 
                if (is_hash_order(ctx->pos)) {
-                       frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
-                                               NULL, NULL);
+                       /* fragtree isn't always accurate. choose frag
+                        * based on previous reply when possible. */
+                       if (frag == (unsigned)-1)
+                               frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+                                                       NULL, NULL);
                } else {
                        frag = fpos_frag(ctx->pos);
                }
@@ -497,6 +499,7 @@ more:
                struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                ino_t ino;
+               u32 ftype;
 
                BUG_ON(rde->offset < ctx->pos);
 
@@ -519,15 +522,17 @@ more:
                ctx->pos++;
        }
 
+       ceph_mdsc_put_request(fi->last_readdir);
+       fi->last_readdir = NULL;
+
        if (fi->next_offset > 2) {
-               ceph_mdsc_put_request(fi->last_readdir);
-               fi->last_readdir = NULL;
+               frag = fi->frag;
                goto more;
        }
 
        /* more frags? */
        if (!ceph_frag_is_rightmost(fi->frag)) {
-               unsigned frag = ceph_frag_next(fi->frag);
+               frag = ceph_frag_next(fi->frag);
                if (is_hash_order(ctx->pos)) {
                        loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
                                                        fi->next_offset, true);