u64 search;
        u64 target;
        u64 nread = 0;
+       u64 nread_max;
        struct extent_buffer *eb;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;
 
-       if (level != 1)
+       if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
                return;
 
        if (!path->nodes[level])
 
        node = path->nodes[level];
 
+       /*
+        * Since the time between visiting leaves is much shorter than the time
+        * between visiting nodes, limit read ahead of nodes to 1, to avoid too
+        * much IO at once (possibly random).
+        */
+       if (path->reada == READA_FORWARD_ALWAYS) {
+               if (level > 1)
+                       nread_max = node->fs_info->nodesize;
+               else
+                       nread_max = SZ_128K;
+       } else {
+               nread_max = SZ_64K;
+       }
+
        search = btrfs_node_blockptr(node, slot);
        blocksize = fs_info->nodesize;
        eb = find_extent_buffer(fs_info, search);
                        if (nr == 0)
                                break;
                        nr--;
-               } else if (path->reada == READA_FORWARD) {
+               } else if (path->reada == READA_FORWARD ||
+                          path->reada == READA_FORWARD_ALWAYS) {
                        nr++;
                        if (nr >= nritems)
                                break;
                                break;
                }
                search = btrfs_node_blockptr(node, nr);
-               if ((search <= target && target - search <= 65536) ||
+               if (path->reada == READA_FORWARD_ALWAYS ||
+                   (search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
                        btrfs_readahead_node_child(node, nr);
                        nread += blocksize;
                }
                nscan++;
-               if ((nread > 65536 || nscan > 32))
+               if (nread > nread_max || nscan > 32)
                        break;
        }
 }
 
        tmp = find_extent_buffer(fs_info, blocknr);
        if (tmp) {
+               if (p->reada == READA_FORWARD_ALWAYS)
+                       reada_for_search(fs_info, p, level, slot, key->objectid);
+
                /* first we do an atomic uptodate check */
                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
                        /*
 
        struct btrfs_key_ptr ptrs[];
 } __attribute__ ((__packed__));
 
+/* Read ahead values for struct btrfs_path.reada */
+enum {
+       READA_NONE,     /* no readahead during tree searches */
+       READA_BACK,     /* readahead toward lower offsets (previous siblings) */
+       READA_FORWARD,  /* readahead toward higher offsets, nearby blocks only */
+       /*
+        * Similar to READA_FORWARD but unlike it:
+        *
+        * 1) It will trigger readahead even for leaves that are not close to
+        *    each other on disk;
+        * 2) It also triggers readahead for nodes;
+        * 3) During a search, even when a node or leaf is already in memory, it
+        *    will still trigger readahead for other nodes and leaves that follow
+        *    it.
+        *
+        * This is meant to be used only when we know we are iterating over the
+        * entire tree or a very large part of it.
+        */
+       READA_FORWARD_ALWAYS,
+};
+
 /*
  * btrfs_paths remember the path taken from the root down to the leaf.
  * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
-enum { READA_NONE, READA_BACK, READA_FORWARD };
 struct btrfs_path {
        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];