* Right now, only two opertaions need to verify layout: glimpse
         * and setattr.
         */
-                            ci_verify_layout:1;
+                            ci_verify_layout:1,
+       /**
+        * file is released, restore has to be triggered by vvp layer
+        */
+                            ci_restore_needed:1;
        /**
         * Number of pages owned by this IO. For invariant checking.
         */
 
 #define OBD_MD_MDS      (0x0000000100000000ULL) /* where an inode lives on */
 #define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
 #define OBD_MD_MEA      (0x0000000400000000ULL) /* CMD split EA  */
-
-/* OBD_MD_MDTIDX is used to get MDT index, but it is never been used overwire,
- * and it is already obsolete since 2.3 */
-/* #define OBD_MD_MDTIDX      (0x0000000800000000ULL) */
+#define OBD_MD_TSTATE      (0x0000000800000000ULL) /* transient state field */
 
 #define OBD_MD_FLXATTR       (0x0000001000000000ULL) /* xattr */
 #define OBD_MD_FLXATTRLS     (0x0000002000000000ULL) /* xattr list */
                ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0));
 }
 
+/* Transient file state bits, carried in the 64-bit mdt_body t_state
+ * field, hence 64 possible states.
+ * NOTE(review): values are built from int shifts; a state above bit 30
+ * would need a 64-bit constant -- confirm when adding new states. */
+enum md_transient_state {
+       MS_RESTORE      = (1 << 0),     /* restore is running */
+};
+
 struct mdt_body {
        struct lu_fid  fid1;
        struct lu_fid  fid2;
        obd_time        ctime;
        __u64     blocks; /* XID, in the case of MDS_READPAGE */
        __u64     ioepoch;
-       __u64          unused1; /* was "ino" until 2.4.0 */
+       __u64          t_state; /* transient file state defined in
+                                * enum md_transient_state
+                                * was "ino" until 2.4.0 */
        __u32     fsuid;
        __u32     fsgid;
        __u32     capability;
 
        cl_io_fini(env, io);
        if (unlikely(io->ci_need_restart))
                goto again;
+       /* HSM import case: file is released, cannot be restored
+        * no need to fail except if restore registration failed
+        * with -ENODATA */
+       if (result == -ENODATA && io->ci_restore_needed &&
+           io->ci_result != -ENODATA)
+               result = 0;
        cl_env_put(env, &refcheck);
        return result;
 }
 
        cl_io_fini(env, io);
        /* If any bit been read/written (result != 0), we just return
         * short read/write instead of restart io. */
-       if (result == 0 && io->ci_need_restart) {
+       if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
                       iot == CIT_READ ? "read" : "write",
                       file->f_dentry->d_name.name, *ppos, count);
                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
        } else {
-               rc = ll_glimpse_size(inode);
+               /* In case of restore, the MDT has the right size and has
+                * already sent it back without granting the layout lock,
+                * inode is up-to-date so glimpse is useless.
+                * Also to glimpse we need the layout, in case of a running
+                * restore the MDT holds the layout lock so the glimpse will
+                * block up to the end of restore (getattr will block)
+                */
+               if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
+                       rc = ll_glimpse_size(inode);
        }
        return rc;
 }
 
        return rc;
 }
+
+/**
+ * Send an HSM restore request for \a inode to the MDT.
+ *
+ * Allocates a hsm_user_request carrying a single hsm_user_item, fills it
+ * with action HUA_RESTORE, archive id 0, no flags and the FID taken from
+ * the inode's ll_inode_info, then submits it through
+ * obd_iocontrol(LL_IOC_HSM_REQUEST) on the metadata export of the inode's
+ * superblock info. The extent length is set to -1 (presumably meaning
+ * "the whole file" -- to be confirmed against the HSM request definition).
+ * The request buffer is freed before returning, whatever the outcome.
+ *
+ * \param[in] inode    inode of the file to restore
+ *
+ * \retval -ENOMEM     the request buffer could not be allocated
+ * \retval             otherwise, the value returned by obd_iocontrol()
+ */
+int ll_layout_restore(struct inode *inode)
+{
+       struct hsm_user_request *hur;
+       int                      len, rc;
+
+       len = sizeof(struct hsm_user_request) +
+             sizeof(struct hsm_user_item);
+       OBD_ALLOC(hur, len);
+       if (hur == NULL)
+               return -ENOMEM;
+
+       hur->hur_request.hr_action = HUA_RESTORE;
+       hur->hur_request.hr_archive_id = 0;
+       hur->hur_request.hr_flags = 0;
+       memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
+              sizeof(hur->hur_user_item[0].hui_fid));
+       hur->hur_user_item[0].hui_extent.length = -1;
+       hur->hur_request.hr_itemcount = 1;
+       rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
+                          len, hur, NULL);
+       OBD_FREE(hur, len);
+       return rc;
+}
 
        LLIF_SRVLOCK        = (1 << 5),
        /* File data is modified. */
        LLIF_DATA_MODIFIED      = (1 << 6),
+       /* File is being restored */
+       LLIF_FILE_RESTORING     = (1 << 7),
 };
 
 struct ll_inode_info {
 
 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
 int ll_layout_refresh(struct inode *inode, __u32 *gen);
+int ll_layout_restore(struct inode *inode);
 
 #endif /* LLITE_INTERNAL_H */
 
        struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data = NULL;
        struct md_open_data *mod = NULL;
+       bool file_is_released = false;
        int rc = 0, rc1 = 0;
 
        CDEBUG(D_VFSTRACE, "%s: setattr inode %p/fid:"DFID" from %llu to %llu, "
            (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
                op_data->op_flags = MF_EPOCH_OPEN;
 
+       /* truncate on a released file must fail with -ENODATA,
+        * so size must not be set on MDS for released file
+        * but other attributes must be set
+        */
+       if (S_ISREG(inode->i_mode)) {
+               struct lov_stripe_md *lsm;
+               __u32 gen;
+
+               ll_layout_refresh(inode, &gen);
+               lsm = ccc_inode_lsm_get(inode);
+               if (lsm && lsm->lsm_pattern & LOV_PATTERN_F_RELEASED)
+                       file_is_released = true;
+               ccc_inode_lsm_put(inode, lsm);
+       }
+
+       /* clear size attr for released file
+        * we clear the attribute sent to MDT in op_data, not the original
+        * received from caller in attr which is used later to
+        * decide return code */
+       if (file_is_released && (attr->ia_valid & ATTR_SIZE))
+               op_data->op_attr.ia_valid &= ~ATTR_SIZE;
+
        rc = ll_md_setattr(dentry, op_data, &mod);
        if (rc)
                GOTO(out, rc);
 
+       /* truncate failed, others succeed */
+       if (file_is_released) {
+               if (attr->ia_valid & ATTR_SIZE)
+                       GOTO(out, rc = -ENODATA);
+               else
+                       GOTO(out, rc = 0);
+       }
+
        /* RPC to MDT is sent, cancel data modification flag */
        if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
                spin_lock(&lli->lli_lock);
                LASSERT(md->oss_capa);
                ll_add_capa(inode, md->oss_capa);
        }
+
+       if (body->valid & OBD_MD_TSTATE) {
+               if (body->t_state & MS_RESTORE)
+                       lli->lli_flags |= LLIF_FILE_RESTORING;
+       }
 }
 
 void ll_read_inode2(struct inode *inode, void *opaque)
 
 
        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
 
-       CDEBUG(D_VFSTRACE, "ignore/verify layout %d/%d, layout version %d.\n",
-               io->ci_ignore_layout, io->ci_verify_layout, cio->cui_layout_gen);
+       CDEBUG(D_VFSTRACE, DFID
+              " ignore/verify layout %d/%d, layout version %d restore needed %d\n",
+              PFID(lu_object_fid(&obj->co_lu)),
+              io->ci_ignore_layout, io->ci_verify_layout,
+              cio->cui_layout_gen, io->ci_restore_needed);
+
+       if (io->ci_restore_needed == 1) {
+               int     rc;
+
+               /* file was detected as released, we need to restore it
+                * before finishing the io
+                */
+               rc = ll_layout_restore(ccc_object_inode(obj));
+               /* if restore registration failed, no restart,
+                * we will return -ENODATA */
+               /* The layout will change after restore, so we need to
+                * block on layout lock held by the MDT
+                * as MDT will not send new layout in lvb (see LU-3124)
+                * we have to explicitly fetch it, all this will be done
+                * by ll_layout_refresh()
+                */
+               if (rc == 0) {
+                       io->ci_restore_needed = 0;
+                       io->ci_need_restart = 1;
+                       io->ci_verify_layout = 1;
+               } else {
+                       io->ci_restore_needed = 1;
+                       io->ci_need_restart = 0;
+                       io->ci_verify_layout = 0;
+                       io->ci_result = rc;
+               }
+       }
 
        if (!io->ci_ignore_layout && io->ci_verify_layout) {
                __u32 gen = 0;
                /* check layout version */
                ll_layout_refresh(ccc_object_inode(obj), &gen);
                io->ci_need_restart = cio->cui_layout_gen != gen;
-               if (io->ci_need_restart)
-                       CDEBUG(D_VFSTRACE, "layout changed from %d to %d.\n",
-                               cio->cui_layout_gen, gen);
+               if (io->ci_need_restart) {
+                       CDEBUG(D_VFSTRACE,
+                              DFID" layout changed from %d to %d.\n",
+                              PFID(lu_object_fid(&obj->co_lu)),
+                              cio->cui_layout_gen, gen);
+                       /* today successful restore is the only possible
+                        * case */
+                       /* restore was done, clear restoring state */
+                       ll_i2info(ccc_object_inode(obj))->lli_flags &=
+                               ~LLIF_FILE_RESTORING;
+               }
        }
 }
 
 
        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
 
+       CDEBUG(D_VFSTRACE, DFID
+              " ignore/verify layout %d/%d, layout version %d restore needed %d\n",
+              PFID(lu_object_fid(&obj->co_lu)),
+              io->ci_ignore_layout, io->ci_verify_layout,
+              cio->cui_layout_gen, io->ci_restore_needed);
+
        CL_IO_SLICE_CLEAN(cio, cui_cl);
        cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
        vio->cui_ra_window_set = 0;
 
                LASSERTF(0, "invalid type %d\n", io->ci_type);
        case CIT_MISC:
        case CIT_FSYNC:
-               result = +1;
+               result = 1;
                break;
        case CIT_SETATTR:
+               /* the truncate to 0 is managed by MDT:
+                * - in open, for open O_TRUNC
+                * - in setattr, for truncate
+                */
+               /* the truncate is for size > 0 so triggers a restore */
+               if (cl_io_is_trunc(io))
+                       io->ci_restore_needed = 1;
+               result = -ENODATA;
+               break;
        case CIT_READ:
        case CIT_WRITE:
        case CIT_FAULT:
-               /* TODO: need to restore the file. */
-               result = -EBADF;
+               io->ci_restore_needed = 1;
+               result = -ENODATA;
                break;
        }
        if (result == 0) {
 
        __swab64s(&b->ctime);
        __swab64s(&b->blocks);
        __swab64s(&b->ioepoch);
-       CLASSERT(offsetof(typeof(*b), unused1) != 0);
+       __swab64s(&b->t_state);
        __swab32s(&b->fsuid);
        __swab32s(&b->fsgid);
        __swab32s(&b->capability);
 
 {
         /* Wire protocol assertions generated by 'wirecheck'
          * (make -C lustre/utils newwiretest)
-         * running on Linux deva 2.6.32.279.lustre #5 SMP Tue Apr 9 22:52:17 CST 2013 x86_64 x86_64 x
-         * with gcc version 4.4.4 20100726 (Red Hat 4.4.4-13) (GCC)  */
-
+         * running on Linux centos6-bis 2.6.32-358.0.1.el6-head
+         * #3 SMP Wed Apr 17 17:37:43 CEST 2013
+         * with gcc version 4.4.6 20110731 (Red Hat 4.4.6-3) (GCC)
+         */
 
        /* Constants... */
        LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
                 OBD_MD_REINT);
        LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
                 OBD_MD_MEA);
+       LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL),
+                "found 0x%.16llxULL\n", OBD_MD_TSTATE);
        LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
                 OBD_MD_FLXATTR);
        LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
                 (long long)(int)offsetof(struct mdt_body, blocks));
        LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n",
                 (long long)(int)sizeof(((struct mdt_body *)0)->blocks));
-       LASSERTF((int)offsetof(struct mdt_body, unused1) == 96, "found %lld\n",
-                (long long)(int)offsetof(struct mdt_body, unused1));
-       LASSERTF((int)sizeof(((struct mdt_body *)0)->unused1) == 8, "found %lld\n",
-                (long long)(int)sizeof(((struct mdt_body *)0)->unused1));
+       LASSERTF((int)offsetof(struct mdt_body, t_state) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, t_state));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->t_state) == 8,
+                "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->t_state));
        LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n",
                 (long long)(int)offsetof(struct mdt_body, fsuid));
        LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n",