www.infradead.org Git - users/dwmw2/linux.git/blob

1 // SPDX-License-Identifier: GPL-2.0

2 #include <linux/ceph/ceph_debug.h>

3

4 #include <linux/fs.h>

5 #include <linux/wait.h>

6 #include <linux/slab.h>

7 #include <linux/gfp.h>

8 #include <linux/sched.h>

9 #include <linux/debugfs.h>

10 #include <linux/seq_file.h>

11 #include <linux/ratelimit.h>

12 #include <linux/bits.h>

13 #include <linux/ktime.h>

14 #include <linux/bitmap.h>

15 #include <linux/mnt_idmapping.h>

16

17 #include "super.h"

18 #include "mds_client.h"

19 #include "crypto.h"

20

21 #include <linux/ceph/ceph_features.h>

22 #include <linux/ceph/messenger.h>

23 #include <linux/ceph/decode.h>

24 #include <linux/ceph/pagelist.h>

25 #include <linux/ceph/auth.h>

26 #include <linux/ceph/debugfs.h>

27

28 #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)

29

30 /*

31 * A cluster of MDS (metadata server) daemons is responsible for

32 * managing the file system namespace (the directory hierarchy and

33 * inodes) and for coordinating shared access to storage. Metadata is

34 * partitioned hierarchically across a number of servers, and that

35 * partition varies over time as the cluster adjusts the distribution

36 * in order to balance load.

37 *

38 * The MDS client is primarily responsible for managing synchronous

39 * metadata requests for operations like open, unlink, and so forth.

40 * If there is an MDS failure, we find out about it when we (possibly

41 * request and) receive a new MDS map, and can resubmit affected

42 * requests.

43 *

44 * For the most part, though, we take advantage of a lossless

45 * communications channel to the MDS, and do not need to worry about

46 * timing out or resubmitting requests.

47 *

48 * We maintain a stateful "session" with each MDS we interact with.

49 * Within each session, we send periodic heartbeat messages to ensure

50 * any capabilities or leases we have been issued remain valid. If

51 * the session times out and goes stale, our leases and capabilities

52 * are no longer valid.

53 */

54

55 struct ceph_reconnect_state {

56 struct ceph_mds_session *session;

57 int nr_caps, nr_realms;

58 struct ceph_pagelist *pagelist;

59 unsigned msg_version;

60 bool allow_multi;

61 };

62

63 static void __wake_requests(struct ceph_mds_client *mdsc,

64 struct list_head *head);

65 static void ceph_cap_release_work(struct work_struct *work);

66 static void ceph_cap_reclaim_work(struct work_struct *work);

67

68 static const struct ceph_connection_operations mds_con_ops;

69

70

71 /*

72 * mds reply parsing

73 */

74

75 static int parse_reply_info_quota(void **p, void *end,

76 struct ceph_mds_reply_info_in *info)

77 {

78 u8 struct_v, struct_compat;

79 u32 struct_len;

80

81 ceph_decode_8_safe(p, end, struct_v, bad);

82 ceph_decode_8_safe(p, end, struct_compat, bad);

83 /* struct_v is expected to be >= 1. we only

84 * understand encoding with struct_compat == 1. */

85 if (!struct_v || struct_compat != 1)

86 goto bad;

87 ceph_decode_32_safe(p, end, struct_len, bad);

88 ceph_decode_need(p, end, struct_len, bad);

89 end = *p + struct_len;

90 ceph_decode_64_safe(p, end, info->max_bytes, bad);

91 ceph_decode_64_safe(p, end, info->max_files, bad);

92 *p = end;

93 return 0;

94 bad:

95 return -EIO;

96 }

97

98 /*

99 * parse individual inode info

100 */

101 static int parse_reply_info_in(void **p, void *end,

102 struct ceph_mds_reply_info_in *info,

103 u64 features)

104 {

105 int err = 0;

106 u8 struct_v = 0;

107

108 if (features == (u64)-1) {

109 u32 struct_len;

110 u8 struct_compat;

111 ceph_decode_8_safe(p, end, struct_v, bad);

112 ceph_decode_8_safe(p, end, struct_compat, bad);

113 /* struct_v is expected to be >= 1. we only understand

114 * encoding with struct_compat == 1. */

115 if (!struct_v || struct_compat != 1)

116 goto bad;

117 ceph_decode_32_safe(p, end, struct_len, bad);

118 ceph_decode_need(p, end, struct_len, bad);

119 end = *p + struct_len;

120 }

121

122 ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);

123 info->in = *p;

124 *p += sizeof(struct ceph_mds_reply_inode) +

125 sizeof(*info->in->fragtree.splits) *

126 le32_to_cpu(info->in->fragtree.nsplits);

127

128 ceph_decode_32_safe(p, end, info->symlink_len, bad);

129 ceph_decode_need(p, end, info->symlink_len, bad);

130 info->symlink = *p;

131 *p += info->symlink_len;

132

133 ceph_decode_copy_safe(p, end, &info->dir_layout,

134 sizeof(info->dir_layout), bad);

135 ceph_decode_32_safe(p, end, info->xattr_len, bad);

136 ceph_decode_need(p, end, info->xattr_len, bad);

137 info->xattr_data = *p;

138 *p += info->xattr_len;

139

140 if (features == (u64)-1) {

141 /* inline data */

142 ceph_decode_64_safe(p, end, info->inline_version, bad);

143 ceph_decode_32_safe(p, end, info->inline_len, bad);

144 ceph_decode_need(p, end, info->inline_len, bad);

145 info->inline_data = *p;

146 *p += info->inline_len;

147 /* quota */

148 err = parse_reply_info_quota(p, end, info);

149 if (err < 0)

150 goto out_bad;

151 /* pool namespace */

152 ceph_decode_32_safe(p, end, info->pool_ns_len, bad);

153 if (info->pool_ns_len > 0) {

154 ceph_decode_need(p, end, info->pool_ns_len, bad);

155 info->pool_ns_data = *p;

156 *p += info->pool_ns_len;

157 }

158

159 /* btime */

160 ceph_decode_need(p, end, sizeof(info->btime), bad);

161 ceph_decode_copy(p, &info->btime, sizeof(info->btime));

162

163 /* change attribute */

164 ceph_decode_64_safe(p, end, info->change_attr, bad);

165

166 /* dir pin */

167 if (struct_v >= 2) {

168 ceph_decode_32_safe(p, end, info->dir_pin, bad);

169 } else {

170 info->dir_pin = -ENODATA;

171 }

172

173 /* snapshot birth time, remains zero for v<=2 */

174 if (struct_v >= 3) {

175 ceph_decode_need(p, end, sizeof(info->snap_btime), bad);

176 ceph_decode_copy(p, &info->snap_btime,

177 sizeof(info->snap_btime));

178 } else {

179 memset(&info->snap_btime, 0, sizeof(info->snap_btime));

180 }

181

182 /* snapshot count, remains zero for v<=3 */

183 if (struct_v >= 4) {

184 ceph_decode_64_safe(p, end, info->rsnaps, bad);

185 } else {

186 info->rsnaps = 0;

187 }

188

189 if (struct_v >= 5) {

190 u32 alen;

191

192 ceph_decode_32_safe(p, end, alen, bad);

193

194 while (alen--) {

195 u32 len;

196

197 /* key */

198 ceph_decode_32_safe(p, end, len, bad);

199 ceph_decode_skip_n(p, end, len, bad);

200 /* value */

201 ceph_decode_32_safe(p, end, len, bad);

202 ceph_decode_skip_n(p, end, len, bad);

203 }

204 }

205

206 /* fscrypt flag -- ignore */

207 if (struct_v >= 6)

208 ceph_decode_skip_8(p, end, bad);

209

210 info->fscrypt_auth = NULL;

211 info->fscrypt_auth_len = 0;

212 info->fscrypt_file = NULL;

213 info->fscrypt_file_len = 0;

214 if (struct_v >= 7) {

215 ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);

216 if (info->fscrypt_auth_len) {

217 info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,

218 GFP_KERNEL);

219 if (!info->fscrypt_auth)

220 return -ENOMEM;

221 ceph_decode_copy_safe(p, end, info->fscrypt_auth,

222 info->fscrypt_auth_len, bad);

223 }

224 ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);

225 if (info->fscrypt_file_len) {

226 info->fscrypt_file = kmalloc(info->fscrypt_file_len,

227 GFP_KERNEL);

228 if (!info->fscrypt_file)

229 return -ENOMEM;

230 ceph_decode_copy_safe(p, end, info->fscrypt_file,

231 info->fscrypt_file_len, bad);

232 }

233 }

234 *p = end;

235 } else {

236 /* legacy (unversioned) struct */

237 if (features & CEPH_FEATURE_MDS_INLINE_DATA) {

238 ceph_decode_64_safe(p, end, info->inline_version, bad);

239 ceph_decode_32_safe(p, end, info->inline_len, bad);

240 ceph_decode_need(p, end, info->inline_len, bad);

241 info->inline_data = *p;

242 *p += info->inline_len;

243 } else

244 info->inline_version = CEPH_INLINE_NONE;

245

246 if (features & CEPH_FEATURE_MDS_QUOTA) {

247 err = parse_reply_info_quota(p, end, info);

248 if (err < 0)

249 goto out_bad;

250 } else {

251 info->max_bytes = 0;

252 info->max_files = 0;

253 }

254

255 info->pool_ns_len = 0;

256 info->pool_ns_data = NULL;

257 if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {

258 ceph_decode_32_safe(p, end, info->pool_ns_len, bad);

259 if (info->pool_ns_len > 0) {

260 ceph_decode_need(p, end, info->pool_ns_len, bad);

261 info->pool_ns_data = *p;

262 *p += info->pool_ns_len;

263 }

264 }

265

266 if (features & CEPH_FEATURE_FS_BTIME) {

267 ceph_decode_need(p, end, sizeof(info->btime), bad);

268 ceph_decode_copy(p, &info->btime, sizeof(info->btime));

269 ceph_decode_64_safe(p, end, info->change_attr, bad);

270 }

271

272 info->dir_pin = -ENODATA;

273 /* info->snap_btime and info->rsnaps remain zero */

274 }

275 return 0;

276 bad:

277 err = -EIO;

278 out_bad:

279 return err;

280 }

281

282 static int parse_reply_info_dir(void **p, void *end,

283 struct ceph_mds_reply_dirfrag **dirfrag,

284 u64 features)

285 {

286 if (features == (u64)-1) {

287 u8 struct_v, struct_compat;

288 u32 struct_len;

289 ceph_decode_8_safe(p, end, struct_v, bad);

290 ceph_decode_8_safe(p, end, struct_compat, bad);

291 /* struct_v is expected to be >= 1. we only understand

292 * encoding whose struct_compat == 1. */

293 if (!struct_v || struct_compat != 1)

294 goto bad;

295 ceph_decode_32_safe(p, end, struct_len, bad);

296 ceph_decode_need(p, end, struct_len, bad);

297 end = *p + struct_len;

298 }

299

300 ceph_decode_need(p, end, sizeof(**dirfrag), bad);

301 *dirfrag = *p;

302 *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);

303 if (unlikely(*p > end))

304 goto bad;

305 if (features == (u64)-1)

306 *p = end;

307 return 0;

308 bad:

309 return -EIO;

310 }

311

312 static int parse_reply_info_lease(void **p, void *end,

313 struct ceph_mds_reply_lease **lease,

314 u64 features, u32 *altname_len, u8 **altname)

315 {

316 u8 struct_v;

317 u32 struct_len;

318 void *lend;

319

320 if (features == (u64)-1) {

321 u8 struct_compat;

322

323 ceph_decode_8_safe(p, end, struct_v, bad);

324 ceph_decode_8_safe(p, end, struct_compat, bad);

325

326 /* struct_v is expected to be >= 1. we only understand

327 * encoding whose struct_compat == 1. */

328 if (!struct_v || struct_compat != 1)

329 goto bad;

330

331 ceph_decode_32_safe(p, end, struct_len, bad);

332 } else {

333 struct_len = sizeof(**lease);

334 *altname_len = 0;

335 *altname = NULL;

336 }

337

338 lend = *p + struct_len;

339 ceph_decode_need(p, end, struct_len, bad);

340 *lease = *p;

341 *p += sizeof(**lease);

342

343 if (features == (u64)-1) {

344 if (struct_v >= 2) {

345 ceph_decode_32_safe(p, end, *altname_len, bad);

346 ceph_decode_need(p, end, *altname_len, bad);

347 *altname = *p;

348 *p += *altname_len;

349 } else {

350 *altname = NULL;

351 *altname_len = 0;

352 }

353 }

354 *p = lend;

355 return 0;

356 bad:

357 return -EIO;

358 }

359

360 /*

361 * parse a normal reply, which may contain a (dir+)dentry and/or a

362 * target inode.

363 */

364 static int parse_reply_info_trace(void **p, void *end,

365 struct ceph_mds_reply_info_parsed *info,

366 u64 features)

367 {

368 int err;

369

370 if (info->head->is_dentry) {

371 err = parse_reply_info_in(p, end, &info->diri, features);

372 if (err < 0)

373 goto out_bad;

374

375 err = parse_reply_info_dir(p, end, &info->dirfrag, features);

376 if (err < 0)

377 goto out_bad;

378

379 ceph_decode_32_safe(p, end, info->dname_len, bad);

380 ceph_decode_need(p, end, info->dname_len, bad);

381 info->dname = *p;

382 *p += info->dname_len;

383

384 err = parse_reply_info_lease(p, end, &info->dlease, features,

385 &info->altname_len, &info->altname);

386 if (err < 0)

387 goto out_bad;

388 }

389

390 if (info->head->is_target) {

391 err = parse_reply_info_in(p, end, &info->targeti, features);

392 if (err < 0)

393 goto out_bad;

394 }

395

396 if (unlikely(*p != end))

397 goto bad;

398 return 0;

399

400 bad:

401 err = -EIO;

402 out_bad:

403 pr_err("problem parsing mds trace %d\n", err);

404 return err;

405 }

406

407 /*

408 * parse readdir results

409 */

410 static int parse_reply_info_readdir(void **p, void *end,

411 struct ceph_mds_request *req,

412 u64 features)

413 {

414 struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;

415 struct ceph_client *cl = req->r_mdsc->fsc->client;

416 u32 num, i = 0;

417 int err;

418

419 err = parse_reply_info_dir(p, end, &info->dir_dir, features);

420 if (err < 0)

421 goto out_bad;

422

423 ceph_decode_need(p, end, sizeof(num) + 2, bad);

424 num = ceph_decode_32(p);

425 {

426 u16 flags = ceph_decode_16(p);

427 info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);

428 info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);

429 info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);

430 info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);

431 }

432 if (num == 0)

433 goto done;

434

435 BUG_ON(!info->dir_entries);

436 if ((unsigned long)(info->dir_entries + num) >

437 (unsigned long)info->dir_entries + info->dir_buf_size) {

438 pr_err_client(cl, "dir contents are larger than expected\n");

439 WARN_ON(1);

440 goto bad;

441 }

442

443 info->dir_nr = num;

444 while (num) {

445 struct inode *inode = d_inode(req->r_dentry);

446 struct ceph_inode_info *ci = ceph_inode(inode);

447 struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;

448 struct fscrypt_str tname = FSTR_INIT(NULL, 0);

449 struct fscrypt_str oname = FSTR_INIT(NULL, 0);

450 struct ceph_fname fname;

451 u32 altname_len, _name_len;

452 u8 *altname, *_name;

453

454 /* dentry */

455 ceph_decode_32_safe(p, end, _name_len, bad);

456 ceph_decode_need(p, end, _name_len, bad);

457 _name = *p;

458 *p += _name_len;

459 doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name);

460

461 if (info->hash_order)

462 rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,

463 _name, _name_len);

464

465 /* dentry lease */

466 err = parse_reply_info_lease(p, end, &rde->lease, features,

467 &altname_len, &altname);

468 if (err)

469 goto out_bad;

470

471 /*

472 * Try to dencrypt the dentry names and update them

473 * in the ceph_mds_reply_dir_entry struct.

474 */

475 fname.dir = inode;

476 fname.name = _name;

477 fname.name_len = _name_len;

478 fname.ctext = altname;

479 fname.ctext_len = altname_len;

480 /*

481 * The _name_len maybe larger than altname_len, such as

482 * when the human readable name length is in range of

483 * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE),

484 * then the copy in ceph_fname_to_usr will corrupt the

485 * data if there has no encryption key.

486 *

487 * Just set the no_copy flag and then if there has no

488 * encryption key the oname.name will be assigned to

489 * _name always.

490 */

491 fname.no_copy = true;

492 if (altname_len == 0) {

493 /*

494 * Set tname to _name, and this will be used

495 * to do the base64_decode in-place. It's

496 * safe because the decoded string should

497 * always be shorter, which is 3/4 of origin

498 * string.

499 */

500 tname.name = _name;

501

502 /*

503 * Set oname to _name too, and this will be

504 * used to do the dencryption in-place.

505 */

506 oname.name = _name;

507 oname.len = _name_len;

508 } else {

509 /*

510 * This will do the decryption only in-place

511 * from altname cryptext directly.

512 */

513 oname.name = altname;

514 oname.len = altname_len;

515 }

516 rde->is_nokey = false;

517 err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);

518 if (err) {

519 pr_err_client(cl, "unable to decode %.*s, got %d\n",

520 _name_len, _name, err);

521 goto out_bad;

522 }

523 rde->name = oname.name;

524 rde->name_len = oname.len;

525

526 /* inode */

527 err = parse_reply_info_in(p, end, &rde->inode, features);

528 if (err < 0)

529 goto out_bad;

530 /* ceph_readdir_prepopulate() will update it */

531 rde->offset = 0;

532 i++;

533 num--;

534 }

535

536 done:

537 /* Skip over any unrecognized fields */

538 *p = end;

539 return 0;

540

541 bad:

542 err = -EIO;

543 out_bad:

544 pr_err_client(cl, "problem parsing dir contents %d\n", err);

545 return err;

546 }

547

548 /*

549 * parse fcntl F_GETLK results

550 */

551 static int parse_reply_info_filelock(void **p, void *end,

552 struct ceph_mds_reply_info_parsed *info,

553 u64 features)

554 {

555 if (*p + sizeof(*info->filelock_reply) > end)

556 goto bad;

557

558 info->filelock_reply = *p;

559

560 /* Skip over any unrecognized fields */

561 *p = end;

562 return 0;

563 bad:

564 return -EIO;

565 }

566

567

568 #if BITS_PER_LONG == 64

569

570 #define DELEGATED_INO_AVAILABLE xa_mk_value(1)

571

572 static int ceph_parse_deleg_inos(void **p, void *end,

573 struct ceph_mds_session *s)

574 {

575 struct ceph_client *cl = s->s_mdsc->fsc->client;

576 u32 sets;

577

578 ceph_decode_32_safe(p, end, sets, bad);

579 doutc(cl, "got %u sets of delegated inodes\n", sets);

580 while (sets--) {

581 u64 start, len;

582

583 ceph_decode_64_safe(p, end, start, bad);

584 ceph_decode_64_safe(p, end, len, bad);

585

586 /* Don't accept a delegation of system inodes */

587 if (start < CEPH_INO_SYSTEM_BASE) {

588 pr_warn_ratelimited_client(cl,

589 "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",

590 start, len);

591 continue;

592 }

593 while (len--) {

594 int err = xa_insert(&s->s_delegated_inos, start++,

595 DELEGATED_INO_AVAILABLE,

596 GFP_KERNEL);

597 if (!err) {

598 doutc(cl, "added delegated inode 0x%llx\n", start - 1);

599 } else if (err == -EBUSY) {

600 pr_warn_client(cl,

601 "MDS delegated inode 0x%llx more than once.\n",

602 start - 1);

603 } else {

604 return err;

605 }

606 }

607 }

608 return 0;

609 bad:

610 return -EIO;

611 }

612

613 u64 ceph_get_deleg_ino(struct ceph_mds_session *s)

614 {

615 unsigned long ino;

616 void *val;

617

618 xa_for_each(&s->s_delegated_inos, ino, val) {

619 val = xa_erase(&s->s_delegated_inos, ino);

620 if (val == DELEGATED_INO_AVAILABLE)

621 return ino;

622 }

623 return 0;

624 }

625

626 int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)

627 {

628 return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,

629 GFP_KERNEL);

630 }

631 #else /* BITS_PER_LONG == 64 */

632 /*

633 * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just

634 * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top

635 * and bottom words?

636 */

637 static int ceph_parse_deleg_inos(void **p, void *end,

638 struct ceph_mds_session *s)

639 {

640 u32 sets;

641

642 ceph_decode_32_safe(p, end, sets, bad);

643 if (sets)

644 ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);

645 return 0;

646 bad:

647 return -EIO;

648 }

649

650 u64 ceph_get_deleg_ino(struct ceph_mds_session *s)

651 {

652 return 0;

653 }

654

655 int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)

656 {

657 return 0;

658 }

659 #endif /* BITS_PER_LONG == 64 */

660

661 /*

662 * parse create results

663 */

664 static int parse_reply_info_create(void **p, void *end,

665 struct ceph_mds_reply_info_parsed *info,

666 u64 features, struct ceph_mds_session *s)

667 {

668 int ret;

669

670 if (features == (u64)-1 ||

671 (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {

672 if (*p == end) {

673 /* Malformed reply? */

674 info->has_create_ino = false;

675 } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {

676 info->has_create_ino = true;

677 /* struct_v, struct_compat, and len */

678 ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);

679 ceph_decode_64_safe(p, end, info->ino, bad);

680 ret = ceph_parse_deleg_inos(p, end, s);

681 if (ret)

682 return ret;

683 } else {

684 /* legacy */

685 ceph_decode_64_safe(p, end, info->ino, bad);

686 info->has_create_ino = true;

687 }

688 } else {

689 if (*p != end)

690 goto bad;

691 }

692

693 /* Skip over any unrecognized fields */

694 *p = end;

695 return 0;

696 bad:

697 return -EIO;

698 }

699

700 static int parse_reply_info_getvxattr(void **p, void *end,

701 struct ceph_mds_reply_info_parsed *info,

702 u64 features)

703 {

704 u32 value_len;

705

706 ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */

707 ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */

708 ceph_decode_skip_32(p, end, bad); /* skip payload length */

709

710 ceph_decode_32_safe(p, end, value_len, bad);

711

712 if (value_len == end - *p) {

713 info->xattr_info.xattr_value = *p;

714 info->xattr_info.xattr_value_len = value_len;

715 *p = end;

716 return value_len;

717 }

718 bad:

719 return -EIO;

720 }

721

722 /*

723 * parse extra results

724 */

725 static int parse_reply_info_extra(void **p, void *end,

726 struct ceph_mds_request *req,

727 u64 features, struct ceph_mds_session *s)

728 {

729 struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;

730 u32 op = le32_to_cpu(info->head->op);

731

732 if (op == CEPH_MDS_OP_GETFILELOCK)

733 return parse_reply_info_filelock(p, end, info, features);

734 else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)

735 return parse_reply_info_readdir(p, end, req, features);

736 else if (op == CEPH_MDS_OP_CREATE)

737 return parse_reply_info_create(p, end, info, features, s);

738 else if (op == CEPH_MDS_OP_GETVXATTR)

739 return parse_reply_info_getvxattr(p, end, info, features);

740 else

741 return -EIO;

742 }

743

744 /*

745 * parse entire mds reply

746 */

747 static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,

748 struct ceph_mds_request *req, u64 features)

749 {

750 struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;

751 struct ceph_client *cl = s->s_mdsc->fsc->client;

752 void *p, *end;

753 u32 len;

754 int err;

755

756 info->head = msg->front.iov_base;

757 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);

758 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);

759

760 /* trace */

761 ceph_decode_32_safe(&p, end, len, bad);

762 if (len > 0) {

763 ceph_decode_need(&p, end, len, bad);

764 err = parse_reply_info_trace(&p, p+len, info, features);

765 if (err < 0)

766 goto out_bad;

767 }

768

769 /* extra */

770 ceph_decode_32_safe(&p, end, len, bad);

771 if (len > 0) {

772 ceph_decode_need(&p, end, len, bad);

773 err = parse_reply_info_extra(&p, p+len, req, features, s);

774 if (err < 0)

775 goto out_bad;

776 }

777

778 /* snap blob */

779 ceph_decode_32_safe(&p, end, len, bad);

780 info->snapblob_len = len;

781 info->snapblob = p;

782 p += len;

783

784 if (p != end)

785 goto bad;

786 return 0;

787

788 bad:

789 err = -EIO;

790 out_bad:

791 pr_err_client(cl, "mds parse_reply err %d\n", err);

792 ceph_msg_dump(msg);

793 return err;

794 }

795

796 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)

797 {

798 int i;

799

800 kfree(info->diri.fscrypt_auth);

801 kfree(info->diri.fscrypt_file);

802 kfree(info->targeti.fscrypt_auth);

803 kfree(info->targeti.fscrypt_file);

804 if (!info->dir_entries)

805 return;

806

807 for (i = 0; i < info->dir_nr; i++) {

808 struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;

809

810 kfree(rde->inode.fscrypt_auth);

811 kfree(rde->inode.fscrypt_file);

812 }

813 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));

814 }

815

816 /*

817 * In async unlink case the kclient won't wait for the first reply

818 * from MDS and just drop all the links and unhash the dentry and then

819 * succeeds immediately.

820 *

821 * For any new create/link/rename,etc requests followed by using the

822 * same file names we must wait for the first reply of the inflight

823 * unlink request, or the MDS possibly will fail these following

824 * requests with -EEXIST if the inflight async unlink request was

825 * delayed for some reasons.

826 *

827 * And the worst case is that for the none async openc request it will

828 * successfully open the file if the CDentry hasn't been unlinked yet,

829 * but later the previous delayed async unlink request will remove the

830 * CDentry. That means the just created file is possibly deleted later

831 * by accident.

832 *

833 * We need to wait for the inflight async unlink requests to finish

834 * when creating new files/directories by using the same file names.

835 */

836 int ceph_wait_on_conflict_unlink(struct dentry *dentry)

837 {

838 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);

839 struct ceph_client *cl = fsc->client;

840 struct dentry *pdentry = dentry->d_parent;

841 struct dentry *udentry, *found = NULL;

842 struct ceph_dentry_info *di;

843 struct qstr dname;

844 u32 hash = dentry->d_name.hash;

845 int err;

846

847 dname.name = dentry->d_name.name;

848 dname.len = dentry->d_name.len;

849

850 rcu_read_lock();

851 hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,

852 hnode, hash) {

853 udentry = di->dentry;

854

855 spin_lock(&udentry->d_lock);

856 if (udentry->d_name.hash != hash)

857 goto next;

858 if (unlikely(udentry->d_parent != pdentry))

859 goto next;

860 if (!hash_hashed(&di->hnode))

861 goto next;

862

863 if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))

864 pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n",

865 dentry, dentry);

866

867 if (!d_same_name(udentry, pdentry, &dname))

868 goto next;

869

870 found = dget_dlock(udentry);

871 spin_unlock(&udentry->d_lock);

872 break;

873 next:

874 spin_unlock(&udentry->d_lock);

875 }

876 rcu_read_unlock();

877

878 if (likely(!found))

879 return 0;

880

881 doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry,

882 found, found);

883

884 err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,

885 TASK_KILLABLE);

886 dput(found);

887 return err;

888 }

889

890

891 /*

892 * sessions

893 */

894 const char *ceph_session_state_name(int s)

895 {

896 switch (s) {

897 case CEPH_MDS_SESSION_NEW: return "new";

898 case CEPH_MDS_SESSION_OPENING: return "opening";

899 case CEPH_MDS_SESSION_OPEN: return "open";

900 case CEPH_MDS_SESSION_HUNG: return "hung";

901 case CEPH_MDS_SESSION_CLOSING: return "closing";

902 case CEPH_MDS_SESSION_CLOSED: return "closed";

903 case CEPH_MDS_SESSION_RESTARTING: return "restarting";

904 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";

905 case CEPH_MDS_SESSION_REJECTED: return "rejected";

906 default: return "???";

907 }

908 }

909

910 struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)

911 {

912 if (refcount_inc_not_zero(&s->s_ref))

913 return s;

914 return NULL;

915 }

916

917 void ceph_put_mds_session(struct ceph_mds_session *s)

918 {

919 if (IS_ERR_OR_NULL(s))

920 return;

921

922 if (refcount_dec_and_test(&s->s_ref)) {

923 if (s->s_auth.authorizer)

924 ceph_auth_destroy_authorizer(s->s_auth.authorizer);

925 WARN_ON(mutex_is_locked(&s->s_mutex));

926 xa_destroy(&s->s_delegated_inos);

927 kfree(s);

928 }

929 }

930

931 /*

932 * called under mdsc->mutex

933 */

934 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,

935 int mds)

936 {

937 if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])

938 return NULL;

939 return ceph_get_mds_session(mdsc->sessions[mds]);

940 }

941

942 static bool __have_session(struct ceph_mds_client *mdsc, int mds)

943 {

944 if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])

945 return false;

946 else

947 return true;

948 }

949

950 static int __verify_registered_session(struct ceph_mds_client *mdsc,

951 struct ceph_mds_session *s)

952 {

953 if (s->s_mds >= mdsc->max_sessions ||

954 mdsc->sessions[s->s_mds] != s)

955 return -ENOENT;

956 return 0;

957 }

958

959 /*

960 * create+register a new session for given mds.

961 * called under mdsc->mutex.

962 */

963 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,

964 int mds)

965 {

966 struct ceph_client *cl = mdsc->fsc->client;

967 struct ceph_mds_session *s;

968

969 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)

970 return ERR_PTR(-EIO);

971

972 if (mds >= mdsc->mdsmap->possible_max_rank)

973 return ERR_PTR(-EINVAL);

974

975 s = kzalloc(sizeof(*s), GFP_NOFS);

976 if (!s)

977 return ERR_PTR(-ENOMEM);

978

979 if (mds >= mdsc->max_sessions) {

980 int newmax = 1 << get_count_order(mds + 1);

981 struct ceph_mds_session **sa;

982 size_t ptr_size = sizeof(struct ceph_mds_session *);

983

984 doutc(cl, "realloc to %d\n", newmax);

985 sa = kcalloc(newmax, ptr_size, GFP_NOFS);

986 if (!sa)

987 goto fail_realloc;

988 if (mdsc->sessions) {

989 memcpy(sa, mdsc->sessions,

990 mdsc->max_sessions * ptr_size);

991 kfree(mdsc->sessions);

992 }

993 mdsc->sessions = sa;

994 mdsc->max_sessions = newmax;

995 }

996

997 doutc(cl, "mds%d\n", mds);

998 s->s_mdsc = mdsc;

999 s->s_mds = mds;

1000 s->s_state = CEPH_MDS_SESSION_NEW;

1001 mutex_init(&s->s_mutex);

1002

1003 ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);

1004

1005 atomic_set(&s->s_cap_gen, 1);

1006 s->s_cap_ttl = jiffies - 1;

1007

1008 spin_lock_init(&s->s_cap_lock);

1009 INIT_LIST_HEAD(&s->s_caps);

1010 refcount_set(&s->s_ref, 1);

1011 INIT_LIST_HEAD(&s->s_waiting);

1012 INIT_LIST_HEAD(&s->s_unsafe);

1013 xa_init(&s->s_delegated_inos);

1014 INIT_LIST_HEAD(&s->s_cap_releases);

1015 INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);

1016

1017 INIT_LIST_HEAD(&s->s_cap_dirty);

1018 INIT_LIST_HEAD(&s->s_cap_flushing);

1019

1020 mdsc->sessions[mds] = s;

1021 atomic_inc(&mdsc->num_sessions);

1022 refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */

1023

1024 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,

1025 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));

1026

1027 return s;

1028

1029 fail_realloc:

1030 kfree(s);

1031 return ERR_PTR(-ENOMEM);

1032 }

1033

1034 /*

1035 * called under mdsc->mutex

1036 */

1037 static void __unregister_session(struct ceph_mds_client *mdsc,

1038 struct ceph_mds_session *s)

1039 {

1040 doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s);

1041 BUG_ON(mdsc->sessions[s->s_mds] != s);

1042 mdsc->sessions[s->s_mds] = NULL;

1043 ceph_con_close(&s->s_con);

1044 ceph_put_mds_session(s);

1045 atomic_dec(&mdsc->num_sessions);

1046 }

1047

1048 /*

1049 * drop session refs in request.

1050 *

1051 * should be last request ref, or hold mdsc->mutex

1052 */

1053 static void put_request_session(struct ceph_mds_request *req)

1054 {

1055 if (req->r_session) {

1056 ceph_put_mds_session(req->r_session);

1057 req->r_session = NULL;

1058 }

1059 }

1060

1061 void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,

1062 void (*cb)(struct ceph_mds_session *),

1063 bool check_state)

1064 {

1065 int mds;

1066

1067 mutex_lock(&mdsc->mutex);

1068 for (mds = 0; mds < mdsc->max_sessions; ++mds) {

1069 struct ceph_mds_session *s;

1070

1071 s = __ceph_lookup_mds_session(mdsc, mds);

1072 if (!s)

1073 continue;

1074

1075 if (check_state && !check_session_state(s)) {

1076 ceph_put_mds_session(s);

1077 continue;

1078 }

1079

1080 mutex_unlock(&mdsc->mutex);

1081 cb(s);

1082 ceph_put_mds_session(s);

1083 mutex_lock(&mdsc->mutex);

1084 }

1085 mutex_unlock(&mdsc->mutex);

1086 }

1087

1088 void ceph_mdsc_release_request(struct kref *kref)

1089 {

1090 struct ceph_mds_request *req = container_of(kref,

1091 struct ceph_mds_request,

1092 r_kref);

1093 ceph_mdsc_release_dir_caps_async(req);

1094 destroy_reply_info(&req->r_reply_info);

1095 if (req->r_request)

1096 ceph_msg_put(req->r_request);

1097 if (req->r_reply)

1098 ceph_msg_put(req->r_reply);

1099 if (req->r_inode) {

1100 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);

1101 iput(req->r_inode);

1102 }

1103 if (req->r_parent) {

1104 ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);

1105 iput(req->r_parent);

1106 }

1107 iput(req->r_target_inode);

1108 iput(req->r_new_inode);

1109 if (req->r_dentry)

1110 dput(req->r_dentry);

1111 if (req->r_old_dentry)

1112 dput(req->r_old_dentry);

1113 if (req->r_old_dentry_dir) {

1114 /*

1115 * track (and drop pins for) r_old_dentry_dir

1116 * separately, since r_old_dentry's d_parent may have

1117 * changed between the dir mutex being dropped and

1118 * this request being freed.

1119 */

1120 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),

1121 CEPH_CAP_PIN);

1122 iput(req->r_old_dentry_dir);

1123 }

1124 kfree(req->r_path1);

1125 kfree(req->r_path2);

1126 put_cred(req->r_cred);

1127 if (req->r_mnt_idmap)

1128 mnt_idmap_put(req->r_mnt_idmap);

1129 if (req->r_pagelist)

1130 ceph_pagelist_release(req->r_pagelist);

1131 kfree(req->r_fscrypt_auth);

1132 kfree(req->r_altname);

1133 put_request_session(req);

1134 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);

1135 WARN_ON_ONCE(!list_empty(&req->r_wait));

1136 kmem_cache_free(ceph_mds_request_cachep, req);

1137 }

1138

1139 DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)

1140

1141 /*

1142 * lookup session, bump ref if found.

1143 *

1144 * called under mdsc->mutex.

1145 */

1146 static struct ceph_mds_request *

1147 lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)

1148 {

1149 struct ceph_mds_request *req;

1150

1151 req = lookup_request(&mdsc->request_tree, tid);

1152 if (req)

1153 ceph_mdsc_get_request(req);

1154

1155 return req;

1156 }

1157

1158 /*

1159 * Register an in-flight request, and assign a tid. Link to directory

1160 * are modifying (if any).

1161 *

1162 * Called under mdsc->mutex.

1163 */

1164 static void __register_request(struct ceph_mds_client *mdsc,

1165 struct ceph_mds_request *req,

1166 struct inode *dir)

1167 {

1168 struct ceph_client *cl = mdsc->fsc->client;

1169 int ret = 0;

1170

1171 req->r_tid = ++mdsc->last_tid;

1172 if (req->r_num_caps) {

1173 ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,

1174 req->r_num_caps);

1175 if (ret < 0) {

1176 pr_err_client(cl, "%p failed to reserve caps: %d\n",

1177 req, ret);

1178 /* set req->r_err to fail early from __do_request */

1179 req->r_err = ret;

1180 return;

1181 }

1182 }

1183 doutc(cl, "%p tid %lld\n", req, req->r_tid);

1184 ceph_mdsc_get_request(req);

1185 insert_request(&mdsc->request_tree, req);

1186

1187 req->r_cred = get_current_cred();

1188 if (!req->r_mnt_idmap)

1189 req->r_mnt_idmap = &nop_mnt_idmap;

1190

1191 if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)

1192 mdsc->oldest_tid = req->r_tid;

1193

1194 if (dir) {

1195 struct ceph_inode_info *ci = ceph_inode(dir);

1196

1197 ihold(dir);

1198 req->r_unsafe_dir = dir;

1199 spin_lock(&ci->i_unsafe_lock);

1200 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);

1201 spin_unlock(&ci->i_unsafe_lock);

1202 }

1203 }

1204

1205 static void __unregister_request(struct ceph_mds_client *mdsc,

1206 struct ceph_mds_request *req)

1207 {

1208 doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid);

1209

1210 /* Never leave an unregistered request on an unsafe list! */

1211 list_del_init(&req->r_unsafe_item);

1212

1213 if (req->r_tid == mdsc->oldest_tid) {

1214 struct rb_node *p = rb_next(&req->r_node);

1215 mdsc->oldest_tid = 0;

1216 while (p) {

1217 struct ceph_mds_request *next_req =

1218 rb_entry(p, struct ceph_mds_request, r_node);

1219 if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {

1220 mdsc->oldest_tid = next_req->r_tid;

1221 break;

1222 }

1223 p = rb_next(p);

1224 }

1225 }

1226

1227 erase_request(&mdsc->request_tree, req);

1228

1229 if (req->r_unsafe_dir) {

1230 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);

1231 spin_lock(&ci->i_unsafe_lock);

1232 list_del_init(&req->r_unsafe_dir_item);

1233 spin_unlock(&ci->i_unsafe_lock);

1234 }

1235 if (req->r_target_inode &&

1236 test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {

1237 struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);

1238 spin_lock(&ci->i_unsafe_lock);

1239 list_del_init(&req->r_unsafe_target_item);

1240 spin_unlock(&ci->i_unsafe_lock);

1241 }

1242

1243 if (req->r_unsafe_dir) {

1244 iput(req->r_unsafe_dir);

1245 req->r_unsafe_dir = NULL;

1246 }

1247

1248 complete_all(&req->r_safe_completion);

1249

1250 ceph_mdsc_put_request(req);

1251 }

1252

1253 /*

1254 * Walk back up the dentry tree until we hit a dentry representing a

1255 * non-snapshot inode. We do this using the rcu_read_lock (which must be held

1256 * when calling this) to ensure that the objects won't disappear while we're

1257 * working with them. Once we hit a candidate dentry, we attempt to take a

1258 * reference to it, and return that as the result.

1259 */

1260 static struct inode *get_nonsnap_parent(struct dentry *dentry)

1261 {

1262 struct inode *inode = NULL;

1263

1264 while (dentry && !IS_ROOT(dentry)) {

1265 inode = d_inode_rcu(dentry);

1266 if (!inode || ceph_snap(inode) == CEPH_NOSNAP)

1267 break;

1268 dentry = dentry->d_parent;

1269 }

1270 if (inode)

1271 inode = igrab(inode);

1272 return inode;

1273 }

1274

1275 /*

1276 * Choose mds to send request to next. If there is a hint set in the

1277 * request (e.g., due to a prior forward hint from the mds), use that.

1278 * Otherwise, consult frag tree and/or caps to identify the

1279 * appropriate mds. If all else fails, choose randomly.

1280 *

1281 * Called under mdsc->mutex.

1282 */

1283 static int __choose_mds(struct ceph_mds_client *mdsc,

1284 struct ceph_mds_request *req,

1285 bool *random)

1286 {

1287 struct inode *inode;

1288 struct ceph_inode_info *ci;

1289 struct ceph_cap *cap;

1290 int mode = req->r_direct_mode;

1291 int mds = -1;

1292 u32 hash = req->r_direct_hash;

1293 bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);

1294 struct ceph_client *cl = mdsc->fsc->client;

1295

1296 if (random)

1297 *random = false;

1298

1299 /*

1300 * is there a specific mds we should try? ignore hint if we have

1301 * no session and the mds is not up (active or recovering).

1302 */

1303 if (req->r_resend_mds >= 0 &&

1304 (__have_session(mdsc, req->r_resend_mds) ||

1305 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {

1306 doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds);

1307 return req->r_resend_mds;

1308 }

1309

1310 if (mode == USE_RANDOM_MDS)

1311 goto random;

1312

1313 inode = NULL;

1314 if (req->r_inode) {

1315 if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {

1316 inode = req->r_inode;

1317 ihold(inode);

1318 } else {

1319 /* req->r_dentry is non-null for LSSNAP request */

1320 rcu_read_lock();

1321 inode = get_nonsnap_parent(req->r_dentry);

1322 rcu_read_unlock();

1323 doutc(cl, "using snapdir's parent %p %llx.%llx\n",

1324 inode, ceph_vinop(inode));

1325 }

1326 } else if (req->r_dentry) {

1327 /* ignore race with rename; old or new d_parent is okay */

1328 struct dentry *parent;

1329 struct inode *dir;

1330

1331 rcu_read_lock();

1332 parent = READ_ONCE(req->r_dentry->d_parent);

1333 dir = req->r_parent ? : d_inode_rcu(parent);

1334

1335 if (!dir || dir->i_sb != mdsc->fsc->sb) {

1336 /* not this fs or parent went negative */

1337 inode = d_inode(req->r_dentry);

1338 if (inode)

1339 ihold(inode);

1340 } else if (ceph_snap(dir) != CEPH_NOSNAP) {

1341 /* direct snapped/virtual snapdir requests

1342 * based on parent dir inode */

1343 inode = get_nonsnap_parent(parent);

1344 doutc(cl, "using nonsnap parent %p %llx.%llx\n",

1345 inode, ceph_vinop(inode));

1346 } else {

1347 /* dentry target */

1348 inode = d_inode(req->r_dentry);

1349 if (!inode || mode == USE_AUTH_MDS) {

1350 /* dir + name */

1351 inode = igrab(dir);

1352 hash = ceph_dentry_hash(dir, req->r_dentry);

1353 is_hash = true;

1354 } else {

1355 ihold(inode);

1356 }

1357 }

1358 rcu_read_unlock();

1359 }

1360

1361 if (!inode)

1362 goto random;

1363

1364 doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode,

1365 ceph_vinop(inode), (int)is_hash, hash, mode);

1366 ci = ceph_inode(inode);

1367

1368 if (is_hash && S_ISDIR(inode->i_mode)) {

1369 struct ceph_inode_frag frag;

1370 int found;

1371

1372 ceph_choose_frag(ci, hash, &frag, &found);

1373 if (found) {

1374 if (mode == USE_ANY_MDS && frag.ndist > 0) {

1375 u8 r;

1376

1377 /* choose a random replica */

1378 get_random_bytes(&r, 1);

1379 r %= frag.ndist;

1380 mds = frag.dist[r];

1381 doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n",

1382 inode, ceph_vinop(inode), frag.frag,

1383 mds, (int)r, frag.ndist);

1384 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=

1385 CEPH_MDS_STATE_ACTIVE &&

1386 !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))

1387 goto out;

1388 }

1389

1390 /* since this file/dir wasn't known to be

1391 * replicated, then we want to look for the

1392 * authoritative mds. */

1393 if (frag.mds >= 0) {

1394 /* choose auth mds */

1395 mds = frag.mds;

1396 doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n",

1397 inode, ceph_vinop(inode), frag.frag, mds);

1398 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=

1399 CEPH_MDS_STATE_ACTIVE) {

1400 if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,

1401 mds))

1402 goto out;

1403 }

1404 }

1405 mode = USE_AUTH_MDS;

1406 }

1407 }

1408

1409 spin_lock(&ci->i_ceph_lock);

1410 cap = NULL;

1411 if (mode == USE_AUTH_MDS)

1412 cap = ci->i_auth_cap;

1413 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))

1414 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);

1415 if (!cap) {

1416 spin_unlock(&ci->i_ceph_lock);

1417 iput(inode);

1418 goto random;

1419 }

1420 mds = cap->session->s_mds;

1421 doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode,

1422 ceph_vinop(inode), mds,

1423 cap == ci->i_auth_cap ? "auth " : "", cap);

1424 spin_unlock(&ci->i_ceph_lock);

1425 out:

1426 iput(inode);

1427 return mds;

1428

1429 random:

1430 if (random)

1431 *random = true;

1432

1433 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);

1434 doutc(cl, "chose random mds%d\n", mds);

1435 return mds;

1436 }

1437

1438

1439 /*

1440 * session messages

1441 */

1442 struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)

1443 {

1444 struct ceph_msg *msg;

1445 struct ceph_mds_session_head *h;

1446

1447 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,

1448 false);

1449 if (!msg) {

1450 pr_err("ENOMEM creating session %s msg\n",

1451 ceph_session_op_name(op));

1452 return NULL;

1453 }

1454 h = msg->front.iov_base;

1455 h->op = cpu_to_le32(op);

1456 h->seq = cpu_to_le64(seq);

1457

1458 return msg;

1459 }

1460

1461 static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;

1462 #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)

1463 static int encode_supported_features(void **p, void *end)

1464 {

1465 static const size_t count = ARRAY_SIZE(feature_bits);

1466

1467 if (count > 0) {

1468 size_t i;

1469 size_t size = FEATURE_BYTES(count);

1470 unsigned long bit;

1471

1472 if (WARN_ON_ONCE(*p + 4 + size > end))

1473 return -ERANGE;

1474

1475 ceph_encode_32(p, size);

1476 memset(*p, 0, size);

1477 for (i = 0; i < count; i++) {

1478 bit = feature_bits[i];

1479 ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);

1480 }

1481 *p += size;

1482 } else {

1483 if (WARN_ON_ONCE(*p + 4 > end))

1484 return -ERANGE;

1485

1486 ceph_encode_32(p, 0);

1487 }

1488

1489 return 0;

1490 }

1491

1492 static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;

1493 #define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)

1494 static int encode_metric_spec(void **p, void *end)

1495 {

1496 static const size_t count = ARRAY_SIZE(metric_bits);

1497

1498 /* header */

1499 if (WARN_ON_ONCE(*p + 2 > end))

1500 return -ERANGE;

1501

1502 ceph_encode_8(p, 1); /* version */

1503 ceph_encode_8(p, 1); /* compat */

1504

1505 if (count > 0) {

1506 size_t i;

1507 size_t size = METRIC_BYTES(count);

1508

1509 if (WARN_ON_ONCE(*p + 4 + 4 + size > end))

1510 return -ERANGE;

1511

1512 /* metric spec info length */

1513 ceph_encode_32(p, 4 + size);

1514

1515 /* metric spec */

1516 ceph_encode_32(p, size);

1517 memset(*p, 0, size);

1518 for (i = 0; i < count; i++)

1519 ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);

1520 *p += size;

1521 } else {

1522 if (WARN_ON_ONCE(*p + 4 + 4 > end))

1523 return -ERANGE;

1524

1525 /* metric spec info length */

1526 ceph_encode_32(p, 4);

1527 /* metric spec */

1528 ceph_encode_32(p, 0);

1529 }

1530

1531 return 0;

1532 }

1533

1534 /*

1535 * session message, specialization for CEPH_SESSION_REQUEST_OPEN

1536 * to include additional client metadata fields.

1537 */

1538 static struct ceph_msg *

1539 create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq)

1540 {

1541 struct ceph_msg *msg;

1542 struct ceph_mds_session_head *h;

1543 int i;

1544 int extra_bytes = 0;

1545 int metadata_key_count = 0;

1546 struct ceph_options *opt = mdsc->fsc->client->options;

1547 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;

1548 struct ceph_client *cl = mdsc->fsc->client;

1549 size_t size, count;

1550 void *p, *end;

1551 int ret;

1552

1553 const char* metadata[][2] = {

1554 {"hostname", mdsc->nodename},

1555 {"kernel_version", init_utsname()->release},

1556 {"entity_id", opt->name ? : ""},

1557 {"root", fsopt->server_path ? : "/"},

1558 {NULL, NULL}

1559 };

1560

1561 /* Calculate serialized length of metadata */

1562 extra_bytes = 4; /* map length */

1563 for (i = 0; metadata[i][0]; ++i) {

1564 extra_bytes += 8 + strlen(metadata[i][0]) +

1565 strlen(metadata[i][1]);

1566 metadata_key_count++;

1567 }

1568

1569 /* supported feature */

1570 size = 0;

1571 count = ARRAY_SIZE(feature_bits);

1572 if (count > 0)

1573 size = FEATURE_BYTES(count);

1574 extra_bytes += 4 + size;

1575

1576 /* metric spec */

1577 size = 0;

1578 count = ARRAY_SIZE(metric_bits);

1579 if (count > 0)

1580 size = METRIC_BYTES(count);

1581 extra_bytes += 2 + 4 + 4 + size;

1582

1583 /* flags, mds auth caps and oldest_client_tid */

1584 extra_bytes += 4 + 4 + 8;

1585

1586 /* Allocate the message */

1587 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,

1588 GFP_NOFS, false);

1589 if (!msg) {

1590 pr_err_client(cl, "ENOMEM creating session open msg\n");

1591 return ERR_PTR(-ENOMEM);

1592 }

1593 p = msg->front.iov_base;

1594 end = p + msg->front.iov_len;

1595

1596 h = p;

1597 h->op = cpu_to_le32(op);

1598 h->seq = cpu_to_le64(seq);

1599

1600 /*

1601 * Serialize client metadata into waiting buffer space, using

1602 * the format that userspace expects for map<string, string>

1603 *

1604 * ClientSession messages with metadata are v7

1605 */

1606 msg->hdr.version = cpu_to_le16(7);

1607 msg->hdr.compat_version = cpu_to_le16(1);

1608

1609 /* The write pointer, following the session_head structure */

1610 p += sizeof(*h);

1611

1612 /* Number of entries in the map */

1613 ceph_encode_32(&p, metadata_key_count);

1614

1615 /* Two length-prefixed strings for each entry in the map */

1616 for (i = 0; metadata[i][0]; ++i) {

1617 size_t const key_len = strlen(metadata[i][0]);

1618 size_t const val_len = strlen(metadata[i][1]);

1619

1620 ceph_encode_32(&p, key_len);

1621 memcpy(p, metadata[i][0], key_len);

1622 p += key_len;

1623 ceph_encode_32(&p, val_len);

1624 memcpy(p, metadata[i][1], val_len);

1625 p += val_len;

1626 }

1627

1628 ret = encode_supported_features(&p, end);

1629 if (ret) {

1630 pr_err_client(cl, "encode_supported_features failed!\n");

1631 ceph_msg_put(msg);

1632 return ERR_PTR(ret);

1633 }

1634

1635 ret = encode_metric_spec(&p, end);

1636 if (ret) {

1637 pr_err_client(cl, "encode_metric_spec failed!\n");

1638 ceph_msg_put(msg);

1639 return ERR_PTR(ret);

1640 }

1641

1642 /* version == 5, flags */

1643 ceph_encode_32(&p, 0);

1644

1645 /* version == 6, mds auth caps */

1646 ceph_encode_32(&p, 0);

1647

1648 /* version == 7, oldest_client_tid */

1649 ceph_encode_64(&p, mdsc->oldest_tid);

1650

1651 msg->front.iov_len = p - msg->front.iov_base;

1652 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

1653

1654 return msg;

1655 }

1656

1657 /*

1658 * send session open request.

1659 *

1660 * called under mdsc->mutex

1661 */

1662 static int __open_session(struct ceph_mds_client *mdsc,

1663 struct ceph_mds_session *session)

1664 {

1665 struct ceph_msg *msg;

1666 int mstate;

1667 int mds = session->s_mds;

1668

1669 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)

1670 return -EIO;

1671

1672 /* wait for mds to go active? */

1673 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);

1674 doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds,

1675 ceph_mds_state_name(mstate));

1676 session->s_state = CEPH_MDS_SESSION_OPENING;

1677 session->s_renew_requested = jiffies;

1678

1679 /* send connect message */

1680 msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN,

1681 session->s_seq);

1682 if (IS_ERR(msg))

1683 return PTR_ERR(msg);

1684 ceph_con_send(&session->s_con, msg);

1685 return 0;

1686 }

1687

1688 /*

1689 * open sessions for any export targets for the given mds

1690 *

1691 * called under mdsc->mutex

1692 */

1693 static struct ceph_mds_session *

1694 __open_export_target_session(struct ceph_mds_client *mdsc, int target)

1695 {

1696 struct ceph_mds_session *session;

1697 int ret;

1698

1699 session = __ceph_lookup_mds_session(mdsc, target);

1700 if (!session) {

1701 session = register_session(mdsc, target);

1702 if (IS_ERR(session))

1703 return session;

1704 }

1705 if (session->s_state == CEPH_MDS_SESSION_NEW ||

1706 session->s_state == CEPH_MDS_SESSION_CLOSING) {

1707 ret = __open_session(mdsc, session);

1708 if (ret)

1709 return ERR_PTR(ret);

1710 }

1711

1712 return session;

1713 }

1714

1715 struct ceph_mds_session *

1716 ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)

1717 {

1718 struct ceph_mds_session *session;

1719 struct ceph_client *cl = mdsc->fsc->client;

1720

1721 doutc(cl, "to mds%d\n", target);

1722

1723 mutex_lock(&mdsc->mutex);

1724 session = __open_export_target_session(mdsc, target);

1725 mutex_unlock(&mdsc->mutex);

1726

1727 return session;

1728 }

1729

1730 static void __open_export_target_sessions(struct ceph_mds_client *mdsc,

1731 struct ceph_mds_session *session)

1732 {

1733 struct ceph_mds_info *mi;

1734 struct ceph_mds_session *ts;

1735 int i, mds = session->s_mds;

1736 struct ceph_client *cl = mdsc->fsc->client;

1737

1738 if (mds >= mdsc->mdsmap->possible_max_rank)

1739 return;

1740

1741 mi = &mdsc->mdsmap->m_info[mds];

1742 doutc(cl, "for mds%d (%d targets)\n", session->s_mds,

1743 mi->num_export_targets);

1744

1745 for (i = 0; i < mi->num_export_targets; i++) {

1746 ts = __open_export_target_session(mdsc, mi->export_targets[i]);

1747 ceph_put_mds_session(ts);

1748 }

1749 }

1750

1751 /*

1752 * session caps

1753 */

1754

1755 static void detach_cap_releases(struct ceph_mds_session *session,

1756 struct list_head *target)

1757 {

1758 struct ceph_client *cl = session->s_mdsc->fsc->client;

1759

1760 lockdep_assert_held(&session->s_cap_lock);

1761

1762 list_splice_init(&session->s_cap_releases, target);

1763 session->s_num_cap_releases = 0;

1764 doutc(cl, "mds%d\n", session->s_mds);

1765 }

1766

1767 static void dispose_cap_releases(struct ceph_mds_client *mdsc,

1768 struct list_head *dispose)

1769 {

1770 while (!list_empty(dispose)) {

1771 struct ceph_cap *cap;

1772 /* zero out the in-progress message */

1773 cap = list_first_entry(dispose, struct ceph_cap, session_caps);

1774 list_del(&cap->session_caps);

1775 ceph_put_cap(mdsc, cap);

1776 }

1777 }

1778

1779 static void cleanup_session_requests(struct ceph_mds_client *mdsc,

1780 struct ceph_mds_session *session)

1781 {

1782 struct ceph_client *cl = mdsc->fsc->client;

1783 struct ceph_mds_request *req;

1784 struct rb_node *p;

1785

1786 doutc(cl, "mds%d\n", session->s_mds);

1787 mutex_lock(&mdsc->mutex);

1788 while (!list_empty(&session->s_unsafe)) {

1789 req = list_first_entry(&session->s_unsafe,

1790 struct ceph_mds_request, r_unsafe_item);

1791 pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n",

1792 req->r_tid);

1793 if (req->r_target_inode)

1794 mapping_set_error(req->r_target_inode->i_mapping, -EIO);

1795 if (req->r_unsafe_dir)

1796 mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);

1797 __unregister_request(mdsc, req);

1798 }

1799 /* zero r_attempts, so kick_requests() will re-send requests */

1800 p = rb_first(&mdsc->request_tree);

1801 while (p) {

1802 req = rb_entry(p, struct ceph_mds_request, r_node);

1803 p = rb_next(p);

1804 if (req->r_session &&

1805 req->r_session->s_mds == session->s_mds)

1806 req->r_attempts = 0;

1807 }

1808 mutex_unlock(&mdsc->mutex);

1809 }

1810

1811 /*

1812 * Helper to safely iterate over all caps associated with a session, with

1813 * special care taken to handle a racing __ceph_remove_cap().

1814 *

1815 * Caller must hold session s_mutex.

1816 */

1817 int ceph_iterate_session_caps(struct ceph_mds_session *session,

1818 int (*cb)(struct inode *, int mds, void *),

1819 void *arg)

1820 {

1821 struct ceph_client *cl = session->s_mdsc->fsc->client;

1822 struct list_head *p;

1823 struct ceph_cap *cap;

1824 struct inode *inode, *last_inode = NULL;

1825 struct ceph_cap *old_cap = NULL;

1826 int ret;

1827

1828 doutc(cl, "%p mds%d\n", session, session->s_mds);

1829 spin_lock(&session->s_cap_lock);

1830 p = session->s_caps.next;

1831 while (p != &session->s_caps) {

1832 int mds;

1833

1834 cap = list_entry(p, struct ceph_cap, session_caps);

1835 inode = igrab(&cap->ci->netfs.inode);

1836 if (!inode) {

1837 p = p->next;

1838 continue;

1839 }

1840 session->s_cap_iterator = cap;

1841 mds = cap->mds;

1842 spin_unlock(&session->s_cap_lock);

1843

1844 if (last_inode) {

1845 iput(last_inode);

1846 last_inode = NULL;

1847 }

1848 if (old_cap) {

1849 ceph_put_cap(session->s_mdsc, old_cap);

1850 old_cap = NULL;

1851 }

1852

1853 ret = cb(inode, mds, arg);

1854 last_inode = inode;

1855

1856 spin_lock(&session->s_cap_lock);

1857 p = p->next;

1858 if (!cap->ci) {

1859 doutc(cl, "finishing cap %p removal\n", cap);

1860 BUG_ON(cap->session != session);

1861 cap->session = NULL;

1862 list_del_init(&cap->session_caps);

1863 session->s_nr_caps--;

1864 atomic64_dec(&session->s_mdsc->metric.total_caps);

1865 if (cap->queue_release)

1866 __ceph_queue_cap_release(session, cap);

1867 else

1868 old_cap = cap; /* put_cap it w/o locks held */

1869 }

1870 if (ret < 0)

1871 goto out;

1872 }

1873 ret = 0;

1874 out:

1875 session->s_cap_iterator = NULL;

1876 spin_unlock(&session->s_cap_lock);

1877

1878 iput(last_inode);

1879 if (old_cap)

1880 ceph_put_cap(session->s_mdsc, old_cap);

1881

1882 return ret;

1883 }

1884

1885 static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)

1886 {

1887 struct ceph_inode_info *ci = ceph_inode(inode);

1888 struct ceph_client *cl = ceph_inode_to_client(inode);

1889 bool invalidate = false;

1890 struct ceph_cap *cap;

1891 int iputs = 0;

1892

1893 spin_lock(&ci->i_ceph_lock);

1894 cap = __get_cap_for_mds(ci, mds);

1895 if (cap) {

1896 doutc(cl, " removing cap %p, ci is %p, inode is %p\n",

1897 cap, ci, &ci->netfs.inode);

1898

1899 iputs = ceph_purge_inode_cap(inode, cap, &invalidate);

1900 }

1901 spin_unlock(&ci->i_ceph_lock);

1902

1903 if (cap)

1904 wake_up_all(&ci->i_cap_wq);

1905 if (invalidate)

1906 ceph_queue_invalidate(inode);

1907 while (iputs--)

1908 iput(inode);

1909 return 0;

1910 }

1911

1912 /*

1913 * caller must hold session s_mutex

1914 */

1915 static void remove_session_caps(struct ceph_mds_session *session)

1916 {

1917 struct ceph_fs_client *fsc = session->s_mdsc->fsc;

1918 struct super_block *sb = fsc->sb;

1919 LIST_HEAD(dispose);

1920

1921 doutc(fsc->client, "on %p\n", session);

1922 ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);

1923

1924 wake_up_all(&fsc->mdsc->cap_flushing_wq);

1925

1926 spin_lock(&session->s_cap_lock);

1927 if (session->s_nr_caps > 0) {

1928 struct inode *inode;

1929 struct ceph_cap *cap, *prev = NULL;

1930 struct ceph_vino vino;

1931 /*

1932 * iterate_session_caps() skips inodes that are being

1933 * deleted, we need to wait until deletions are complete.

1934 * __wait_on_freeing_inode() is designed for the job,

1935 * but it is not exported, so use lookup inode function

1936 * to access it.

1937 */

1938 while (!list_empty(&session->s_caps)) {

1939 cap = list_entry(session->s_caps.next,

1940 struct ceph_cap, session_caps);

1941 if (cap == prev)

1942 break;

1943 prev = cap;

1944 vino = cap->ci->i_vino;

1945 spin_unlock(&session->s_cap_lock);

1946

1947 inode = ceph_find_inode(sb, vino);

1948 iput(inode);

1949

1950 spin_lock(&session->s_cap_lock);

1951 }

1952 }

1953

1954 // drop cap expires and unlock s_cap_lock

1955 detach_cap_releases(session, &dispose);

1956

1957 BUG_ON(session->s_nr_caps > 0);

1958 BUG_ON(!list_empty(&session->s_cap_flushing));

1959 spin_unlock(&session->s_cap_lock);

1960 dispose_cap_releases(session->s_mdsc, &dispose);

1961 }

1962

1963 enum {

1964 RECONNECT,

1965 RENEWCAPS,

1966 FORCE_RO,

1967 };

1968

1969 /*

1970 * wake up any threads waiting on this session's caps. if the cap is

1971 * old (didn't get renewed on the client reconnect), remove it now.

1972 *

1973 * caller must hold s_mutex.

1974 */

1975 static int wake_up_session_cb(struct inode *inode, int mds, void *arg)

1976 {

1977 struct ceph_inode_info *ci = ceph_inode(inode);

1978 unsigned long ev = (unsigned long)arg;

1979

1980 if (ev == RECONNECT) {

1981 spin_lock(&ci->i_ceph_lock);

1982 ci->i_wanted_max_size = 0;

1983 ci->i_requested_max_size = 0;

1984 spin_unlock(&ci->i_ceph_lock);

1985 } else if (ev == RENEWCAPS) {

1986 struct ceph_cap *cap;

1987

1988 spin_lock(&ci->i_ceph_lock);

1989 cap = __get_cap_for_mds(ci, mds);

1990 /* mds did not re-issue stale cap */

1991 if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))

1992 cap->issued = cap->implemented = CEPH_CAP_PIN;

1993 spin_unlock(&ci->i_ceph_lock);

1994 } else if (ev == FORCE_RO) {

1995 }

1996 wake_up_all(&ci->i_cap_wq);

1997 return 0;

1998 }

1999

2000 static void wake_up_session_caps(struct ceph_mds_session *session, int ev)

2001 {

2002 struct ceph_client *cl = session->s_mdsc->fsc->client;

2003

2004 doutc(cl, "session %p mds%d\n", session, session->s_mds);

2005 ceph_iterate_session_caps(session, wake_up_session_cb,

2006 (void *)(unsigned long)ev);

2007 }

2008

2009 /*

2010 * Send periodic message to MDS renewing all currently held caps. The

2011 * ack will reset the expiration for all caps from this session.

2012 *

2013 * caller holds s_mutex

2014 */

2015 static int send_renew_caps(struct ceph_mds_client *mdsc,

2016 struct ceph_mds_session *session)

2017 {

2018 struct ceph_client *cl = mdsc->fsc->client;

2019 struct ceph_msg *msg;

2020 int state;

2021

2022 if (time_after_eq(jiffies, session->s_cap_ttl) &&

2023 time_after_eq(session->s_cap_ttl, session->s_renew_requested))

2024 pr_info_client(cl, "mds%d caps stale\n", session->s_mds);

2025 session->s_renew_requested = jiffies;

2026

2027 /* do not try to renew caps until a recovering mds has reconnected

2028 * with its clients. */

2029 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);

2030 if (state < CEPH_MDS_STATE_RECONNECT) {

2031 doutc(cl, "ignoring mds%d (%s)\n", session->s_mds,

2032 ceph_mds_state_name(state));

2033 return 0;

2034 }

2035

2036 doutc(cl, "to mds%d (%s)\n", session->s_mds,

2037 ceph_mds_state_name(state));

2038 msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS,

2039 ++session->s_renew_seq);

2040 if (IS_ERR(msg))

2041 return PTR_ERR(msg);

2042 ceph_con_send(&session->s_con, msg);

2043 return 0;

2044 }

2045

2046 static int send_flushmsg_ack(struct ceph_mds_client *mdsc,

2047 struct ceph_mds_session *session, u64 seq)

2048 {

2049 struct ceph_client *cl = mdsc->fsc->client;

2050 struct ceph_msg *msg;

2051

2052 doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds,

2053 ceph_session_state_name(session->s_state), seq);

2054 msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);

2055 if (!msg)

2056 return -ENOMEM;

2057 ceph_con_send(&session->s_con, msg);

2058 return 0;

2059 }

2060

2061

2062 /*

2063 * Note new cap ttl, and any transition from stale -> not stale (fresh?).

2064 *

2065 * Called under session->s_mutex

2066 */

2067 static void renewed_caps(struct ceph_mds_client *mdsc,

2068 struct ceph_mds_session *session, int is_renew)

2069 {

2070 struct ceph_client *cl = mdsc->fsc->client;

2071 int was_stale;

2072 int wake = 0;

2073

2074 spin_lock(&session->s_cap_lock);

2075 was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);

2076

2077 session->s_cap_ttl = session->s_renew_requested +

2078 mdsc->mdsmap->m_session_timeout*HZ;

2079

2080 if (was_stale) {

2081 if (time_before(jiffies, session->s_cap_ttl)) {

2082 pr_info_client(cl, "mds%d caps renewed\n",

2083 session->s_mds);

2084 wake = 1;

2085 } else {

2086 pr_info_client(cl, "mds%d caps still stale\n",

2087 session->s_mds);

2088 }

2089 }

2090 doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds,

2091 session->s_cap_ttl, was_stale ? "stale" : "fresh",

2092 time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");

2093 spin_unlock(&session->s_cap_lock);

2094

2095 if (wake)

2096 wake_up_session_caps(session, RENEWCAPS);

2097 }

2098

2099 /*

2100 * send a session close request

2101 */

2102 static int request_close_session(struct ceph_mds_session *session)

2103 {

2104 struct ceph_client *cl = session->s_mdsc->fsc->client;

2105 struct ceph_msg *msg;

2106

2107 doutc(cl, "mds%d state %s seq %lld\n", session->s_mds,

2108 ceph_session_state_name(session->s_state), session->s_seq);

2109 msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,

2110 session->s_seq);

2111 if (!msg)

2112 return -ENOMEM;

2113 ceph_con_send(&session->s_con, msg);

2114 return 1;

2115 }

2116

2117 /*

2118 * Called with s_mutex held.

2119 */

2120 static int __close_session(struct ceph_mds_client *mdsc,

2121 struct ceph_mds_session *session)

2122 {

2123 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)

2124 return 0;

2125 session->s_state = CEPH_MDS_SESSION_CLOSING;

2126 return request_close_session(session);

2127 }

2128

2129 static bool drop_negative_children(struct dentry *dentry)

2130 {

2131 struct dentry *child;

2132 bool all_negative = true;

2133

2134 if (!d_is_dir(dentry))

2135 goto out;

2136

2137 spin_lock(&dentry->d_lock);

2138 hlist_for_each_entry(child, &dentry->d_children, d_sib) {

2139 if (d_really_is_positive(child)) {

2140 all_negative = false;

2141 break;

2142 }

2143 }

2144 spin_unlock(&dentry->d_lock);

2145

2146 if (all_negative)

2147 shrink_dcache_parent(dentry);

2148 out:

2149 return all_negative;

2150 }

2151

2152 /*

2153 * Trim old(er) caps.

2154 *

2155 * Because we can't cache an inode without one or more caps, we do

2156 * this indirectly: if a cap is unused, we prune its aliases, at which

2157 * point the inode will hopefully get dropped to.

2158 *

2159 * Yes, this is a bit sloppy. Our only real goal here is to respond to

2160 * memory pressure from the MDS, though, so it needn't be perfect.

2161 */

2162 static int trim_caps_cb(struct inode *inode, int mds, void *arg)

2163 {

2164 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);

2165 struct ceph_client *cl = mdsc->fsc->client;

2166 int *remaining = arg;

2167 struct ceph_inode_info *ci = ceph_inode(inode);

2168 int used, wanted, oissued, mine;

2169 struct ceph_cap *cap;

2170

2171 if (*remaining <= 0)

2172 return -1;

2173

2174 spin_lock(&ci->i_ceph_lock);

2175 cap = __get_cap_for_mds(ci, mds);

2176 if (!cap) {

2177 spin_unlock(&ci->i_ceph_lock);

2178 return 0;

2179 }

2180 mine = cap->issued | cap->implemented;

2181 used = __ceph_caps_used(ci);

2182 wanted = __ceph_caps_file_wanted(ci);

2183 oissued = __ceph_caps_issued_other(ci, cap);

2184

2185 doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n",

2186 inode, ceph_vinop(inode), cap, ceph_cap_string(mine),

2187 ceph_cap_string(oissued), ceph_cap_string(used),

2188 ceph_cap_string(wanted));

2189 if (cap == ci->i_auth_cap) {

2190 if (ci->i_dirty_caps || ci->i_flushing_caps ||

2191 !list_empty(&ci->i_cap_snaps))

2192 goto out;

2193 if ((used | wanted) & CEPH_CAP_ANY_WR)

2194 goto out;

2195 /* Note: it's possible that i_filelock_ref becomes non-zero

2196 * after dropping auth caps. It doesn't hurt because reply

2197 * of lock mds request will re-add auth caps. */

2198 if (atomic_read(&ci->i_filelock_ref) > 0)

2199 goto out;

2200 }

2201 /* The inode has cached pages, but it's no longer used.

2202 * we can safely drop it */

2203 if (S_ISREG(inode->i_mode) &&

2204 wanted == 0 && used == CEPH_CAP_FILE_CACHE &&

2205 !(oissued & CEPH_CAP_FILE_CACHE)) {

2206 used = 0;

2207 oissued = 0;

2208 }

2209 if ((used | wanted) & ~oissued & mine)

2210 goto out; /* we need these caps */

2211

2212 if (oissued) {

2213 /* we aren't the only cap.. just remove us */

2214 ceph_remove_cap(mdsc, cap, true);

2215 (*remaining)--;

2216 } else {

2217 struct dentry *dentry;

2218 /* try dropping referring dentries */

2219 spin_unlock(&ci->i_ceph_lock);

2220 dentry = d_find_any_alias(inode);

2221 if (dentry && drop_negative_children(dentry)) {

2222 int count;

2223 dput(dentry);

2224 d_prune_aliases(inode);

2225 count = icount_read(inode);

2226 if (count == 1)

2227 (*remaining)--;

2228 doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",

2229 inode, ceph_vinop(inode), cap, count);

2230 } else {

2231 dput(dentry);

2232 }

2233 return 0;

2234 }

2235

2236 out:

2237 spin_unlock(&ci->i_ceph_lock);

2238 return 0;

2239 }

2240

2241 /*

2242 * Trim session cap count down to some max number.

2243 */

2244 int ceph_trim_caps(struct ceph_mds_client *mdsc,

2245 struct ceph_mds_session *session,

2246 int max_caps)

2247 {

2248 struct ceph_client *cl = mdsc->fsc->client;

2249 int trim_caps = session->s_nr_caps - max_caps;

2250

2251 doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds,

2252 session->s_nr_caps, max_caps, trim_caps);

2253 if (trim_caps > 0) {

2254 int remaining = trim_caps;

2255

2256 ceph_iterate_session_caps(session, trim_caps_cb, &remaining);

2257 doutc(cl, "mds%d done: %d / %d, trimmed %d\n",

2258 session->s_mds, session->s_nr_caps, max_caps,

2259 trim_caps - remaining);

2260 }

2261

2262 ceph_flush_session_cap_releases(mdsc, session);

2263 return 0;

2264 }

2265

2266 static int check_caps_flush(struct ceph_mds_client *mdsc,

2267 u64 want_flush_tid)

2268 {

2269 struct ceph_client *cl = mdsc->fsc->client;

2270 int ret = 1;

2271

2272 spin_lock(&mdsc->cap_dirty_lock);

2273 if (!list_empty(&mdsc->cap_flush_list)) {

2274 struct ceph_cap_flush *cf =

2275 list_first_entry(&mdsc->cap_flush_list,

2276 struct ceph_cap_flush, g_list);

2277 if (cf->tid <= want_flush_tid) {

2278 doutc(cl, "still flushing tid %llu <= %llu\n",

2279 cf->tid, want_flush_tid);

2280 ret = 0;

2281 }

2282 }

2283 spin_unlock(&mdsc->cap_dirty_lock);

2284 return ret;

2285 }

2286

2287 /*

2288 * flush all dirty inode data to disk.

2289 *

2290 * returns true if we've flushed through want_flush_tid

2291 */

2292 static void wait_caps_flush(struct ceph_mds_client *mdsc,

2293 u64 want_flush_tid)

2294 {

2295 struct ceph_client *cl = mdsc->fsc->client;

2296

2297 doutc(cl, "want %llu\n", want_flush_tid);

2298

2299 wait_event(mdsc->cap_flushing_wq,

2300 check_caps_flush(mdsc, want_flush_tid));

2301

2302 doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);

2303 }

2304

2305 /*

2306 * called under s_mutex

2307 */

2308 static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,

2309 struct ceph_mds_session *session)

2310 {

2311 struct ceph_client *cl = mdsc->fsc->client;

2312 struct ceph_msg *msg = NULL;

2313 struct ceph_mds_cap_release *head;

2314 struct ceph_mds_cap_item *item;

2315 struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;

2316 struct ceph_cap *cap;

2317 LIST_HEAD(tmp_list);

2318 int num_cap_releases;

2319 __le32 barrier, *cap_barrier;

2320

2321 down_read(&osdc->lock);

2322 barrier = cpu_to_le32(osdc->epoch_barrier);

2323 up_read(&osdc->lock);

2324

2325 spin_lock(&session->s_cap_lock);

2326 again:

2327 list_splice_init(&session->s_cap_releases, &tmp_list);

2328 num_cap_releases = session->s_num_cap_releases;

2329 session->s_num_cap_releases = 0;

2330 spin_unlock(&session->s_cap_lock);

2331

2332 while (!list_empty(&tmp_list)) {

2333 if (!msg) {

2334 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,

2335 PAGE_SIZE, GFP_NOFS, false);

2336 if (!msg)

2337 goto out_err;

2338 head = msg->front.iov_base;

2339 head->num = cpu_to_le32(0);

2340 msg->front.iov_len = sizeof(*head);

2341

2342 msg->hdr.version = cpu_to_le16(2);

2343 msg->hdr.compat_version = cpu_to_le16(1);

2344 }

2345

2346 cap = list_first_entry(&tmp_list, struct ceph_cap,

2347 session_caps);

2348 list_del(&cap->session_caps);

2349 num_cap_releases--;

2350

2351 head = msg->front.iov_base;

2352 put_unaligned_le32(get_unaligned_le32(&head->num) + 1,

2353 &head->num);

2354 item = msg->front.iov_base + msg->front.iov_len;

2355 item->ino = cpu_to_le64(cap->cap_ino);

2356 item->cap_id = cpu_to_le64(cap->cap_id);

2357 item->migrate_seq = cpu_to_le32(cap->mseq);

2358 item->issue_seq = cpu_to_le32(cap->issue_seq);

2359 msg->front.iov_len += sizeof(*item);

2360

2361 ceph_put_cap(mdsc, cap);

2362

2363 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {

2364 // Append cap_barrier field

2365 cap_barrier = msg->front.iov_base + msg->front.iov_len;

2366 *cap_barrier = barrier;

2367 msg->front.iov_len += sizeof(*cap_barrier);

2368

2369 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

2370 doutc(cl, "mds%d %p\n", session->s_mds, msg);

2371 ceph_con_send(&session->s_con, msg);

2372 msg = NULL;

2373 }

2374 }

2375

2376 BUG_ON(num_cap_releases != 0);

2377

2378 spin_lock(&session->s_cap_lock);

2379 if (!list_empty(&session->s_cap_releases))

2380 goto again;

2381 spin_unlock(&session->s_cap_lock);

2382

2383 if (msg) {

2384 // Append cap_barrier field

2385 cap_barrier = msg->front.iov_base + msg->front.iov_len;

2386 *cap_barrier = barrier;

2387 msg->front.iov_len += sizeof(*cap_barrier);

2388

2389 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

2390 doutc(cl, "mds%d %p\n", session->s_mds, msg);

2391 ceph_con_send(&session->s_con, msg);

2392 }

2393 return;

2394 out_err:

2395 pr_err_client(cl, "mds%d, failed to allocate message\n",

2396 session->s_mds);

2397 spin_lock(&session->s_cap_lock);

2398 list_splice(&tmp_list, &session->s_cap_releases);

2399 session->s_num_cap_releases += num_cap_releases;

2400 spin_unlock(&session->s_cap_lock);

2401 }

2402

2403 static void ceph_cap_release_work(struct work_struct *work)

2404 {

2405 struct ceph_mds_session *session =

2406 container_of(work, struct ceph_mds_session, s_cap_release_work);

2407

2408 mutex_lock(&session->s_mutex);

2409 if (session->s_state == CEPH_MDS_SESSION_OPEN ||

2410 session->s_state == CEPH_MDS_SESSION_HUNG)

2411 ceph_send_cap_releases(session->s_mdsc, session);

2412 mutex_unlock(&session->s_mutex);

2413 ceph_put_mds_session(session);

2414 }

2415

2416 void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,

2417 struct ceph_mds_session *session)

2418 {

2419 struct ceph_client *cl = mdsc->fsc->client;

2420 if (mdsc->stopping)

2421 return;

2422

2423 ceph_get_mds_session(session);

2424 if (queue_work(mdsc->fsc->cap_wq,

2425 &session->s_cap_release_work)) {

2426 doutc(cl, "cap release work queued\n");

2427 } else {

2428 ceph_put_mds_session(session);

2429 doutc(cl, "failed to queue cap release work\n");

2430 }

2431 }

2432

2433 /*

2434 * caller holds session->s_cap_lock

2435 */

2436 void __ceph_queue_cap_release(struct ceph_mds_session *session,

2437 struct ceph_cap *cap)

2438 {

2439 list_add_tail(&cap->session_caps, &session->s_cap_releases);

2440 session->s_num_cap_releases++;

2441

2442 if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))

2443 ceph_flush_session_cap_releases(session->s_mdsc, session);

2444 }

2445

2446 static void ceph_cap_reclaim_work(struct work_struct *work)

2447 {

2448 struct ceph_mds_client *mdsc =

2449 container_of(work, struct ceph_mds_client, cap_reclaim_work);

2450 int ret = ceph_trim_dentries(mdsc);

2451 if (ret == -EAGAIN)

2452 ceph_queue_cap_reclaim_work(mdsc);

2453 }

2454

2455 void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)

2456 {

2457 struct ceph_client *cl = mdsc->fsc->client;

2458 if (mdsc->stopping)

2459 return;

2460

2461 if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {

2462 doutc(cl, "caps reclaim work queued\n");

2463 } else {

2464 doutc(cl, "failed to queue caps release work\n");

2465 }

2466 }

2467

2468 void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)

2469 {

2470 int val;

2471 if (!nr)

2472 return;

2473 val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);

2474 if ((val % CEPH_CAPS_PER_RELEASE) < nr) {

2475 atomic_set(&mdsc->cap_reclaim_pending, 0);

2476 ceph_queue_cap_reclaim_work(mdsc);

2477 }

2478 }

2479

2480 void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)

2481 {

2482 struct ceph_client *cl = mdsc->fsc->client;

2483 if (mdsc->stopping)

2484 return;

2485

2486 if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {

2487 doutc(cl, "caps unlink work queued\n");

2488 } else {

2489 doutc(cl, "failed to queue caps unlink work\n");

2490 }

2491 }

2492

2493 static void ceph_cap_unlink_work(struct work_struct *work)

2494 {

2495 struct ceph_mds_client *mdsc =

2496 container_of(work, struct ceph_mds_client, cap_unlink_work);

2497 struct ceph_client *cl = mdsc->fsc->client;

2498

2499 doutc(cl, "begin\n");

2500 spin_lock(&mdsc->cap_delay_lock);

2501 while (!list_empty(&mdsc->cap_unlink_delay_list)) {

2502 struct ceph_inode_info *ci;

2503 struct inode *inode;

2504

2505 ci = list_first_entry(&mdsc->cap_unlink_delay_list,

2506 struct ceph_inode_info,

2507 i_cap_delay_list);

2508 list_del_init(&ci->i_cap_delay_list);

2509

2510 inode = igrab(&ci->netfs.inode);

2511 if (inode) {

2512 spin_unlock(&mdsc->cap_delay_lock);

2513 doutc(cl, "on %p %llx.%llx\n", inode,

2514 ceph_vinop(inode));

2515 ceph_check_caps(ci, CHECK_CAPS_FLUSH);

2516 iput(inode);

2517 spin_lock(&mdsc->cap_delay_lock);

2518 }

2519 }

2520 spin_unlock(&mdsc->cap_delay_lock);

2521 doutc(cl, "done\n");

2522 }

2523

2524 /*

2525 * requests

2526 */

2527

2528 int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,

2529 struct inode *dir)

2530 {

2531 struct ceph_inode_info *ci = ceph_inode(dir);

2532 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;

2533 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;

2534 size_t size = sizeof(struct ceph_mds_reply_dir_entry);

2535 unsigned int num_entries;

2536 u64 bytes_count;

2537 int order;

2538

2539 spin_lock(&ci->i_ceph_lock);

2540 num_entries = ci->i_files + ci->i_subdirs;

2541 spin_unlock(&ci->i_ceph_lock);

2542 num_entries = max(num_entries, 1U);

2543 num_entries = min(num_entries, opt->max_readdir);

2544

2545 bytes_count = (u64)size * num_entries;

2546 if (unlikely(bytes_count > ULONG_MAX))

2547 bytes_count = ULONG_MAX;

2548

2549 order = get_order((unsigned long)bytes_count);

2550 while (order >= 0) {

2551 rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |

2552 __GFP_NOWARN |

2553 __GFP_ZERO,

2554 order);

2555 if (rinfo->dir_entries)

2556 break;

2557 order--;

2558 }

2559 if (!rinfo->dir_entries || unlikely(order < 0))

2560 return -ENOMEM;

2561

2562 num_entries = (PAGE_SIZE << order) / size;

2563 num_entries = min(num_entries, opt->max_readdir);

2564

2565 rinfo->dir_buf_size = PAGE_SIZE << order;

2566 req->r_num_caps = num_entries + 1;

2567 req->r_args.readdir.max_entries = cpu_to_le32(num_entries);

2568 req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);

2569 return 0;

2570 }

2571

2572 /*

2573 * Create an mds request.

2574 */

2575 struct ceph_mds_request *

2576 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)

2577 {

2578 struct ceph_mds_request *req;

2579

2580 req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);

2581 if (!req)

2582 return ERR_PTR(-ENOMEM);

2583

2584 mutex_init(&req->r_fill_mutex);

2585 req->r_mdsc = mdsc;

2586 req->r_started = jiffies;

2587 req->r_start_latency = ktime_get();

2588 req->r_resend_mds = -1;

2589 INIT_LIST_HEAD(&req->r_unsafe_dir_item);

2590 INIT_LIST_HEAD(&req->r_unsafe_target_item);

2591 req->r_fmode = -1;

2592 req->r_feature_needed = -1;

2593 kref_init(&req->r_kref);

2594 RB_CLEAR_NODE(&req->r_node);

2595 INIT_LIST_HEAD(&req->r_wait);

2596 init_completion(&req->r_completion);

2597 init_completion(&req->r_safe_completion);

2598 INIT_LIST_HEAD(&req->r_unsafe_item);

2599

2600 ktime_get_coarse_real_ts64(&req->r_stamp);

2601

2602 req->r_op = op;

2603 req->r_direct_mode = mode;

2604 return req;

2605 }

2606

2607 /*

2608 * return oldest (lowest) request, tid in request tree, 0 if none.

2609 *

2610 * called under mdsc->mutex.

2611 */

2612 static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)

2613 {

2614 if (RB_EMPTY_ROOT(&mdsc->request_tree))

2615 return NULL;

2616 return rb_entry(rb_first(&mdsc->request_tree),

2617 struct ceph_mds_request, r_node);

2618 }

2619

2620 static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)

2621 {

2622 return mdsc->oldest_tid;

2623 }

2624

2625 #if IS_ENABLED(CONFIG_FS_ENCRYPTION)

2626 static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)

2627 {

2628 struct inode *dir = req->r_parent;

2629 struct dentry *dentry = req->r_dentry;

2630 const struct qstr *name = req->r_dname;

2631 u8 *cryptbuf = NULL;

2632 u32 len = 0;

2633 int ret = 0;

2634

2635 /* only encode if we have parent and dentry */

2636 if (!dir || !dentry)

2637 goto success;

2638

2639 /* No-op unless this is encrypted */

2640 if (!IS_ENCRYPTED(dir))

2641 goto success;

2642

2643 ret = ceph_fscrypt_prepare_readdir(dir);

2644 if (ret < 0)

2645 return ERR_PTR(ret);

2646

2647 /* No key? Just ignore it. */

2648 if (!fscrypt_has_encryption_key(dir))

2649 goto success;

2650

2651 if (!name)

2652 name = &dentry->d_name;

2653

2654 if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) {

2655 WARN_ON_ONCE(1);

2656 return ERR_PTR(-ENAMETOOLONG);

2657 }

2658

2659 /* No need to append altname if name is short enough */

2660 if (len <= CEPH_NOHASH_NAME_MAX) {

2661 len = 0;

2662 goto success;

2663 }

2664

2665 cryptbuf = kmalloc(len, GFP_KERNEL);

2666 if (!cryptbuf)

2667 return ERR_PTR(-ENOMEM);

2668

2669 ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len);

2670 if (ret) {

2671 kfree(cryptbuf);

2672 return ERR_PTR(ret);

2673 }

2674 success:

2675 *plen = len;

2676 return cryptbuf;

2677 }

2678 #else

2679 static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)

2680 {

2681 *plen = 0;

2682 return NULL;

2683 }

2684 #endif

2685

2686 /**

2687 * ceph_mdsc_build_path - build a path string to a given dentry

2688 * @mdsc: mds client

2689 * @dentry: dentry to which path should be built

2690 * @path_info: output path, length, base ino+snap, and freepath ownership flag

2691 * @for_wire: is this path going to be sent to the MDS?

2692 *

2693 * Build a string that represents the path to the dentry. This is mostly called

2694 * for two different purposes:

2695 *

2696 * 1) we need to build a path string to send to the MDS (for_wire == true)

2697 * 2) we need a path string for local presentation (e.g. debugfs)

2698 * (for_wire == false)

2699 *

2700 * The path is built in reverse, starting with the dentry. Walk back up toward

2701 * the root, building the path until the first non-snapped inode is reached

2702 * (for_wire) or the root inode is reached (!for_wire).

2703 *

2704 * Encode hidden .snap dirs as a double /, i.e.

2705 * foo/.snap/bar -> foo//bar

2706 */

2707 char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,

2708 struct ceph_path_info *path_info, int for_wire)

2709 {

2710 struct ceph_client *cl = mdsc->fsc->client;

2711 struct dentry *cur;

2712 struct inode *inode;

2713 char *path;

2714 int pos;

2715 unsigned seq;

2716 u64 base;

2717

2718 if (!dentry)

2719 return ERR_PTR(-EINVAL);

2720

2721 path = __getname();

2722 if (!path)

2723 return ERR_PTR(-ENOMEM);

2724 retry:

2725 pos = PATH_MAX - 1;

2726 path[pos] = '\0';

2727

2728 seq = read_seqbegin(&rename_lock);

2729 cur = dget(dentry);

2730 for (;;) {

2731 struct dentry *parent;

2732

2733 spin_lock(&cur->d_lock);

2734 inode = d_inode(cur);

2735 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {

2736 doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur);

2737 spin_unlock(&cur->d_lock);

2738 parent = dget_parent(cur);

2739 } else if (for_wire && inode && dentry != cur &&

2740 ceph_snap(inode) == CEPH_NOSNAP) {

2741 spin_unlock(&cur->d_lock);

2742 pos++; /* get rid of any prepended '/' */

2743 break;

2744 } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) {

2745 pos -= cur->d_name.len;

2746 if (pos < 0) {

2747 spin_unlock(&cur->d_lock);

2748 break;

2749 }

2750 memcpy(path + pos, cur->d_name.name, cur->d_name.len);

2751 spin_unlock(&cur->d_lock);

2752 parent = dget_parent(cur);

2753 } else {

2754 int len, ret;

2755 char buf[NAME_MAX];

2756

2757 /*

2758 * Proactively copy name into buf, in case we need to

2759 * present it as-is.

2760 */

2761 memcpy(buf, cur->d_name.name, cur->d_name.len);

2762 len = cur->d_name.len;

2763 spin_unlock(&cur->d_lock);

2764 parent = dget_parent(cur);

2765

2766 ret = ceph_fscrypt_prepare_readdir(d_inode(parent));

2767 if (ret < 0) {

2768 dput(parent);

2769 dput(cur);

2770 return ERR_PTR(ret);

2771 }

2772

2773 if (fscrypt_has_encryption_key(d_inode(parent))) {

2774 len = ceph_encode_encrypted_dname(d_inode(parent),

2775 buf, len);

2776 if (len < 0) {

2777 dput(parent);

2778 dput(cur);

2779 return ERR_PTR(len);

2780 }

2781 }

2782 pos -= len;

2783 if (pos < 0) {

2784 dput(parent);

2785 break;

2786 }

2787 memcpy(path + pos, buf, len);

2788 }

2789 dput(cur);

2790 cur = parent;

2791

2792 /* Are we at the root? */

2793 if (IS_ROOT(cur))

2794 break;

2795

2796 /* Are we out of buffer? */

2797 if (--pos < 0)

2798 break;

2799

2800 path[pos] = '/';

2801 }

2802 inode = d_inode(cur);

2803 base = inode ? ceph_ino(inode) : 0;

2804 dput(cur);

2805

2806 if (read_seqretry(&rename_lock, seq))

2807 goto retry;

2808

2809 if (pos < 0) {

2810 /*

2811 * The path is longer than PATH_MAX and this function

2812 * cannot ever succeed. Creating paths that long is

2813 * possible with Ceph, but Linux cannot use them.

2814 */

2815 return ERR_PTR(-ENAMETOOLONG);

2816 }

2817

2818 /* Initialize the output structure */

2819 memset(path_info, 0, sizeof(*path_info));

2820

2821 path_info->vino.ino = base;

2822 path_info->pathlen = PATH_MAX - 1 - pos;

2823 path_info->path = path + pos;

2824 path_info->freepath = true;

2825

2826 /* Set snap from dentry if available */

2827 if (d_inode(dentry))

2828 path_info->vino.snap = ceph_snap(d_inode(dentry));

2829 else

2830 path_info->vino.snap = CEPH_NOSNAP;

2831

2832 doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),

2833 base, PATH_MAX - 1 - pos, path + pos);

2834 return path + pos;

2835 }

2836

2837 static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,

2838 struct inode *dir, struct ceph_path_info *path_info,

2839 bool parent_locked)

2840 {

2841 char *path;

2842

2843 rcu_read_lock();

2844 if (!dir)

2845 dir = d_inode_rcu(dentry->d_parent);

2846 if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&

2847 !IS_ENCRYPTED(dir)) {

2848 path_info->vino.ino = ceph_ino(dir);

2849 path_info->vino.snap = ceph_snap(dir);

2850 rcu_read_unlock();

2851 path_info->path = dentry->d_name.name;

2852 path_info->pathlen = dentry->d_name.len;

2853 path_info->freepath = false;

2854 return 0;

2855 }

2856 rcu_read_unlock();

2857 path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);

2858 if (IS_ERR(path))

2859 return PTR_ERR(path);

2860 /*

2861 * ceph_mdsc_build_path already fills path_info, including snap handling.

2862 */

2863 return 0;

2864 }

2865

2866 static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)

2867 {

2868 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);

2869 struct dentry *dentry;

2870 char *path;

2871

2872 if (ceph_snap(inode) == CEPH_NOSNAP) {

2873 path_info->vino.ino = ceph_ino(inode);

2874 path_info->vino.snap = ceph_snap(inode);

2875 path_info->pathlen = 0;

2876 path_info->freepath = false;

2877 return 0;

2878 }

2879 dentry = d_find_alias(inode);

2880 path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);

2881 dput(dentry);

2882 if (IS_ERR(path))

2883 return PTR_ERR(path);

2884 /*

2885 * ceph_mdsc_build_path already fills path_info, including snap from dentry.

2886 * Override with inode's snap since that's what this function is for.

2887 */

2888 path_info->vino.snap = ceph_snap(inode);

2889 return 0;

2890 }

2891

2892 /*

2893 * request arguments may be specified via an inode *, a dentry *, or

2894 * an explicit ino+path.

2895 */

2896 static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,

2897 struct dentry *rdentry, struct inode *rdiri,

2898 const char *rpath, u64 rino,

2899 struct ceph_path_info *path_info,

2900 bool parent_locked)

2901 {

2902 struct ceph_client *cl = mdsc->fsc->client;

2903 int r = 0;

2904

2905 /* Initialize the output structure */

2906 memset(path_info, 0, sizeof(*path_info));

2907

2908 if (rinode) {

2909 r = build_inode_path(rinode, path_info);

2910 doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),

2911 ceph_snap(rinode));

2912 } else if (rdentry) {

2913 r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);

2914 doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,

2915 path_info->pathlen, path_info->path);

2916 } else if (rpath || rino) {

2917 path_info->vino.ino = rino;

2918 path_info->vino.snap = CEPH_NOSNAP;

2919 path_info->path = rpath;

2920 path_info->pathlen = rpath ? strlen(rpath) : 0;

2921 path_info->freepath = false;

2922

2923 doutc(cl, " path %.*s\n", path_info->pathlen, rpath);

2924 }

2925

2926 return r;

2927 }

2928

2929 static void encode_mclientrequest_tail(void **p,

2930 const struct ceph_mds_request *req)

2931 {

2932 struct ceph_timespec ts;

2933 int i;

2934

2935 ceph_encode_timespec64(&ts, &req->r_stamp);

2936 ceph_encode_copy(p, &ts, sizeof(ts));

2937

2938 /* v4: gid_list */

2939 ceph_encode_32(p, req->r_cred->group_info->ngroups);

2940 for (i = 0; i < req->r_cred->group_info->ngroups; i++)

2941 ceph_encode_64(p, from_kgid(&init_user_ns,

2942 req->r_cred->group_info->gid[i]));

2943

2944 /* v5: altname */

2945 ceph_encode_32(p, req->r_altname_len);

2946 ceph_encode_copy(p, req->r_altname, req->r_altname_len);

2947

2948 /* v6: fscrypt_auth and fscrypt_file */

2949 if (req->r_fscrypt_auth) {

2950 u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);

2951

2952 ceph_encode_32(p, authlen);

2953 ceph_encode_copy(p, req->r_fscrypt_auth, authlen);

2954 } else {

2955 ceph_encode_32(p, 0);

2956 }

2957 if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) {

2958 ceph_encode_32(p, sizeof(__le64));

2959 ceph_encode_64(p, req->r_fscrypt_file);

2960 } else {

2961 ceph_encode_32(p, 0);

2962 }

2963 }

2964

2965 static inline u16 mds_supported_head_version(struct ceph_mds_session *session)

2966 {

2967 if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))

2968 return 1;

2969

2970 if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))

2971 return 2;

2972

2973 return CEPH_MDS_REQUEST_HEAD_VERSION;

2974 }

2975

2976 static struct ceph_mds_request_head_legacy *

2977 find_legacy_request_head(void *p, u64 features)

2978 {

2979 bool legacy = !(features & CEPH_FEATURE_FS_BTIME);

2980 struct ceph_mds_request_head *head;

2981

2982 if (legacy)

2983 return (struct ceph_mds_request_head_legacy *)p;

2984 head = (struct ceph_mds_request_head *)p;

2985 return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;

2986 }

2987

2988 /*

2989 * called under mdsc->mutex

2990 */

2991 static struct ceph_msg *create_request_message(struct ceph_mds_session *session,

2992 struct ceph_mds_request *req,

2993 bool drop_cap_releases)

2994 {

2995 int mds = session->s_mds;

2996 struct ceph_mds_client *mdsc = session->s_mdsc;

2997 struct ceph_client *cl = mdsc->fsc->client;

2998 struct ceph_msg *msg;

2999 struct ceph_mds_request_head_legacy *lhead;

3000 struct ceph_path_info path_info1 = {0};

3001 struct ceph_path_info path_info2 = {0};

3002 struct dentry *old_dentry = NULL;

3003 int len;

3004 u16 releases;

3005 void *p, *end;

3006 int ret;

3007 bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);

3008 u16 request_head_version = mds_supported_head_version(session);

3009 kuid_t caller_fsuid = req->r_cred->fsuid;

3010 kgid_t caller_fsgid = req->r_cred->fsgid;

3011 bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);

3012

3013 ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,

3014 req->r_parent, req->r_path1, req->r_ino1.ino,

3015 &path_info1, parent_locked);

3016 if (ret < 0) {

3017 msg = ERR_PTR(ret);

3018 goto out;

3019 }

3020

3021 /*

3022 * When the parent directory's i_rwsem is *not* locked, req->r_parent may

3023 * have become stale (e.g. after a concurrent rename) between the time the

3024 * dentry was looked up and now. If we detect that the stored r_parent

3025 * does not match the inode number we just encoded for the request, switch

3026 * to the correct inode so that the MDS receives a valid parent reference.

3027 */

3028 if (!parent_locked && req->r_parent && path_info1.vino.ino &&

3029 ceph_ino(req->r_parent) != path_info1.vino.ino) {

3030 struct inode *old_parent = req->r_parent;

3031 struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);

3032 if (!IS_ERR(correct_dir)) {

3033 WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",

3034 ceph_ino(old_parent), path_info1.vino.ino);

3035 /*

3036 * Transfer CEPH_CAP_PIN from the old parent to the new one.

3037 * The pin was taken earlier in ceph_mdsc_submit_request().

3038 */

3039 ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);

3040 iput(old_parent);

3041 req->r_parent = correct_dir;

3042 ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);

3043 }

3044 }

3045

3046 /* If r_old_dentry is set, then assume that its parent is locked */

3047 if (req->r_old_dentry &&

3048 !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))

3049 old_dentry = req->r_old_dentry;

3050 ret = set_request_path_attr(mdsc, NULL, old_dentry,

3051 req->r_old_dentry_dir,

3052 req->r_path2, req->r_ino2.ino,

3053 &path_info2, true);

3054 if (ret < 0) {

3055 msg = ERR_PTR(ret);

3056 goto out_free1;

3057 }

3058

3059 req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);

3060 if (IS_ERR(req->r_altname)) {

3061 msg = ERR_CAST(req->r_altname);

3062 req->r_altname = NULL;

3063 goto out_free2;

3064 }

3065

3066 /*

3067 * For old cephs without supporting the 32bit retry/fwd feature

3068 * it will copy the raw memories directly when decoding the

3069 * requests. While new cephs will decode the head depending the

3070 * version member, so we need to make sure it will be compatible

3071 * with them both.

3072 */

3073 if (legacy)

3074 len = sizeof(struct ceph_mds_request_head_legacy);

3075 else if (request_head_version == 1)

3076 len = offsetofend(struct ceph_mds_request_head, args);

3077 else if (request_head_version == 2)

3078 len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);

3079 else

3080 len = sizeof(struct ceph_mds_request_head);

3081

3082 /* filepaths */

3083 len += 2 * (1 + sizeof(u32) + sizeof(u64));

3084 len += path_info1.pathlen + path_info2.pathlen;

3085

3086 /* cap releases */

3087 len += sizeof(struct ceph_mds_request_release) *

3088 (!!req->r_inode_drop + !!req->r_dentry_drop +

3089 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);

3090

3091 if (req->r_dentry_drop)

3092 len += path_info1.pathlen;

3093 if (req->r_old_dentry_drop)

3094 len += path_info2.pathlen;

3095

3096 /* MClientRequest tail */

3097

3098 /* req->r_stamp */

3099 len += sizeof(struct ceph_timespec);

3100

3101 /* gid list */

3102 len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);

3103

3104 /* alternate name */

3105 len += sizeof(u32) + req->r_altname_len;

3106

3107 /* fscrypt_auth */

3108 len += sizeof(u32); // fscrypt_auth

3109 if (req->r_fscrypt_auth)

3110 len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);

3111

3112 /* fscrypt_file */

3113 len += sizeof(u32);

3114 if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))

3115 len += sizeof(__le64);

3116

3117 msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);

3118 if (!msg) {

3119 msg = ERR_PTR(-ENOMEM);

3120 goto out_free2;

3121 }

3122

3123 msg->hdr.tid = cpu_to_le64(req->r_tid);

3124

3125 lhead = find_legacy_request_head(msg->front.iov_base,

3126 session->s_con.peer_features);

3127

3128 if ((req->r_mnt_idmap != &nop_mnt_idmap) &&

3129 !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {

3130 WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op));

3131

3132 if (enable_unsafe_idmap) {

3133 pr_warn_once_client(cl,

3134 "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"

3135 " is not supported by MDS. UID/GID-based restrictions may"

3136 " not work properly.\n");

3137

3138 caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,

3139 VFSUIDT_INIT(req->r_cred->fsuid));

3140 caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,

3141 VFSGIDT_INIT(req->r_cred->fsgid));

3142 } else {

3143 pr_err_ratelimited_client(cl,

3144 "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"

3145 " is not supported by MDS. Fail request with -EIO.\n");

3146

3147 ret = -EIO;

3148 goto out_err;

3149 }

3150 }

3151

3152 /*

3153 * The ceph_mds_request_head_legacy didn't contain a version field, and

3154 * one was added when we moved the message version from 3->4.

3155 */

3156 if (legacy) {

3157 msg->hdr.version = cpu_to_le16(3);

3158 p = msg->front.iov_base + sizeof(*lhead);

3159 } else if (request_head_version == 1) {

3160 struct ceph_mds_request_head *nhead = msg->front.iov_base;

3161

3162 msg->hdr.version = cpu_to_le16(4);

3163 nhead->version = cpu_to_le16(1);

3164 p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args);

3165 } else if (request_head_version == 2) {

3166 struct ceph_mds_request_head *nhead = msg->front.iov_base;

3167

3168 msg->hdr.version = cpu_to_le16(6);

3169 nhead->version = cpu_to_le16(2);

3170

3171 p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);

3172 } else {

3173 struct ceph_mds_request_head *nhead = msg->front.iov_base;

3174 kuid_t owner_fsuid;

3175 kgid_t owner_fsgid;

3176

3177 msg->hdr.version = cpu_to_le16(6);

3178 nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);

3179 nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));

3180

3181 if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) {

3182 owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,

3183 VFSUIDT_INIT(req->r_cred->fsuid));

3184 owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,

3185 VFSGIDT_INIT(req->r_cred->fsgid));

3186 nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));

3187 nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));

3188 } else {

3189 nhead->owner_uid = cpu_to_le32(-1);

3190 nhead->owner_gid = cpu_to_le32(-1);

3191 }

3192

3193 p = msg->front.iov_base + sizeof(*nhead);

3194 }

3195

3196 end = msg->front.iov_base + msg->front.iov_len;

3197

3198 lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);

3199 lhead->op = cpu_to_le32(req->r_op);

3200 lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,

3201 caller_fsuid));

3202 lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,

3203 caller_fsgid));

3204 lhead->ino = cpu_to_le64(req->r_deleg_ino);

3205 lhead->args = req->r_args;

3206

3207 ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);

3208 ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);

3209

3210 /* make note of release offset, in case we need to replay */

3211 req->r_request_release_offset = p - msg->front.iov_base;

3212

3213 /* cap releases */

3214 releases = 0;

3215 if (req->r_inode_drop)

3216 releases += ceph_encode_inode_release(&p,

3217 req->r_inode ? req->r_inode : d_inode(req->r_dentry),

3218 mds, req->r_inode_drop, req->r_inode_unless,

3219 req->r_op == CEPH_MDS_OP_READDIR);

3220 if (req->r_dentry_drop) {

3221 ret = ceph_encode_dentry_release(&p, req->r_dentry,

3222 req->r_parent, mds, req->r_dentry_drop,

3223 req->r_dentry_unless);

3224 if (ret < 0)

3225 goto out_err;

3226 releases += ret;

3227 }

3228 if (req->r_old_dentry_drop) {

3229 ret = ceph_encode_dentry_release(&p, req->r_old_dentry,

3230 req->r_old_dentry_dir, mds,

3231 req->r_old_dentry_drop,

3232 req->r_old_dentry_unless);

3233 if (ret < 0)

3234 goto out_err;

3235 releases += ret;

3236 }

3237 if (req->r_old_inode_drop)

3238 releases += ceph_encode_inode_release(&p,

3239 d_inode(req->r_old_dentry),

3240 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);

3241

3242 if (drop_cap_releases) {

3243 releases = 0;

3244 p = msg->front.iov_base + req->r_request_release_offset;

3245 }

3246

3247 lhead->num_releases = cpu_to_le16(releases);

3248

3249 encode_mclientrequest_tail(&p, req);

3250

3251 if (WARN_ON_ONCE(p > end)) {

3252 ceph_msg_put(msg);

3253 msg = ERR_PTR(-ERANGE);

3254 goto out_free2;

3255 }

3256

3257 msg->front.iov_len = p - msg->front.iov_base;

3258 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

3259

3260 if (req->r_pagelist) {

3261 struct ceph_pagelist *pagelist = req->r_pagelist;

3262 ceph_msg_data_add_pagelist(msg, pagelist);

3263 msg->hdr.data_len = cpu_to_le32(pagelist->length);

3264 } else {

3265 msg->hdr.data_len = 0;

3266 }

3267

3268 msg->hdr.data_off = cpu_to_le16(0);

3269

3270 out_free2:

3271 ceph_mdsc_free_path_info(&path_info2);

3272 out_free1:

3273 ceph_mdsc_free_path_info(&path_info1);

3274 out:

3275 return msg;

3276 out_err:

3277 ceph_msg_put(msg);

3278 msg = ERR_PTR(ret);

3279 goto out_free2;

3280 }

3281

3282 /*

3283 * called under mdsc->mutex if error, under no mutex if

3284 * success.

3285 */

3286 static void complete_request(struct ceph_mds_client *mdsc,

3287 struct ceph_mds_request *req)

3288 {

3289 req->r_end_latency = ktime_get();

3290

3291 if (req->r_callback)

3292 req->r_callback(mdsc, req);

3293 complete_all(&req->r_completion);

3294 }

3295

3296 /*

3297 * called under mdsc->mutex

3298 */

3299 static int __prepare_send_request(struct ceph_mds_session *session,

3300 struct ceph_mds_request *req,

3301 bool drop_cap_releases)

3302 {

3303 int mds = session->s_mds;

3304 struct ceph_mds_client *mdsc = session->s_mdsc;

3305 struct ceph_client *cl = mdsc->fsc->client;

3306 struct ceph_mds_request_head_legacy *lhead;

3307 struct ceph_mds_request_head *nhead;

3308 struct ceph_msg *msg;

3309 int flags = 0, old_max_retry;

3310 bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,

3311 &session->s_features);

3312

3313 /*

3314 * Avoid infinite retrying after overflow. The client will

3315 * increase the retry count and if the MDS is old version,

3316 * so we limit to retry at most 256 times.

3317 */

3318 if (req->r_attempts) {

3319 old_max_retry = sizeof_field(struct ceph_mds_request_head,

3320 num_retry);

3321 old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);

3322 if ((old_version && req->r_attempts >= old_max_retry) ||

3323 ((uint32_t)req->r_attempts >= U32_MAX)) {

3324 pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",

3325 req->r_tid);

3326 return -EMULTIHOP;

3327 }

3328 }

3329

3330 req->r_attempts++;

3331 if (req->r_inode) {

3332 struct ceph_cap *cap =

3333 ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);

3334

3335 if (cap)

3336 req->r_sent_on_mseq = cap->mseq;

3337 else

3338 req->r_sent_on_mseq = -1;

3339 }

3340 doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid,

3341 ceph_mds_op_name(req->r_op), req->r_attempts);

3342

3343 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {

3344 void *p;

3345

3346 /*

3347 * Replay. Do not regenerate message (and rebuild

3348 * paths, etc.); just use the original message.

3349 * Rebuilding paths will break for renames because

3350 * d_move mangles the src name.

3351 */

3352 msg = req->r_request;

3353 lhead = find_legacy_request_head(msg->front.iov_base,

3354 session->s_con.peer_features);

3355

3356 flags = le32_to_cpu(lhead->flags);

3357 flags |= CEPH_MDS_FLAG_REPLAY;

3358 lhead->flags = cpu_to_le32(flags);

3359

3360 if (req->r_target_inode)

3361 lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));

3362

3363 lhead->num_retry = req->r_attempts - 1;

3364 if (!old_version) {

3365 nhead = (struct ceph_mds_request_head*)msg->front.iov_base;

3366 nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);

3367 }

3368

3369 /* remove cap/dentry releases from message */

3370 lhead->num_releases = 0;

3371

3372 p = msg->front.iov_base + req->r_request_release_offset;

3373 encode_mclientrequest_tail(&p, req);

3374

3375 msg->front.iov_len = p - msg->front.iov_base;

3376 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

3377 return 0;

3378 }

3379

3380 if (req->r_request) {

3381 ceph_msg_put(req->r_request);

3382 req->r_request = NULL;

3383 }

3384 msg = create_request_message(session, req, drop_cap_releases);

3385 if (IS_ERR(msg)) {

3386 req->r_err = PTR_ERR(msg);

3387 return PTR_ERR(msg);

3388 }

3389 req->r_request = msg;

3390

3391 lhead = find_legacy_request_head(msg->front.iov_base,

3392 session->s_con.peer_features);

3393 lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));

3394 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))

3395 flags |= CEPH_MDS_FLAG_REPLAY;

3396 if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))

3397 flags |= CEPH_MDS_FLAG_ASYNC;

3398 if (req->r_parent)

3399 flags |= CEPH_MDS_FLAG_WANT_DENTRY;

3400 lhead->flags = cpu_to_le32(flags);

3401 lhead->num_fwd = req->r_num_fwd;

3402 lhead->num_retry = req->r_attempts - 1;

3403 if (!old_version) {

3404 nhead = (struct ceph_mds_request_head*)msg->front.iov_base;

3405 nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);

3406 nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);

3407 }

3408

3409 doutc(cl, " r_parent = %p\n", req->r_parent);

3410 return 0;

3411 }

3412

3413 /*

3414 * called under mdsc->mutex

3415 */

3416 static int __send_request(struct ceph_mds_session *session,

3417 struct ceph_mds_request *req,

3418 bool drop_cap_releases)

3419 {

3420 int err;

3421

3422 err = __prepare_send_request(session, req, drop_cap_releases);

3423 if (!err) {

3424 ceph_msg_get(req->r_request);

3425 ceph_con_send(&session->s_con, req->r_request);

3426 }

3427

3428 return err;

3429 }

3430

3431 /*

3432 * send request, or put it on the appropriate wait list.

3433 */

3434 static void __do_request(struct ceph_mds_client *mdsc,

3435 struct ceph_mds_request *req)

3436 {

3437 struct ceph_client *cl = mdsc->fsc->client;

3438 struct ceph_mds_session *session = NULL;

3439 int mds = -1;

3440 int err = 0;

3441 bool random;

3442

3443 if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {

3444 if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))

3445 __unregister_request(mdsc, req);

3446 return;

3447 }

3448

3449 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {

3450 doutc(cl, "metadata corrupted\n");

3451 err = -EIO;

3452 goto finish;

3453 }

3454 if (req->r_timeout &&

3455 time_after_eq(jiffies, req->r_started + req->r_timeout)) {

3456 doutc(cl, "timed out\n");

3457 err = -ETIMEDOUT;

3458 goto finish;

3459 }

3460 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {

3461 doutc(cl, "forced umount\n");

3462 err = -EIO;

3463 goto finish;

3464 }

3465 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {

3466 if (mdsc->mdsmap_err) {

3467 err = mdsc->mdsmap_err;

3468 doutc(cl, "mdsmap err %d\n", err);

3469 goto finish;

3470 }

3471 if (mdsc->mdsmap->m_epoch == 0) {

3472 doutc(cl, "no mdsmap, waiting for map\n");

3473 list_add(&req->r_wait, &mdsc->waiting_for_map);

3474 return;

3475 }

3476 if (!(mdsc->fsc->mount_options->flags &

3477 CEPH_MOUNT_OPT_MOUNTWAIT) &&

3478 !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {

3479 err = -EHOSTUNREACH;

3480 goto finish;

3481 }

3482 }

3483

3484 put_request_session(req);

3485

3486 mds = __choose_mds(mdsc, req, &random);

3487 if (mds < 0 ||

3488 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {

3489 if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {

3490 err = -EJUKEBOX;

3491 goto finish;

3492 }

3493 doutc(cl, "no mds or not active, waiting for map\n");

3494 list_add(&req->r_wait, &mdsc->waiting_for_map);

3495 return;

3496 }

3497

3498 /* get, open session */

3499 session = __ceph_lookup_mds_session(mdsc, mds);

3500 if (!session) {

3501 session = register_session(mdsc, mds);

3502 if (IS_ERR(session)) {

3503 err = PTR_ERR(session);

3504 goto finish;

3505 }

3506 }

3507 req->r_session = ceph_get_mds_session(session);

3508

3509 doutc(cl, "mds%d session %p state %s\n", mds, session,

3510 ceph_session_state_name(session->s_state));

3511

3512 /*

3513 * The old ceph will crash the MDSs when see unknown OPs

3514 */

3515 if (req->r_feature_needed > 0 &&

3516 !test_bit(req->r_feature_needed, &session->s_features)) {

3517 err = -EOPNOTSUPP;

3518 goto out_session;

3519 }

3520

3521 if (session->s_state != CEPH_MDS_SESSION_OPEN &&

3522 session->s_state != CEPH_MDS_SESSION_HUNG) {

3523 /*

3524 * We cannot queue async requests since the caps and delegated

3525 * inodes are bound to the session. Just return -EJUKEBOX and

3526 * let the caller retry a sync request in that case.

3527 */

3528 if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {

3529 err = -EJUKEBOX;

3530 goto out_session;

3531 }

3532

3533 /*

3534 * If the session has been REJECTED, then return a hard error,

3535 * unless it's a CLEANRECOVER mount, in which case we'll queue

3536 * it to the mdsc queue.

3537 */

3538 if (session->s_state == CEPH_MDS_SESSION_REJECTED) {

3539 if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))

3540 list_add(&req->r_wait, &mdsc->waiting_for_map);

3541 else

3542 err = -EACCES;

3543 goto out_session;

3544 }

3545

3546 if (session->s_state == CEPH_MDS_SESSION_NEW ||

3547 session->s_state == CEPH_MDS_SESSION_CLOSING) {

3548 err = __open_session(mdsc, session);

3549 if (err)

3550 goto out_session;

3551 /* retry the same mds later */

3552 if (random)

3553 req->r_resend_mds = mds;

3554 }

3555 list_add(&req->r_wait, &session->s_waiting);

3556 goto out_session;

3557 }

3558

3559 /* send request */

3560 req->r_resend_mds = -1; /* forget any previous mds hint */

3561

3562 if (req->r_request_started == 0) /* note request start time */

3563 req->r_request_started = jiffies;

3564

3565 /*

3566 * For async create we will choose the auth MDS of frag in parent

3567 * directory to send the request and usually this works fine, but

3568 * if the migrated the dirtory to another MDS before it could handle

3569 * it the request will be forwarded.

3570 *

3571 * And then the auth cap will be changed.

3572 */

3573 if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {

3574 struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);

3575 struct ceph_inode_info *ci;

3576 struct ceph_cap *cap;

3577

3578 /*

3579 * The request maybe handled very fast and the new inode

3580 * hasn't been linked to the dentry yet. We need to wait

3581 * for the ceph_finish_async_create(), which shouldn't be

3582 * stuck too long or fail in thoery, to finish when forwarding

3583 * the request.

3584 */

3585 if (!d_inode(req->r_dentry)) {

3586 err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,

3587 TASK_KILLABLE);

3588 if (err) {

3589 mutex_lock(&req->r_fill_mutex);

3590 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);

3591 mutex_unlock(&req->r_fill_mutex);

3592 goto out_session;

3593 }

3594 }

3595

3596 ci = ceph_inode(d_inode(req->r_dentry));

3597

3598 spin_lock(&ci->i_ceph_lock);

3599 cap = ci->i_auth_cap;

3600 if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {

3601 doutc(cl, "session changed for auth cap %d -> %d\n",

3602 cap->session->s_mds, session->s_mds);

3603

3604 /* Remove the auth cap from old session */

3605 spin_lock(&cap->session->s_cap_lock);

3606 cap->session->s_nr_caps--;

3607 list_del_init(&cap->session_caps);

3608 spin_unlock(&cap->session->s_cap_lock);

3609

3610 /* Add the auth cap to the new session */

3611 cap->mds = mds;

3612 cap->session = session;

3613 spin_lock(&session->s_cap_lock);

3614 session->s_nr_caps++;

3615 list_add_tail(&cap->session_caps, &session->s_caps);

3616 spin_unlock(&session->s_cap_lock);

3617

3618 change_auth_cap_ses(ci, session);

3619 }

3620 spin_unlock(&ci->i_ceph_lock);

3621 }

3622

3623 err = __send_request(session, req, false);

3624

3625 out_session:

3626 ceph_put_mds_session(session);

3627 finish:

3628 if (err) {

3629 doutc(cl, "early error %d\n", err);

3630 req->r_err = err;

3631 complete_request(mdsc, req);

3632 __unregister_request(mdsc, req);

3633 }

3634 return;

3635 }

3636

3637 /*

3638 * called under mdsc->mutex

3639 */

3640 static void __wake_requests(struct ceph_mds_client *mdsc,

3641 struct list_head *head)

3642 {

3643 struct ceph_client *cl = mdsc->fsc->client;

3644 struct ceph_mds_request *req;

3645 LIST_HEAD(tmp_list);

3646

3647 list_splice_init(head, &tmp_list);

3648

3649 while (!list_empty(&tmp_list)) {

3650 req = list_entry(tmp_list.next,

3651 struct ceph_mds_request, r_wait);

3652 list_del_init(&req->r_wait);

3653 doutc(cl, " wake request %p tid %llu\n", req,

3654 req->r_tid);

3655 __do_request(mdsc, req);

3656 }

3657 }

3658

3659 /*

3660 * Wake up threads with requests pending for @mds, so that they can

3661 * resubmit their requests to a possibly different mds.

3662 */

3663 static void kick_requests(struct ceph_mds_client *mdsc, int mds)

3664 {

3665 struct ceph_client *cl = mdsc->fsc->client;

3666 struct ceph_mds_request *req;

3667 struct rb_node *p = rb_first(&mdsc->request_tree);

3668

3669 doutc(cl, "kick_requests mds%d\n", mds);

3670 while (p) {

3671 req = rb_entry(p, struct ceph_mds_request, r_node);

3672 p = rb_next(p);

3673 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))

3674 continue;

3675 if (req->r_attempts > 0)

3676 continue; /* only new requests */

3677 if (req->r_session &&

3678 req->r_session->s_mds == mds) {

3679 doutc(cl, " kicking tid %llu\n", req->r_tid);

3680 list_del_init(&req->r_wait);

3681 __do_request(mdsc, req);

3682 }

3683 }

3684 }

3685

3686 int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,

3687 struct ceph_mds_request *req)

3688 {

3689 struct ceph_client *cl = mdsc->fsc->client;

3690 int err = 0;

3691

3692 /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */

3693 if (req->r_inode)

3694 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);

3695 if (req->r_parent) {

3696 struct ceph_inode_info *ci = ceph_inode(req->r_parent);

3697 int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?

3698 CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;

3699 spin_lock(&ci->i_ceph_lock);

3700 ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);

3701 __ceph_touch_fmode(ci, mdsc, fmode);

3702 spin_unlock(&ci->i_ceph_lock);

3703 }

3704 if (req->r_old_dentry_dir)

3705 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),

3706 CEPH_CAP_PIN);

3707

3708 if (req->r_inode) {

3709 err = ceph_wait_on_async_create(req->r_inode);

3710 if (err) {

3711 doutc(cl, "wait for async create returned: %d\n", err);

3712 return err;

3713 }

3714 }

3715

3716 if (!err && req->r_old_inode) {

3717 err = ceph_wait_on_async_create(req->r_old_inode);

3718 if (err) {

3719 doutc(cl, "wait for async create returned: %d\n", err);

3720 return err;

3721 }

3722 }

3723

3724 doutc(cl, "submit_request on %p for inode %p\n", req, dir);

3725 mutex_lock(&mdsc->mutex);

3726 __register_request(mdsc, req, dir);

3727 __do_request(mdsc, req);

3728 err = req->r_err;

3729 mutex_unlock(&mdsc->mutex);

3730 return err;

3731 }

3732

3733 int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,

3734 struct ceph_mds_request *req,

3735 ceph_mds_request_wait_callback_t wait_func)

3736 {

3737 struct ceph_client *cl = mdsc->fsc->client;

3738 int err;

3739

3740 /* wait */

3741 doutc(cl, "do_request waiting\n");

3742 if (wait_func) {

3743 err = wait_func(mdsc, req);

3744 } else {

3745 long timeleft = wait_for_completion_killable_timeout(

3746 &req->r_completion,

3747 ceph_timeout_jiffies(req->r_timeout));

3748 if (timeleft > 0)

3749 err = 0;

3750 else if (!timeleft)

3751 err = -ETIMEDOUT; /* timed out */

3752 else

3753 err = timeleft; /* killed */

3754 }

3755 doutc(cl, "do_request waited, got %d\n", err);

3756 mutex_lock(&mdsc->mutex);

3757

3758 /* only abort if we didn't race with a real reply */

3759 if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {

3760 err = le32_to_cpu(req->r_reply_info.head->result);

3761 } else if (err < 0) {

3762 doutc(cl, "aborted request %lld with %d\n", req->r_tid, err);

3763

3764 /*

3765 * ensure we aren't running concurrently with

3766 * ceph_fill_trace or ceph_readdir_prepopulate, which

3767 * rely on locks (dir mutex) held by our caller.

3768 */

3769 mutex_lock(&req->r_fill_mutex);

3770 req->r_err = err;

3771 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);

3772 mutex_unlock(&req->r_fill_mutex);

3773

3774 if (req->r_parent &&

3775 (req->r_op & CEPH_MDS_OP_WRITE))

3776 ceph_invalidate_dir_request(req);

3777 } else {

3778 err = req->r_err;

3779 }

3780

3781 mutex_unlock(&mdsc->mutex);

3782 return err;

3783 }

3784

3785 /*

3786 * Synchrously perform an mds request. Take care of all of the

3787 * session setup, forwarding, retry details.

3788 */

3789 int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,

3790 struct inode *dir,

3791 struct ceph_mds_request *req)

3792 {

3793 struct ceph_client *cl = mdsc->fsc->client;

3794 int err;

3795

3796 doutc(cl, "do_request on %p\n", req);

3797

3798 /* issue */

3799 err = ceph_mdsc_submit_request(mdsc, dir, req);

3800 if (!err)

3801 err = ceph_mdsc_wait_request(mdsc, req, NULL);

3802 doutc(cl, "do_request %p done, result %d\n", req, err);

3803 return err;

3804 }

3805

3806 /*

3807 * Invalidate dir's completeness, dentry lease state on an aborted MDS

3808 * namespace request.

3809 */

3810 void ceph_invalidate_dir_request(struct ceph_mds_request *req)

3811 {

3812 struct inode *dir = req->r_parent;

3813 struct inode *old_dir = req->r_old_dentry_dir;

3814 struct ceph_client *cl = req->r_mdsc->fsc->client;

3815

3816 doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n",

3817 dir, old_dir);

3818

3819 ceph_dir_clear_complete(dir);

3820 if (old_dir)

3821 ceph_dir_clear_complete(old_dir);

3822 if (req->r_dentry)

3823 ceph_invalidate_dentry_lease(req->r_dentry);

3824 if (req->r_old_dentry)

3825 ceph_invalidate_dentry_lease(req->r_old_dentry);

3826 }

3827

3828 /*

3829 * Handle mds reply.

3830 *

3831 * We take the session mutex and parse and process the reply immediately.

3832 * This preserves the logical ordering of replies, capabilities, etc., sent

3833 * by the MDS as they are applied to our local cache.

3834 */

3835 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)

3836 {

3837 struct ceph_mds_client *mdsc = session->s_mdsc;

3838 struct ceph_client *cl = mdsc->fsc->client;

3839 struct ceph_mds_request *req;

3840 struct ceph_mds_reply_head *head = msg->front.iov_base;

3841 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */

3842 struct ceph_snap_realm *realm;

3843 u64 tid;

3844 int err, result;

3845 int mds = session->s_mds;

3846 bool close_sessions = false;

3847

3848 if (msg->front.iov_len < sizeof(*head)) {

3849 pr_err_client(cl, "got corrupt (short) reply\n");

3850 ceph_msg_dump(msg);

3851 return;

3852 }

3853

3854 /* get request, session */

3855 tid = le64_to_cpu(msg->hdr.tid);

3856 mutex_lock(&mdsc->mutex);

3857 req = lookup_get_request(mdsc, tid);

3858 if (!req) {

3859 doutc(cl, "on unknown tid %llu\n", tid);

3860 mutex_unlock(&mdsc->mutex);

3861 return;

3862 }

3863 doutc(cl, "handle_reply %p\n", req);

3864

3865 /* correct session? */

3866 if (req->r_session != session) {

3867 pr_err_client(cl, "got %llu on session mds%d not mds%d\n",

3868 tid, session->s_mds,

3869 req->r_session ? req->r_session->s_mds : -1);

3870 mutex_unlock(&mdsc->mutex);

3871 goto out;

3872 }

3873

3874 /* dup? */

3875 if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||

3876 (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {

3877 pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n",

3878 head->safe ? "safe" : "unsafe", tid, mds);

3879 mutex_unlock(&mdsc->mutex);

3880 goto out;

3881 }

3882 if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {

3883 pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n",

3884 tid, mds);

3885 mutex_unlock(&mdsc->mutex);

3886 goto out;

3887 }

3888

3889 result = le32_to_cpu(head->result);

3890

3891 if (head->safe) {

3892 set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);

3893 __unregister_request(mdsc, req);

3894

3895 /* last request during umount? */

3896 if (mdsc->stopping && !__get_oldest_req(mdsc))

3897 complete_all(&mdsc->safe_umount_waiters);

3898

3899 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {

3900 /*

3901 * We already handled the unsafe response, now do the

3902 * cleanup. No need to examine the response; the MDS

3903 * doesn't include any result info in the safe

3904 * response. And even if it did, there is nothing

3905 * useful we could do with a revised return value.

3906 */

3907 doutc(cl, "got safe reply %llu, mds%d\n", tid, mds);

3908

3909 mutex_unlock(&mdsc->mutex);

3910 goto out;

3911 }

3912 } else {

3913 set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);

3914 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);

3915 }

3916

3917 doutc(cl, "tid %lld result %d\n", tid, result);

3918 if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))

3919 err = parse_reply_info(session, msg, req, (u64)-1);

3920 else

3921 err = parse_reply_info(session, msg, req,

3922 session->s_con.peer_features);

3923 mutex_unlock(&mdsc->mutex);

3924

3925 /* Must find target inode outside of mutexes to avoid deadlocks */

3926 rinfo = &req->r_reply_info;

3927 if ((err >= 0) && rinfo->head->is_target) {

3928 struct inode *in = xchg(&req->r_new_inode, NULL);

3929 struct ceph_vino tvino = {

3930 .ino = le64_to_cpu(rinfo->targeti.in->ino),

3931 .snap = le64_to_cpu(rinfo->targeti.in->snapid)

3932 };

3933

3934 /*

3935 * If we ended up opening an existing inode, discard

3936 * r_new_inode

3937 */

3938 if (req->r_op == CEPH_MDS_OP_CREATE &&

3939 !req->r_reply_info.has_create_ino) {

3940 /* This should never happen on an async create */

3941 WARN_ON_ONCE(req->r_deleg_ino);

3942 iput(in);

3943 in = NULL;

3944 }

3945

3946 in = ceph_get_inode(mdsc->fsc->sb, tvino, in);

3947 if (IS_ERR(in)) {

3948 err = PTR_ERR(in);

3949 mutex_lock(&session->s_mutex);

3950 goto out_err;

3951 }

3952 req->r_target_inode = in;

3953 }

3954

3955 mutex_lock(&session->s_mutex);

3956 if (err < 0) {

3957 pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",

3958 mds, tid);

3959 ceph_msg_dump(msg);

3960 goto out_err;

3961 }

3962

3963 /* snap trace */

3964 realm = NULL;

3965 if (rinfo->snapblob_len) {

3966 down_write(&mdsc->snap_rwsem);

3967 err = ceph_update_snap_trace(mdsc, rinfo->snapblob,

3968 rinfo->snapblob + rinfo->snapblob_len,

3969 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,

3970 &realm);

3971 if (err) {

3972 up_write(&mdsc->snap_rwsem);

3973 close_sessions = true;

3974 if (err == -EIO)

3975 ceph_msg_dump(msg);

3976 goto out_err;

3977 }

3978 downgrade_write(&mdsc->snap_rwsem);

3979 } else {

3980 down_read(&mdsc->snap_rwsem);

3981 }

3982

3983 /* insert trace into our cache */

3984 mutex_lock(&req->r_fill_mutex);

3985 current->journal_info = req;

3986 err = ceph_fill_trace(mdsc->fsc->sb, req);

3987 if (err == 0) {

3988 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||

3989 req->r_op == CEPH_MDS_OP_LSSNAP))

3990 err = ceph_readdir_prepopulate(req, req->r_session);

3991 }

3992 current->journal_info = NULL;

3993 mutex_unlock(&req->r_fill_mutex);

3994

3995 up_read(&mdsc->snap_rwsem);

3996 if (realm)

3997 ceph_put_snap_realm(mdsc, realm);

3998

3999 if (err == 0) {

4000 if (req->r_target_inode &&

4001 test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {

4002 struct ceph_inode_info *ci =

4003 ceph_inode(req->r_target_inode);

4004 spin_lock(&ci->i_unsafe_lock);

4005 list_add_tail(&req->r_unsafe_target_item,

4006 &ci->i_unsafe_iops);

4007 spin_unlock(&ci->i_unsafe_lock);

4008 }

4009

4010 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);

4011 }

4012 out_err:

4013 mutex_lock(&mdsc->mutex);

4014 if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {

4015 if (err) {

4016 req->r_err = err;

4017 } else {

4018 req->r_reply = ceph_msg_get(msg);

4019 set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);

4020 }

4021 } else {

4022 doutc(cl, "reply arrived after request %lld was aborted\n", tid);

4023 }

4024 mutex_unlock(&mdsc->mutex);

4025

4026 mutex_unlock(&session->s_mutex);

4027

4028 /* kick calling process */

4029 complete_request(mdsc, req);

4030

4031 ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,

4032 req->r_end_latency, err);

4033 out:

4034 ceph_mdsc_put_request(req);

4035

4036 /* Defer closing the sessions after s_mutex lock being released */

4037 if (close_sessions)

4038 ceph_mdsc_close_sessions(mdsc);

4039 return;

4040 }

4041

4042

4043

4044 /*

4045 * handle mds notification that our request has been forwarded.

4046 */

4047 static void handle_forward(struct ceph_mds_client *mdsc,

4048 struct ceph_mds_session *session,

4049 struct ceph_msg *msg)

4050 {

4051 struct ceph_client *cl = mdsc->fsc->client;

4052 struct ceph_mds_request *req;

4053 u64 tid = le64_to_cpu(msg->hdr.tid);

4054 u32 next_mds;

4055 u32 fwd_seq;

4056 int err = -EINVAL;

4057 void *p = msg->front.iov_base;

4058 void *end = p + msg->front.iov_len;

4059 bool aborted = false;

4060

4061 ceph_decode_need(&p, end, 2*sizeof(u32), bad);

4062 next_mds = ceph_decode_32(&p);

4063 fwd_seq = ceph_decode_32(&p);

4064

4065 mutex_lock(&mdsc->mutex);

4066 req = lookup_get_request(mdsc, tid);

4067 if (!req) {

4068 mutex_unlock(&mdsc->mutex);

4069 doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds);

4070 return; /* dup reply? */

4071 }

4072

4073 if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {

4074 doutc(cl, "forward tid %llu aborted, unregistering\n", tid);

4075 __unregister_request(mdsc, req);

4076 } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {

4077 /*

4078 * Avoid infinite retrying after overflow.

4079 *

4080 * The MDS will increase the fwd count and in client side

4081 * if the num_fwd is less than the one saved in request

4082 * that means the MDS is an old version and overflowed of

4083 * 8 bits.

4084 */

4085 mutex_lock(&req->r_fill_mutex);

4086 req->r_err = -EMULTIHOP;

4087 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);

4088 mutex_unlock(&req->r_fill_mutex);

4089 aborted = true;

4090 pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n",

4091 tid);

4092 } else {

4093 /* resend. forward race not possible; mds would drop */

4094 doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);

4095 BUG_ON(req->r_err);

4096 BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));

4097 req->r_attempts = 0;

4098 req->r_num_fwd = fwd_seq;

4099 req->r_resend_mds = next_mds;

4100 put_request_session(req);

4101 __do_request(mdsc, req);

4102 }

4103 mutex_unlock(&mdsc->mutex);

4104

4105 /* kick calling process */

4106 if (aborted)

4107 complete_request(mdsc, req);

4108 ceph_mdsc_put_request(req);

4109 return;

4110

4111 bad:

4112 pr_err_client(cl, "decode error err=%d\n", err);

4113 ceph_msg_dump(msg);

4114 }

4115

4116 static int __decode_session_metadata(void **p, void *end,

4117 bool *blocklisted)

4118 {

4119 /* map<string,string> */

4120 u32 n;

4121 bool err_str;

4122 ceph_decode_32_safe(p, end, n, bad);

4123 while (n-- > 0) {

4124 u32 len;

4125 ceph_decode_32_safe(p, end, len, bad);

4126 ceph_decode_need(p, end, len, bad);

4127 err_str = !strncmp(*p, "error_string", len);

4128 *p += len;

4129 ceph_decode_32_safe(p, end, len, bad);

4130 ceph_decode_need(p, end, len, bad);

4131 /*

4132 * Match "blocklisted (blacklisted)" from newer MDSes,

4133 * or "blacklisted" from older MDSes.

4134 */

4135 if (err_str && strnstr(*p, "blacklisted", len))

4136 *blocklisted = true;

4137 *p += len;

4138 }

4139 return 0;

4140 bad:

4141 return -1;

4142 }

4143

4144 /*

4145 * handle a mds session control message

4146 */

4147 static void handle_session(struct ceph_mds_session *session,

4148 struct ceph_msg *msg)

4149 {

4150 struct ceph_mds_client *mdsc = session->s_mdsc;

4151 struct ceph_client *cl = mdsc->fsc->client;

4152 int mds = session->s_mds;

4153 int msg_version = le16_to_cpu(msg->hdr.version);

4154 void *p = msg->front.iov_base;

4155 void *end = p + msg->front.iov_len;

4156 struct ceph_mds_session_head *h;

4157 struct ceph_mds_cap_auth *cap_auths = NULL;

4158 u32 op, cap_auths_num = 0;

4159 u64 seq, features = 0;

4160 int wake = 0;

4161 bool blocklisted = false;

4162 u32 i;

4163

4164

4165 /* decode */

4166 ceph_decode_need(&p, end, sizeof(*h), bad);

4167 h = p;

4168 p += sizeof(*h);

4169

4170 op = le32_to_cpu(h->op);

4171 seq = le64_to_cpu(h->seq);

4172

4173 if (msg_version >= 3) {

4174 u32 len;

4175 /* version >= 2 and < 5, decode metadata, skip otherwise

4176 * as it's handled via flags.

4177 */

4178 if (msg_version >= 5)

4179 ceph_decode_skip_map(&p, end, string, string, bad);

4180 else if (__decode_session_metadata(&p, end, &blocklisted) < 0)

4181 goto bad;

4182

4183 /* version >= 3, feature bits */

4184 ceph_decode_32_safe(&p, end, len, bad);

4185 if (len) {

4186 ceph_decode_64_safe(&p, end, features, bad);

4187 p += len - sizeof(features);

4188 }

4189 }

4190

4191 if (msg_version >= 5) {

4192 u32 flags, len;

4193

4194 /* version >= 4 */

4195 ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */

4196 ceph_decode_32_safe(&p, end, len, bad); /* len */

4197 ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */

4198

4199 /* version >= 5, flags */

4200 ceph_decode_32_safe(&p, end, flags, bad);

4201 if (flags & CEPH_SESSION_BLOCKLISTED) {

4202 pr_warn_client(cl, "mds%d session blocklisted\n",

4203 session->s_mds);

4204 blocklisted = true;

4205 }

4206 }

4207

4208 if (msg_version >= 6) {

4209 ceph_decode_32_safe(&p, end, cap_auths_num, bad);

4210 doutc(cl, "cap_auths_num %d\n", cap_auths_num);

4211

4212 if (cap_auths_num && op != CEPH_SESSION_OPEN) {

4213 WARN_ON_ONCE(op != CEPH_SESSION_OPEN);

4214 goto skip_cap_auths;

4215 }

4216

4217 cap_auths = kcalloc(cap_auths_num,

4218 sizeof(struct ceph_mds_cap_auth),

4219 GFP_KERNEL);

4220 if (!cap_auths) {

4221 pr_err_client(cl, "No memory for cap_auths\n");

4222 return;

4223 }

4224

4225 for (i = 0; i < cap_auths_num; i++) {

4226 u32 _len, j;

4227

4228 /* struct_v, struct_compat, and struct_len in MDSCapAuth */

4229 ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad);

4230

4231 /* struct_v, struct_compat, and struct_len in MDSCapMatch */

4232 ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad);

4233 ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad);

4234 ceph_decode_32_safe(&p, end, _len, bad);

4235 if (_len) {

4236 cap_auths[i].match.gids = kcalloc(_len, sizeof(u32),

4237 GFP_KERNEL);

4238 if (!cap_auths[i].match.gids) {

4239 pr_err_client(cl, "No memory for gids\n");

4240 goto fail;

4241 }

4242

4243 cap_auths[i].match.num_gids = _len;

4244 for (j = 0; j < _len; j++)

4245 ceph_decode_32_safe(&p, end,

4246 cap_auths[i].match.gids[j],

4247 bad);

4248 }

4249

4250 ceph_decode_32_safe(&p, end, _len, bad);

4251 if (_len) {

4252 cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char),

4253 GFP_KERNEL);

4254 if (!cap_auths[i].match.path) {

4255 pr_err_client(cl, "No memory for path\n");

4256 goto fail;

4257 }

4258 ceph_decode_copy(&p, cap_auths[i].match.path, _len);

4259

4260 /* Remove the tailing '/' */

4261 while (_len && cap_auths[i].match.path[_len - 1] == '/') {

4262 cap_auths[i].match.path[_len - 1] = '\0';

4263 _len -= 1;

4264 }

4265 }

4266

4267 ceph_decode_32_safe(&p, end, _len, bad);

4268 if (_len) {

4269 cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char),

4270 GFP_KERNEL);

4271 if (!cap_auths[i].match.fs_name) {

4272 pr_err_client(cl, "No memory for fs_name\n");

4273 goto fail;

4274 }

4275 ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len);

4276 }

4277

4278 ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad);

4279 ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad);

4280 ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad);

4281 doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n",

4282 cap_auths[i].match.uid, cap_auths[i].match.num_gids,

4283 cap_auths[i].match.path, cap_auths[i].match.fs_name,

4284 cap_auths[i].match.root_squash,

4285 cap_auths[i].readable, cap_auths[i].writeable);

4286 }

4287 }

4288

4289 skip_cap_auths:

4290 mutex_lock(&mdsc->mutex);

4291 if (op == CEPH_SESSION_OPEN) {

4292 if (mdsc->s_cap_auths) {

4293 for (i = 0; i < mdsc->s_cap_auths_num; i++) {

4294 kfree(mdsc->s_cap_auths[i].match.gids);

4295 kfree(mdsc->s_cap_auths[i].match.path);

4296 kfree(mdsc->s_cap_auths[i].match.fs_name);

4297 }

4298 kfree(mdsc->s_cap_auths);

4299 }

4300 mdsc->s_cap_auths_num = cap_auths_num;

4301 mdsc->s_cap_auths = cap_auths;

4302 }

4303 if (op == CEPH_SESSION_CLOSE) {

4304 ceph_get_mds_session(session);

4305 __unregister_session(mdsc, session);

4306 }

4307 /* FIXME: this ttl calculation is generous */

4308 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;

4309 mutex_unlock(&mdsc->mutex);

4310

4311 mutex_lock(&session->s_mutex);

4312

4313 doutc(cl, "mds%d %s %p state %s seq %llu\n", mds,

4314 ceph_session_op_name(op), session,

4315 ceph_session_state_name(session->s_state), seq);

4316

4317 if (session->s_state == CEPH_MDS_SESSION_HUNG) {

4318 session->s_state = CEPH_MDS_SESSION_OPEN;

4319 pr_info_client(cl, "mds%d came back\n", session->s_mds);

4320 }

4321

4322 switch (op) {

4323 case CEPH_SESSION_OPEN:

4324 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)

4325 pr_info_client(cl, "mds%d reconnect success\n",

4326 session->s_mds);

4327

4328 session->s_features = features;

4329 if (session->s_state == CEPH_MDS_SESSION_OPEN) {

4330 pr_notice_client(cl, "mds%d is already opened\n",

4331 session->s_mds);

4332 } else {

4333 session->s_state = CEPH_MDS_SESSION_OPEN;

4334 renewed_caps(mdsc, session, 0);

4335 if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,

4336 &session->s_features))

4337 metric_schedule_delayed(&mdsc->metric);

4338 }

4339

4340 /*

4341 * The connection maybe broken and the session in client

4342 * side has been reinitialized, need to update the seq

4343 * anyway.

4344 */

4345 if (!session->s_seq && seq)

4346 session->s_seq = seq;

4347

4348 wake = 1;

4349 if (mdsc->stopping)

4350 __close_session(mdsc, session);

4351 break;

4352

4353 case CEPH_SESSION_RENEWCAPS:

4354 if (session->s_renew_seq == seq)

4355 renewed_caps(mdsc, session, 1);

4356 break;

4357

4358 case CEPH_SESSION_CLOSE:

4359 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)

4360 pr_info_client(cl, "mds%d reconnect denied\n",

4361 session->s_mds);

4362 session->s_state = CEPH_MDS_SESSION_CLOSED;

4363 cleanup_session_requests(mdsc, session);

4364 remove_session_caps(session);

4365 wake = 2; /* for good measure */

4366 wake_up_all(&mdsc->session_close_wq);

4367 break;

4368

4369 case CEPH_SESSION_STALE:

4370 pr_info_client(cl, "mds%d caps went stale, renewing\n",

4371 session->s_mds);

4372 atomic_inc(&session->s_cap_gen);

4373 session->s_cap_ttl = jiffies - 1;

4374 send_renew_caps(mdsc, session);

4375 break;

4376

4377 case CEPH_SESSION_RECALL_STATE:

4378 ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));

4379 break;

4380

4381 case CEPH_SESSION_FLUSHMSG:

4382 /* flush cap releases */

4383 spin_lock(&session->s_cap_lock);

4384 if (session->s_num_cap_releases)

4385 ceph_flush_session_cap_releases(mdsc, session);

4386 spin_unlock(&session->s_cap_lock);

4387

4388 send_flushmsg_ack(mdsc, session, seq);

4389 break;

4390

4391 case CEPH_SESSION_FORCE_RO:

4392 doutc(cl, "force_session_readonly %p\n", session);

4393 spin_lock(&session->s_cap_lock);

4394 session->s_readonly = true;

4395 spin_unlock(&session->s_cap_lock);

4396 wake_up_session_caps(session, FORCE_RO);

4397 break;

4398

4399 case CEPH_SESSION_REJECT:

4400 WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);

4401 pr_info_client(cl, "mds%d rejected session\n",

4402 session->s_mds);

4403 session->s_state = CEPH_MDS_SESSION_REJECTED;

4404 cleanup_session_requests(mdsc, session);

4405 remove_session_caps(session);

4406 if (blocklisted)

4407 mdsc->fsc->blocklisted = true;

4408 wake = 2; /* for good measure */

4409 break;

4410

4411 default:

4412 pr_err_client(cl, "bad op %d mds%d\n", op, mds);

4413 WARN_ON(1);

4414 }

4415

4416 mutex_unlock(&session->s_mutex);

4417 if (wake) {

4418 mutex_lock(&mdsc->mutex);

4419 __wake_requests(mdsc, &session->s_waiting);

4420 if (wake == 2)

4421 kick_requests(mdsc, mds);

4422 mutex_unlock(&mdsc->mutex);

4423 }

4424 if (op == CEPH_SESSION_CLOSE)

4425 ceph_put_mds_session(session);

4426 return;

4427

4428 bad:

4429 pr_err_client(cl, "corrupt message mds%d len %d\n", mds,

4430 (int)msg->front.iov_len);

4431 ceph_msg_dump(msg);

4432 fail:

4433 for (i = 0; i < cap_auths_num; i++) {

4434 kfree(cap_auths[i].match.gids);

4435 kfree(cap_auths[i].match.path);

4436 kfree(cap_auths[i].match.fs_name);

4437 }

4438 kfree(cap_auths);

4439 return;

4440 }

4441

4442 void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)

4443 {

4444 struct ceph_client *cl = req->r_mdsc->fsc->client;

4445 int dcaps;

4446

4447 dcaps = xchg(&req->r_dir_caps, 0);

4448 if (dcaps) {

4449 doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));

4450 ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);

4451 }

4452 }

4453

4454 void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req)

4455 {

4456 struct ceph_client *cl = req->r_mdsc->fsc->client;

4457 int dcaps;

4458

4459 dcaps = xchg(&req->r_dir_caps, 0);

4460 if (dcaps) {

4461 doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));

4462 ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps);

4463 }

4464 }

4465

4466 /*

4467 * called under session->mutex.

4468 */

4469 static void replay_unsafe_requests(struct ceph_mds_client *mdsc,

4470 struct ceph_mds_session *session)

4471 {

4472 struct ceph_mds_request *req, *nreq;

4473 struct rb_node *p;

4474

4475 doutc(mdsc->fsc->client, "mds%d\n", session->s_mds);

4476

4477 mutex_lock(&mdsc->mutex);

4478 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)

4479 __send_request(session, req, true);

4480

4481 /*

4482 * also re-send old requests when MDS enters reconnect stage. So that MDS

4483 * can process completed request in clientreplay stage.

4484 */

4485 p = rb_first(&mdsc->request_tree);

4486 while (p) {

4487 req = rb_entry(p, struct ceph_mds_request, r_node);

4488 p = rb_next(p);

4489 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))

4490 continue;

4491 if (req->r_attempts == 0)

4492 continue; /* only old requests */

4493 if (!req->r_session)

4494 continue;

4495 if (req->r_session->s_mds != session->s_mds)

4496 continue;

4497

4498 ceph_mdsc_release_dir_caps_async(req);

4499

4500 __send_request(session, req, true);

4501 }

4502 mutex_unlock(&mdsc->mutex);

4503 }

4504

4505 static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)

4506 {

4507 struct ceph_msg *reply;

4508 struct ceph_pagelist *_pagelist;

4509 struct page *page;

4510 __le32 *addr;

4511 int err = -ENOMEM;

4512

4513 if (!recon_state->allow_multi)

4514 return -ENOSPC;

4515

4516 /* can't handle message that contains both caps and realm */

4517 BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);

4518

4519 /* pre-allocate new pagelist */

4520 _pagelist = ceph_pagelist_alloc(GFP_NOFS);

4521 if (!_pagelist)

4522 return -ENOMEM;

4523

4524 reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);

4525 if (!reply)

4526 goto fail_msg;

4527

4528 /* placeholder for nr_caps */

4529 err = ceph_pagelist_encode_32(_pagelist, 0);

4530 if (err < 0)

4531 goto fail;

4532

4533 if (recon_state->nr_caps) {

4534 /* currently encoding caps */

4535 err = ceph_pagelist_encode_32(recon_state->pagelist, 0);

4536 if (err)

4537 goto fail;

4538 } else {

4539 /* placeholder for nr_realms (currently encoding relams) */

4540 err = ceph_pagelist_encode_32(_pagelist, 0);

4541 if (err < 0)

4542 goto fail;

4543 }

4544

4545 err = ceph_pagelist_encode_8(recon_state->pagelist, 1);

4546 if (err)

4547 goto fail;

4548

4549 page = list_first_entry(&recon_state->pagelist->head, struct page, lru);

4550 addr = kmap_atomic(page);

4551 if (recon_state->nr_caps) {

4552 /* currently encoding caps */

4553 *addr = cpu_to_le32(recon_state->nr_caps);

4554 } else {

4555 /* currently encoding relams */

4556 *(addr + 1) = cpu_to_le32(recon_state->nr_realms);

4557 }

4558 kunmap_atomic(addr);

4559

4560 reply->hdr.version = cpu_to_le16(5);

4561 reply->hdr.compat_version = cpu_to_le16(4);

4562

4563 reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);

4564 ceph_msg_data_add_pagelist(reply, recon_state->pagelist);

4565

4566 ceph_con_send(&recon_state->session->s_con, reply);

4567 ceph_pagelist_release(recon_state->pagelist);

4568

4569 recon_state->pagelist = _pagelist;

4570 recon_state->nr_caps = 0;

4571 recon_state->nr_realms = 0;

4572 recon_state->msg_version = 5;

4573 return 0;

4574 fail:

4575 ceph_msg_put(reply);

4576 fail_msg:

4577 ceph_pagelist_release(_pagelist);

4578 return err;

4579 }

4580

4581 static struct dentry* d_find_primary(struct inode *inode)

4582 {

4583 struct dentry *alias, *dn = NULL;

4584

4585 if (hlist_empty(&inode->i_dentry))

4586 return NULL;

4587

4588 spin_lock(&inode->i_lock);

4589 if (hlist_empty(&inode->i_dentry))

4590 goto out_unlock;

4591

4592 if (S_ISDIR(inode->i_mode)) {

4593 alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);

4594 if (!IS_ROOT(alias))

4595 dn = dget(alias);

4596 goto out_unlock;

4597 }

4598

4599 hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {

4600 spin_lock(&alias->d_lock);

4601 if (!d_unhashed(alias) &&

4602 (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {

4603 dn = dget_dlock(alias);

4604 }

4605 spin_unlock(&alias->d_lock);

4606 if (dn)

4607 break;

4608 }

4609 out_unlock:

4610 spin_unlock(&inode->i_lock);

4611 return dn;

4612 }

4613

4614 /*

4615 * Encode information about a cap for a reconnect with the MDS.

4616 */

4617 static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)

4618 {

4619 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);

4620 struct ceph_client *cl = ceph_inode_to_client(inode);

4621 union {

4622 struct ceph_mds_cap_reconnect v2;

4623 struct ceph_mds_cap_reconnect_v1 v1;

4624 } rec;

4625 struct ceph_inode_info *ci = ceph_inode(inode);

4626 struct ceph_reconnect_state *recon_state = arg;

4627 struct ceph_pagelist *pagelist = recon_state->pagelist;

4628 struct dentry *dentry;

4629 struct ceph_cap *cap;

4630 struct ceph_path_info path_info = {0};

4631 int err;

4632 u64 snap_follows;

4633

4634 dentry = d_find_primary(inode);

4635 if (dentry) {

4636 /* set pathbase to parent dir when msg_version >= 2 */

4637 char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,

4638 recon_state->msg_version >= 2);

4639 dput(dentry);

4640 if (IS_ERR(path)) {

4641 err = PTR_ERR(path);

4642 goto out_err;

4643 }

4644 }

4645

4646 spin_lock(&ci->i_ceph_lock);

4647 cap = __get_cap_for_mds(ci, mds);

4648 if (!cap) {

4649 spin_unlock(&ci->i_ceph_lock);

4650 err = 0;

4651 goto out_err;

4652 }

4653 doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode,

4654 ceph_vinop(inode), cap, cap->cap_id,

4655 ceph_cap_string(cap->issued));

4656

4657 cap->seq = 0; /* reset cap seq */

4658 cap->issue_seq = 0; /* and issue_seq */

4659 cap->mseq = 0; /* and migrate_seq */

4660 cap->cap_gen = atomic_read(&cap->session->s_cap_gen);

4661

4662 /* These are lost when the session goes away */

4663 if (S_ISDIR(inode->i_mode)) {

4664 if (cap->issued & CEPH_CAP_DIR_CREATE) {

4665 ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));

4666 memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));

4667 }

4668 cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;

4669 }

4670

4671 if (recon_state->msg_version >= 2) {

4672 rec.v2.cap_id = cpu_to_le64(cap->cap_id);

4673 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));

4674 rec.v2.issued = cpu_to_le32(cap->issued);

4675 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);

4676 rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);

4677 rec.v2.flock_len = (__force __le32)

4678 ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);

4679 } else {

4680 struct timespec64 ts;

4681

4682 rec.v1.cap_id = cpu_to_le64(cap->cap_id);

4683 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));

4684 rec.v1.issued = cpu_to_le32(cap->issued);

4685 rec.v1.size = cpu_to_le64(i_size_read(inode));

4686 ts = inode_get_mtime(inode);

4687 ceph_encode_timespec64(&rec.v1.mtime, &ts);

4688 ts = inode_get_atime(inode);

4689 ceph_encode_timespec64(&rec.v1.atime, &ts);

4690 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);

4691 rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);

4692 }

4693

4694 if (list_empty(&ci->i_cap_snaps)) {

4695 snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;

4696 } else {

4697 struct ceph_cap_snap *capsnap =

4698 list_first_entry(&ci->i_cap_snaps,

4699 struct ceph_cap_snap, ci_item);

4700 snap_follows = capsnap->follows;

4701 }

4702 spin_unlock(&ci->i_ceph_lock);

4703

4704 if (recon_state->msg_version >= 2) {

4705 int num_fcntl_locks, num_flock_locks;

4706 struct ceph_filelock *flocks = NULL;

4707 size_t struct_len, total_len = sizeof(u64);

4708 u8 struct_v = 0;

4709

4710 encode_again:

4711 if (rec.v2.flock_len) {

4712 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);

4713 } else {

4714 num_fcntl_locks = 0;

4715 num_flock_locks = 0;

4716 }

4717 if (num_fcntl_locks + num_flock_locks > 0) {

4718 flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,

4719 sizeof(struct ceph_filelock),

4720 GFP_NOFS);

4721 if (!flocks) {

4722 err = -ENOMEM;

4723 goto out_err;

4724 }

4725 err = ceph_encode_locks_to_buffer(inode, flocks,

4726 num_fcntl_locks,

4727 num_flock_locks);

4728 if (err) {

4729 kfree(flocks);

4730 flocks = NULL;

4731 if (err == -ENOSPC)

4732 goto encode_again;

4733 goto out_err;

4734 }

4735 } else {

4736 kfree(flocks);

4737 flocks = NULL;

4738 }

4739

4740 if (recon_state->msg_version >= 3) {

4741 /* version, compat_version and struct_len */

4742 total_len += 2 * sizeof(u8) + sizeof(u32);

4743 struct_v = 2;

4744 }

4745 /*

4746 * number of encoded locks is stable, so copy to pagelist

4747 */

4748 struct_len = 2 * sizeof(u32) +

4749 (num_fcntl_locks + num_flock_locks) *

4750 sizeof(struct ceph_filelock);

4751 rec.v2.flock_len = cpu_to_le32(struct_len);

4752

4753 struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);

4754

4755 if (struct_v >= 2)

4756 struct_len += sizeof(u64); /* snap_follows */

4757

4758 total_len += struct_len;

4759

4760 if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {

4761 err = send_reconnect_partial(recon_state);

4762 if (err)

4763 goto out_freeflocks;

4764 pagelist = recon_state->pagelist;

4765 }

4766

4767 err = ceph_pagelist_reserve(pagelist, total_len);

4768 if (err)

4769 goto out_freeflocks;

4770

4771 ceph_pagelist_encode_64(pagelist, ceph_ino(inode));

4772 if (recon_state->msg_version >= 3) {

4773 ceph_pagelist_encode_8(pagelist, struct_v);

4774 ceph_pagelist_encode_8(pagelist, 1);

4775 ceph_pagelist_encode_32(pagelist, struct_len);

4776 }

4777 ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);

4778 ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));

4779 ceph_locks_to_pagelist(flocks, pagelist,

4780 num_fcntl_locks, num_flock_locks);

4781 if (struct_v >= 2)

4782 ceph_pagelist_encode_64(pagelist, snap_follows);

4783 out_freeflocks:

4784 kfree(flocks);

4785 } else {

4786 err = ceph_pagelist_reserve(pagelist,

4787 sizeof(u64) + sizeof(u32) +

4788 path_info.pathlen + sizeof(rec.v1));

4789 if (err)

4790 goto out_err;

4791

4792 ceph_pagelist_encode_64(pagelist, ceph_ino(inode));

4793 ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);

4794 ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));

4795 }

4796

4797 out_err:

4798 ceph_mdsc_free_path_info(&path_info);

4799 if (!err)

4800 recon_state->nr_caps++;

4801 return err;

4802 }

4803

4804 static int encode_snap_realms(struct ceph_mds_client *mdsc,

4805 struct ceph_reconnect_state *recon_state)

4806 {

4807 struct rb_node *p;

4808 struct ceph_pagelist *pagelist = recon_state->pagelist;

4809 struct ceph_client *cl = mdsc->fsc->client;

4810 int err = 0;

4811

4812 if (recon_state->msg_version >= 4) {

4813 err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);

4814 if (err < 0)

4815 goto fail;

4816 }

4817

4818 /*

4819 * snaprealms. we provide mds with the ino, seq (version), and

4820 * parent for all of our realms. If the mds has any newer info,

4821 * it will tell us.

4822 */

4823 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {

4824 struct ceph_snap_realm *realm =

4825 rb_entry(p, struct ceph_snap_realm, node);

4826 struct ceph_mds_snaprealm_reconnect sr_rec;

4827

4828 if (recon_state->msg_version >= 4) {

4829 size_t need = sizeof(u8) * 2 + sizeof(u32) +

4830 sizeof(sr_rec);

4831

4832 if (pagelist->length + need > RECONNECT_MAX_SIZE) {

4833 err = send_reconnect_partial(recon_state);

4834 if (err)

4835 goto fail;

4836 pagelist = recon_state->pagelist;

4837 }

4838

4839 err = ceph_pagelist_reserve(pagelist, need);

4840 if (err)

4841 goto fail;

4842

4843 ceph_pagelist_encode_8(pagelist, 1);

4844 ceph_pagelist_encode_8(pagelist, 1);

4845 ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));

4846 }

4847

4848 doutc(cl, " adding snap realm %llx seq %lld parent %llx\n",

4849 realm->ino, realm->seq, realm->parent_ino);

4850 sr_rec.ino = cpu_to_le64(realm->ino);

4851 sr_rec.seq = cpu_to_le64(realm->seq);

4852 sr_rec.parent = cpu_to_le64(realm->parent_ino);

4853

4854 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));

4855 if (err)

4856 goto fail;

4857

4858 recon_state->nr_realms++;

4859 }

4860 fail:

4861 return err;

4862 }

4863

4864

4865 /*

4866 * If an MDS fails and recovers, clients need to reconnect in order to

4867 * reestablish shared state. This includes all caps issued through

4868 * this session _and_ the snap_realm hierarchy. Because it's not

4869 * clear which snap realms the mds cares about, we send everything we

4870 * know about.. that ensures we'll then get any new info the

4871 * recovering MDS might have.

4872 *

4873 * This is a relatively heavyweight operation, but it's rare.

4874 */

4875 static void send_mds_reconnect(struct ceph_mds_client *mdsc,

4876 struct ceph_mds_session *session)

4877 {

4878 struct ceph_client *cl = mdsc->fsc->client;

4879 struct ceph_msg *reply;

4880 int mds = session->s_mds;

4881 int err = -ENOMEM;

4882 struct ceph_reconnect_state recon_state = {

4883 .session = session,

4884 };

4885 LIST_HEAD(dispose);

4886

4887 pr_info_client(cl, "mds%d reconnect start\n", mds);

4888

4889 recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);

4890 if (!recon_state.pagelist)

4891 goto fail_nopagelist;

4892

4893 reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);

4894 if (!reply)

4895 goto fail_nomsg;

4896

4897 xa_destroy(&session->s_delegated_inos);

4898

4899 mutex_lock(&session->s_mutex);

4900 session->s_state = CEPH_MDS_SESSION_RECONNECTING;

4901 session->s_seq = 0;

4902

4903 doutc(cl, "session %p state %s\n", session,

4904 ceph_session_state_name(session->s_state));

4905

4906 atomic_inc(&session->s_cap_gen);

4907

4908 spin_lock(&session->s_cap_lock);

4909 /* don't know if session is readonly */

4910 session->s_readonly = 0;

4911 /*

4912 * notify __ceph_remove_cap() that we are composing cap reconnect.

4913 * If a cap get released before being added to the cap reconnect,

4914 * __ceph_remove_cap() should skip queuing cap release.

4915 */

4916 session->s_cap_reconnect = 1;

4917 /* drop old cap expires; we're about to reestablish that state */

4918 detach_cap_releases(session, &dispose);

4919 spin_unlock(&session->s_cap_lock);

4920 dispose_cap_releases(mdsc, &dispose);

4921

4922 /* trim unused caps to reduce MDS's cache rejoin time */

4923 if (mdsc->fsc->sb->s_root)

4924 shrink_dcache_parent(mdsc->fsc->sb->s_root);

4925

4926 ceph_con_close(&session->s_con);

4927 ceph_con_open(&session->s_con,

4928 CEPH_ENTITY_TYPE_MDS, mds,

4929 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));

4930

4931 /* replay unsafe requests */

4932 replay_unsafe_requests(mdsc, session);

4933

4934 ceph_early_kick_flushing_caps(mdsc, session);

4935

4936 down_read(&mdsc->snap_rwsem);

4937

4938 /* placeholder for nr_caps */

4939 err = ceph_pagelist_encode_32(recon_state.pagelist, 0);

4940 if (err)

4941 goto fail;

4942

4943 if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {

4944 recon_state.msg_version = 3;

4945 recon_state.allow_multi = true;

4946 } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {

4947 recon_state.msg_version = 3;

4948 } else {

4949 recon_state.msg_version = 2;

4950 }

4951 /* traverse this session's caps */

4952 err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);

4953

4954 spin_lock(&session->s_cap_lock);

4955 session->s_cap_reconnect = 0;

4956 spin_unlock(&session->s_cap_lock);

4957

4958 if (err < 0)

4959 goto fail;

4960

4961 /* check if all realms can be encoded into current message */

4962 if (mdsc->num_snap_realms) {

4963 size_t total_len =

4964 recon_state.pagelist->length +

4965 mdsc->num_snap_realms *

4966 sizeof(struct ceph_mds_snaprealm_reconnect);

4967 if (recon_state.msg_version >= 4) {

4968 /* number of realms */

4969 total_len += sizeof(u32);

4970 /* version, compat_version and struct_len */

4971 total_len += mdsc->num_snap_realms *

4972 (2 * sizeof(u8) + sizeof(u32));

4973 }

4974 if (total_len > RECONNECT_MAX_SIZE) {

4975 if (!recon_state.allow_multi) {

4976 err = -ENOSPC;

4977 goto fail;

4978 }

4979 if (recon_state.nr_caps) {

4980 err = send_reconnect_partial(&recon_state);

4981 if (err)

4982 goto fail;

4983 }

4984 recon_state.msg_version = 5;

4985 }

4986 }

4987

4988 err = encode_snap_realms(mdsc, &recon_state);

4989 if (err < 0)

4990 goto fail;

4991

4992 if (recon_state.msg_version >= 5) {

4993 err = ceph_pagelist_encode_8(recon_state.pagelist, 0);

4994 if (err < 0)

4995 goto fail;

4996 }

4997

4998 if (recon_state.nr_caps || recon_state.nr_realms) {

4999 struct page *page =

5000 list_first_entry(&recon_state.pagelist->head,

5001 struct page, lru);

5002 __le32 *addr = kmap_atomic(page);

5003 if (recon_state.nr_caps) {

5004 WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);

5005 *addr = cpu_to_le32(recon_state.nr_caps);

5006 } else if (recon_state.msg_version >= 4) {

5007 *(addr + 1) = cpu_to_le32(recon_state.nr_realms);

5008 }

5009 kunmap_atomic(addr);

5010 }

5011

5012 reply->hdr.version = cpu_to_le16(recon_state.msg_version);

5013 if (recon_state.msg_version >= 4)

5014 reply->hdr.compat_version = cpu_to_le16(4);

5015

5016 reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);

5017 ceph_msg_data_add_pagelist(reply, recon_state.pagelist);

5018

5019 ceph_con_send(&session->s_con, reply);

5020

5021 mutex_unlock(&session->s_mutex);

5022

5023 mutex_lock(&mdsc->mutex);

5024 __wake_requests(mdsc, &session->s_waiting);

5025 mutex_unlock(&mdsc->mutex);

5026

5027 up_read(&mdsc->snap_rwsem);

5028 ceph_pagelist_release(recon_state.pagelist);

5029 return;

5030

5031 fail:

5032 ceph_msg_put(reply);

5033 up_read(&mdsc->snap_rwsem);

5034 mutex_unlock(&session->s_mutex);

5035 fail_nomsg:

5036 ceph_pagelist_release(recon_state.pagelist);

5037 fail_nopagelist:

5038 pr_err_client(cl, "error %d preparing reconnect for mds%d\n",

5039 err, mds);

5040 return;

5041 }

5042

5043

5044 /*

5045 * compare old and new mdsmaps, kicking requests

5046 * and closing out old connections as necessary

5047 *

5048 * called under mdsc->mutex.

5049 */

5050 static void check_new_map(struct ceph_mds_client *mdsc,

5051 struct ceph_mdsmap *newmap,

5052 struct ceph_mdsmap *oldmap)

5053 {

5054 int i, j, err;

5055 int oldstate, newstate;

5056 struct ceph_mds_session *s;

5057 unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};

5058 struct ceph_client *cl = mdsc->fsc->client;

5059

5060 doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch);

5061

5062 if (newmap->m_info) {

5063 for (i = 0; i < newmap->possible_max_rank; i++) {

5064 for (j = 0; j < newmap->m_info[i].num_export_targets; j++)

5065 set_bit(newmap->m_info[i].export_targets[j], targets);

5066 }

5067 }

5068

5069 for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {

5070 if (!mdsc->sessions[i])

5071 continue;

5072 s = mdsc->sessions[i];

5073 oldstate = ceph_mdsmap_get_state(oldmap, i);

5074 newstate = ceph_mdsmap_get_state(newmap, i);

5075

5076 doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n",

5077 i, ceph_mds_state_name(oldstate),

5078 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",

5079 ceph_mds_state_name(newstate),

5080 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",

5081 ceph_session_state_name(s->s_state));

5082

5083 if (i >= newmap->possible_max_rank) {

5084 /* force close session for stopped mds */

5085 ceph_get_mds_session(s);

5086 __unregister_session(mdsc, s);

5087 __wake_requests(mdsc, &s->s_waiting);

5088 mutex_unlock(&mdsc->mutex);

5089

5090 mutex_lock(&s->s_mutex);

5091 cleanup_session_requests(mdsc, s);

5092 remove_session_caps(s);

5093 mutex_unlock(&s->s_mutex);

5094

5095 ceph_put_mds_session(s);

5096

5097 mutex_lock(&mdsc->mutex);

5098 kick_requests(mdsc, i);

5099 continue;

5100 }

5101

5102 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),

5103 ceph_mdsmap_get_addr(newmap, i),

5104 sizeof(struct ceph_entity_addr))) {

5105 /* just close it */

5106 mutex_unlock(&mdsc->mutex);

5107 mutex_lock(&s->s_mutex);

5108 mutex_lock(&mdsc->mutex);

5109 ceph_con_close(&s->s_con);

5110 mutex_unlock(&s->s_mutex);

5111 s->s_state = CEPH_MDS_SESSION_RESTARTING;

5112 } else if (oldstate == newstate) {

5113 continue; /* nothing new with this mds */

5114 }

5115

5116 /*

5117 * send reconnect?

5118 */

5119 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&

5120 newstate >= CEPH_MDS_STATE_RECONNECT) {

5121 mutex_unlock(&mdsc->mutex);

5122 clear_bit(i, targets);

5123 send_mds_reconnect(mdsc, s);

5124 mutex_lock(&mdsc->mutex);

5125 }

5126

5127 /*

5128 * kick request on any mds that has gone active.

5129 */

5130 if (oldstate < CEPH_MDS_STATE_ACTIVE &&

5131 newstate >= CEPH_MDS_STATE_ACTIVE) {

5132 if (oldstate != CEPH_MDS_STATE_CREATING &&

5133 oldstate != CEPH_MDS_STATE_STARTING)

5134 pr_info_client(cl, "mds%d recovery completed\n",

5135 s->s_mds);

5136 kick_requests(mdsc, i);

5137 mutex_unlock(&mdsc->mutex);

5138 mutex_lock(&s->s_mutex);

5139 mutex_lock(&mdsc->mutex);

5140 ceph_kick_flushing_caps(mdsc, s);

5141 mutex_unlock(&s->s_mutex);

5142 wake_up_session_caps(s, RECONNECT);

5143 }

5144 }

5145

5146 /*

5147 * Only open and reconnect sessions that don't exist yet.

5148 */

5149 for (i = 0; i < newmap->possible_max_rank; i++) {

5150 /*

5151 * In case the import MDS is crashed just after

5152 * the EImportStart journal is flushed, so when

5153 * a standby MDS takes over it and is replaying

5154 * the EImportStart journal the new MDS daemon

5155 * will wait the client to reconnect it, but the

5156 * client may never register/open the session yet.

5157 *

5158 * Will try to reconnect that MDS daemon if the

5159 * rank number is in the export targets array and

5160 * is the up:reconnect state.

5161 */

5162 newstate = ceph_mdsmap_get_state(newmap, i);

5163 if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)

5164 continue;

5165

5166 /*

5167 * The session maybe registered and opened by some

5168 * requests which were choosing random MDSes during

5169 * the mdsc->mutex's unlock/lock gap below in rare

5170 * case. But the related MDS daemon will just queue

5171 * that requests and be still waiting for the client's

5172 * reconnection request in up:reconnect state.

5173 */

5174 s = __ceph_lookup_mds_session(mdsc, i);

5175 if (likely(!s)) {

5176 s = __open_export_target_session(mdsc, i);

5177 if (IS_ERR(s)) {

5178 err = PTR_ERR(s);

5179 pr_err_client(cl,

5180 "failed to open export target session, err %d\n",

5181 err);

5182 continue;

5183 }

5184 }

5185 doutc(cl, "send reconnect to export target mds.%d\n", i);

5186 mutex_unlock(&mdsc->mutex);

5187 send_mds_reconnect(mdsc, s);

5188 ceph_put_mds_session(s);

5189 mutex_lock(&mdsc->mutex);

5190 }

5191

5192 for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {

5193 s = mdsc->sessions[i];

5194 if (!s)

5195 continue;

5196 if (!ceph_mdsmap_is_laggy(newmap, i))

5197 continue;

5198 if (s->s_state == CEPH_MDS_SESSION_OPEN ||

5199 s->s_state == CEPH_MDS_SESSION_HUNG ||

5200 s->s_state == CEPH_MDS_SESSION_CLOSING) {

5201 doutc(cl, " connecting to export targets of laggy mds%d\n", i);

5202 __open_export_target_sessions(mdsc, s);

5203 }

5204 }

5205 }

5206

5207

5208

5209 /*

5210 * leases

5211 */

5212

5213 /*

5214 * caller must hold session s_mutex, dentry->d_lock

5215 */

5216 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)

5217 {

5218 struct ceph_dentry_info *di = ceph_dentry(dentry);

5219

5220 ceph_put_mds_session(di->lease_session);

5221 di->lease_session = NULL;

5222 }

5223

5224 static void handle_lease(struct ceph_mds_client *mdsc,

5225 struct ceph_mds_session *session,

5226 struct ceph_msg *msg)

5227 {

5228 struct ceph_client *cl = mdsc->fsc->client;

5229 struct super_block *sb = mdsc->fsc->sb;

5230 struct inode *inode;

5231 struct dentry *parent, *dentry;

5232 struct ceph_dentry_info *di;

5233 int mds = session->s_mds;

5234 struct ceph_mds_lease *h = msg->front.iov_base;

5235 u32 seq;

5236 struct ceph_vino vino;

5237 struct qstr dname;

5238 int release = 0;

5239

5240 doutc(cl, "from mds%d\n", mds);

5241

5242 if (!ceph_inc_mds_stopping_blocker(mdsc, session))

5243 return;

5244

5245 /* decode */

5246 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))

5247 goto bad;

5248 vino.ino = le64_to_cpu(h->ino);

5249 vino.snap = CEPH_NOSNAP;

5250 seq = le32_to_cpu(h->seq);

5251 dname.len = get_unaligned_le32(h + 1);

5252 if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)

5253 goto bad;

5254 dname.name = (void *)(h + 1) + sizeof(u32);

5255

5256 /* lookup inode */

5257 inode = ceph_find_inode(sb, vino);

5258 doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action),

5259 vino.ino, inode, dname.len, dname.name);

5260

5261 mutex_lock(&session->s_mutex);

5262 if (!inode) {

5263 doutc(cl, "no inode %llx\n", vino.ino);

5264 goto release;

5265 }

5266

5267 /* dentry */

5268 parent = d_find_alias(inode);

5269 if (!parent) {

5270 doutc(cl, "no parent dentry on inode %p\n", inode);

5271 WARN_ON(1);

5272 goto release; /* hrm... */

5273 }

5274 dname.hash = full_name_hash(parent, dname.name, dname.len);

5275 dentry = d_lookup(parent, &dname);

5276 dput(parent);

5277 if (!dentry)

5278 goto release;

5279

5280 spin_lock(&dentry->d_lock);

5281 di = ceph_dentry(dentry);

5282 switch (h->action) {

5283 case CEPH_MDS_LEASE_REVOKE:

5284 if (di->lease_session == session) {

5285 if (ceph_seq_cmp(di->lease_seq, seq) > 0)

5286 h->seq = cpu_to_le32(di->lease_seq);

5287 __ceph_mdsc_drop_dentry_lease(dentry);

5288 }

5289 release = 1;

5290 break;

5291

5292 case CEPH_MDS_LEASE_RENEW:

5293 if (di->lease_session == session &&

5294 di->lease_gen == atomic_read(&session->s_cap_gen) &&

5295 di->lease_renew_from &&

5296 di->lease_renew_after == 0) {

5297 unsigned long duration =

5298 msecs_to_jiffies(le32_to_cpu(h->duration_ms));

5299

5300 di->lease_seq = seq;

5301 di->time = di->lease_renew_from + duration;

5302 di->lease_renew_after = di->lease_renew_from +

5303 (duration >> 1);

5304 di->lease_renew_from = 0;

5305 }

5306 break;

5307 }

5308 spin_unlock(&dentry->d_lock);

5309 dput(dentry);

5310

5311 if (!release)

5312 goto out;

5313

5314 release:

5315 /* let's just reuse the same message */

5316 h->action = CEPH_MDS_LEASE_REVOKE_ACK;

5317 ceph_msg_get(msg);

5318 ceph_con_send(&session->s_con, msg);

5319

5320 out:

5321 mutex_unlock(&session->s_mutex);

5322 iput(inode);

5323

5324 ceph_dec_mds_stopping_blocker(mdsc);

5325 return;

5326

5327 bad:

5328 ceph_dec_mds_stopping_blocker(mdsc);

5329

5330 pr_err_client(cl, "corrupt lease message\n");

5331 ceph_msg_dump(msg);

5332 }

5333

5334 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,

5335 struct dentry *dentry, char action,

5336 u32 seq)

5337 {

5338 struct ceph_client *cl = session->s_mdsc->fsc->client;

5339 struct ceph_msg *msg;

5340 struct ceph_mds_lease *lease;

5341 struct inode *dir;

5342 int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;

5343

5344 doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action),

5345 session->s_mds);

5346

5347 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);

5348 if (!msg)

5349 return;

5350 lease = msg->front.iov_base;

5351 lease->action = action;

5352 lease->seq = cpu_to_le32(seq);

5353

5354 spin_lock(&dentry->d_lock);

5355 dir = d_inode(dentry->d_parent);

5356 lease->ino = cpu_to_le64(ceph_ino(dir));

5357 lease->first = lease->last = cpu_to_le64(ceph_snap(dir));

5358

5359 put_unaligned_le32(dentry->d_name.len, lease + 1);

5360 memcpy((void *)(lease + 1) + 4,

5361 dentry->d_name.name, dentry->d_name.len);

5362 spin_unlock(&dentry->d_lock);

5363

5364 ceph_con_send(&session->s_con, msg);

5365 }

5366

5367 /*

5368 * lock unlock the session, to wait ongoing session activities

5369 */

5370 static void lock_unlock_session(struct ceph_mds_session *s)

5371 {

5372 mutex_lock(&s->s_mutex);

5373 mutex_unlock(&s->s_mutex);

5374 }

5375

5376 static void maybe_recover_session(struct ceph_mds_client *mdsc)

5377 {

5378 struct ceph_client *cl = mdsc->fsc->client;

5379 struct ceph_fs_client *fsc = mdsc->fsc;

5380

5381 if (!ceph_test_mount_opt(fsc, CLEANRECOVER))

5382 return;

5383

5384 if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)

5385 return;

5386

5387 if (!READ_ONCE(fsc->blocklisted))

5388 return;

5389

5390 pr_info_client(cl, "auto reconnect after blocklisted\n");

5391 ceph_force_reconnect(fsc->sb);

5392 }

5393

5394 bool check_session_state(struct ceph_mds_session *s)

5395 {

5396 struct ceph_client *cl = s->s_mdsc->fsc->client;

5397

5398 switch (s->s_state) {

5399 case CEPH_MDS_SESSION_OPEN:

5400 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {

5401 s->s_state = CEPH_MDS_SESSION_HUNG;

5402 pr_info_client(cl, "mds%d hung\n", s->s_mds);

5403 }

5404 break;

5405 case CEPH_MDS_SESSION_CLOSING:

5406 case CEPH_MDS_SESSION_NEW:

5407 case CEPH_MDS_SESSION_RESTARTING:

5408 case CEPH_MDS_SESSION_CLOSED:

5409 case CEPH_MDS_SESSION_REJECTED:

5410 return false;

5411 }

5412

5413 return true;

5414 }

5415

5416 /*

5417 * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,

5418 * then we need to retransmit that request.

5419 */

5420 void inc_session_sequence(struct ceph_mds_session *s)

5421 {

5422 struct ceph_client *cl = s->s_mdsc->fsc->client;

5423

5424 lockdep_assert_held(&s->s_mutex);

5425

5426 s->s_seq++;

5427

5428 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {

5429 int ret;

5430

5431 doutc(cl, "resending session close request for mds%d\n", s->s_mds);

5432 ret = request_close_session(s);

5433 if (ret < 0)

5434 pr_err_client(cl, "unable to close session to mds%d: %d\n",

5435 s->s_mds, ret);

5436 }

5437 }

5438

5439 /*

5440 * delayed work -- periodically trim expired leases, renew caps with mds. If

5441 * the @delay parameter is set to 0 or if it's more than 5 secs, the default

5442 * workqueue delay value of 5 secs will be used.

5443 */

5444 static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)

5445 {

5446 unsigned long max_delay = HZ * 5;

5447

5448 /* 5 secs default delay */

5449 if (!delay || (delay > max_delay))

5450 delay = max_delay;

5451 schedule_delayed_work(&mdsc->delayed_work,

5452 round_jiffies_relative(delay));

5453 }

5454

5455 static void delayed_work(struct work_struct *work)

5456 {

5457 struct ceph_mds_client *mdsc =

5458 container_of(work, struct ceph_mds_client, delayed_work.work);

5459 unsigned long delay;

5460 int renew_interval;

5461 int renew_caps;

5462 int i;

5463

5464 doutc(mdsc->fsc->client, "mdsc delayed_work\n");

5465

5466 if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)

5467 return;

5468

5469 mutex_lock(&mdsc->mutex);

5470 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;

5471 renew_caps = time_after_eq(jiffies, HZ*renew_interval +

5472 mdsc->last_renew_caps);

5473 if (renew_caps)

5474 mdsc->last_renew_caps = jiffies;

5475

5476 for (i = 0; i < mdsc->max_sessions; i++) {

5477 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);

5478 if (!s)

5479 continue;

5480

5481 if (!check_session_state(s)) {

5482 ceph_put_mds_session(s);

5483 continue;

5484 }

5485 mutex_unlock(&mdsc->mutex);

5486

5487 ceph_flush_session_cap_releases(mdsc, s);

5488

5489 mutex_lock(&s->s_mutex);

5490 if (renew_caps)

5491 send_renew_caps(mdsc, s);

5492 else

5493 ceph_con_keepalive(&s->s_con);

5494 if (s->s_state == CEPH_MDS_SESSION_OPEN ||

5495 s->s_state == CEPH_MDS_SESSION_HUNG)

5496 ceph_send_cap_releases(mdsc, s);

5497 mutex_unlock(&s->s_mutex);

5498 ceph_put_mds_session(s);

5499

5500 mutex_lock(&mdsc->mutex);

5501 }

5502 mutex_unlock(&mdsc->mutex);

5503

5504 delay = ceph_check_delayed_caps(mdsc);

5505

5506 ceph_queue_cap_reclaim_work(mdsc);

5507

5508 ceph_trim_snapid_map(mdsc);

5509

5510 maybe_recover_session(mdsc);

5511

5512 schedule_delayed(mdsc, delay);

5513 }

5514

5515 int ceph_mdsc_init(struct ceph_fs_client *fsc)

5516

5517 {

5518 struct ceph_mds_client *mdsc;

5519 int err;

5520

5521 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);

5522 if (!mdsc)

5523 return -ENOMEM;

5524 mdsc->fsc = fsc;

5525 mutex_init(&mdsc->mutex);

5526 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);

5527 if (!mdsc->mdsmap) {

5528 err = -ENOMEM;

5529 goto err_mdsc;

5530 }

5531

5532 init_completion(&mdsc->safe_umount_waiters);

5533 spin_lock_init(&mdsc->stopping_lock);

5534 atomic_set(&mdsc->stopping_blockers, 0);

5535 init_completion(&mdsc->stopping_waiter);

5536 atomic64_set(&mdsc->dirty_folios, 0);

5537 init_waitqueue_head(&mdsc->flush_end_wq);

5538 init_waitqueue_head(&mdsc->session_close_wq);

5539 INIT_LIST_HEAD(&mdsc->waiting_for_map);

5540 mdsc->quotarealms_inodes = RB_ROOT;

5541 mutex_init(&mdsc->quotarealms_inodes_mutex);

5542 init_rwsem(&mdsc->snap_rwsem);

5543 mdsc->snap_realms = RB_ROOT;

5544 INIT_LIST_HEAD(&mdsc->snap_empty);

5545 spin_lock_init(&mdsc->snap_empty_lock);

5546 mdsc->request_tree = RB_ROOT;

5547 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);

5548 mdsc->last_renew_caps = jiffies;

5549 INIT_LIST_HEAD(&mdsc->cap_delay_list);

5550 #ifdef CONFIG_DEBUG_FS

5551 INIT_LIST_HEAD(&mdsc->cap_wait_list);

5552 #endif

5553 spin_lock_init(&mdsc->cap_delay_lock);

5554 INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);

5555 INIT_LIST_HEAD(&mdsc->snap_flush_list);

5556 spin_lock_init(&mdsc->snap_flush_lock);

5557 mdsc->last_cap_flush_tid = 1;

5558 INIT_LIST_HEAD(&mdsc->cap_flush_list);

5559 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);

5560 spin_lock_init(&mdsc->cap_dirty_lock);

5561 init_waitqueue_head(&mdsc->cap_flushing_wq);

5562 INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);

5563 INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);

5564 err = ceph_metric_init(&mdsc->metric);

5565 if (err)

5566 goto err_mdsmap;

5567

5568 spin_lock_init(&mdsc->dentry_list_lock);

5569 INIT_LIST_HEAD(&mdsc->dentry_leases);

5570 INIT_LIST_HEAD(&mdsc->dentry_dir_leases);

5571

5572 ceph_caps_init(mdsc);

5573 ceph_adjust_caps_max_min(mdsc, fsc->mount_options);

5574

5575 spin_lock_init(&mdsc->snapid_map_lock);

5576 mdsc->snapid_map_tree = RB_ROOT;

5577 INIT_LIST_HEAD(&mdsc->snapid_map_lru);

5578

5579 init_rwsem(&mdsc->pool_perm_rwsem);

5580 mdsc->pool_perm_tree = RB_ROOT;

5581

5582 strscpy(mdsc->nodename, utsname()->nodename,

5583 sizeof(mdsc->nodename));

5584

5585 fsc->mdsc = mdsc;

5586 return 0;

5587

5588 err_mdsmap:

5589 kfree(mdsc->mdsmap);

5590 err_mdsc:

5591 kfree(mdsc);

5592 return err;

5593 }

5594

5595 /*

5596 * Wait for safe replies on open mds requests. If we time out, drop

5597 * all requests from the tree to avoid dangling dentry refs.

5598 */

5599 static void wait_requests(struct ceph_mds_client *mdsc)

5600 {

5601 struct ceph_client *cl = mdsc->fsc->client;

5602 struct ceph_options *opts = mdsc->fsc->client->options;

5603 struct ceph_mds_request *req;

5604

5605 mutex_lock(&mdsc->mutex);

5606 if (__get_oldest_req(mdsc)) {

5607 mutex_unlock(&mdsc->mutex);

5608

5609 doutc(cl, "waiting for requests\n");

5610 wait_for_completion_timeout(&mdsc->safe_umount_waiters,

5611 ceph_timeout_jiffies(opts->mount_timeout));

5612

5613 /* tear down remaining requests */

5614 mutex_lock(&mdsc->mutex);

5615 while ((req = __get_oldest_req(mdsc))) {

5616 doutc(cl, "timed out on tid %llu\n", req->r_tid);

5617 list_del_init(&req->r_wait);

5618 __unregister_request(mdsc, req);

5619 }

5620 }

5621 mutex_unlock(&mdsc->mutex);

5622 doutc(cl, "done\n");

5623 }

5624

5625 void send_flush_mdlog(struct ceph_mds_session *s)

5626 {

5627 struct ceph_client *cl = s->s_mdsc->fsc->client;

5628 struct ceph_msg *msg;

5629

5630 /*

5631 * Pre-luminous MDS crashes when it sees an unknown session request

5632 */

5633 if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))

5634 return;

5635

5636 mutex_lock(&s->s_mutex);

5637 doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n",

5638 s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);

5639 msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,

5640 s->s_seq);

5641 if (!msg) {

5642 pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n",

5643 s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);

5644 } else {

5645 ceph_con_send(&s->s_con, msg);

5646 }

5647 mutex_unlock(&s->s_mutex);

5648 }

5649

5650 static int ceph_mds_auth_match(struct ceph_mds_client *mdsc,

5651 struct ceph_mds_cap_auth *auth,

5652 const struct cred *cred,

5653 char *tpath)

5654 {

5655 u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);

5656 u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);

5657 struct ceph_client *cl = mdsc->fsc->client;

5658 const char *fs_name = mdsc->fsc->mount_options->mds_namespace;

5659 const char *spath = mdsc->fsc->mount_options->server_path;

5660 bool gid_matched = false;

5661 u32 gid, tlen, len;

5662 int i, j;

5663

5664 doutc(cl, "fsname check fs_name=%s match.fs_name=%s\n",

5665 fs_name, auth->match.fs_name ? auth->match.fs_name : "");

5666 if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) {

5667 /* fsname mismatch, try next one */

5668 return 0;

5669 }

5670

5671 doutc(cl, "match.uid %lld\n", auth->match.uid);

5672 if (auth->match.uid != MDS_AUTH_UID_ANY) {

5673 if (auth->match.uid != caller_uid)

5674 return 0;

5675 if (auth->match.num_gids) {

5676 for (i = 0; i < auth->match.num_gids; i++) {

5677 if (caller_gid == auth->match.gids[i])

5678 gid_matched = true;

5679 }

5680 if (!gid_matched && cred->group_info->ngroups) {

5681 for (i = 0; i < cred->group_info->ngroups; i++) {

5682 gid = from_kgid(&init_user_ns,

5683 cred->group_info->gid[i]);

5684 for (j = 0; j < auth->match.num_gids; j++) {

5685 if (gid == auth->match.gids[j]) {

5686 gid_matched = true;

5687 break;

5688 }

5689 }

5690 if (gid_matched)

5691 break;

5692 }

5693 }

5694 if (!gid_matched)

5695 return 0;

5696 }

5697 }

5698

5699 /* path match */

5700 if (auth->match.path) {

5701 if (!tpath)

5702 return 0;

5703

5704 tlen = strlen(tpath);

5705 len = strlen(auth->match.path);

5706 if (len) {

5707 char *_tpath = tpath;

5708 bool free_tpath = false;

5709 int m, n;

5710

5711 doutc(cl, "server path %s, tpath %s, match.path %s\n",

5712 spath, tpath, auth->match.path);

5713 if (spath && (m = strlen(spath)) != 1) {

5714 /* mount path + '/' + tpath + an extra space */

5715 n = m + 1 + tlen + 1;

5716 _tpath = kmalloc(n, GFP_NOFS);

5717 if (!_tpath)

5718 return -ENOMEM;

5719 /* remove the leading '/' */

5720 snprintf(_tpath, n, "%s/%s", spath + 1, tpath);

5721 free_tpath = true;

5722 tlen = strlen(_tpath);

5723 }

5724

5725 /*

5726 * Please note the tailing '/' for match.path has already

5727 * been removed when parsing.

5728 *

5729 * Remove the tailing '/' for the target path.

5730 */

5731 while (tlen && _tpath[tlen - 1] == '/') {

5732 _tpath[tlen - 1] = '\0';

5733 tlen -= 1;

5734 }

5735 doutc(cl, "_tpath %s\n", _tpath);

5736

5737 /*

5738 * In case first == _tpath && tlen == len:

5739 * match.path=/foo --> /foo _path=/foo --> match

5740 * match.path=/foo/ --> /foo _path=/foo --> match

5741 *

5742 * In case first == _tmatch.path && tlen > len:

5743 * match.path=/foo/ --> /foo _path=/foo/ --> match

5744 * match.path=/foo --> /foo _path=/foo/ --> match

5745 * match.path=/foo/ --> /foo _path=/foo/d --> match

5746 * match.path=/foo --> /foo _path=/food --> mismatch

5747 *

5748 * All the other cases --> mismatch

5749 */

5750 bool path_matched = true;

5751 char *first = strstr(_tpath, auth->match.path);

5752 if (first != _tpath ||

5753 (tlen > len && _tpath[len] != '/')) {

5754 path_matched = false;

5755 }

5756

5757 if (free_tpath)

5758 kfree(_tpath);

5759

5760 if (!path_matched)

5761 return 0;

5762 }

5763 }

5764

5765 doutc(cl, "matched\n");

5766 return 1;

5767 }

5768

5769 int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask)

5770 {

5771 const struct cred *cred = get_current_cred();

5772 u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);

5773 u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);

5774 struct ceph_mds_cap_auth *rw_perms_s = NULL;

5775 struct ceph_client *cl = mdsc->fsc->client;

5776 bool root_squash_perms = true;

5777 int i, err;

5778

5779 doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n",

5780 tpath, mask, caller_uid, caller_gid);

5781

5782 for (i = 0; i < mdsc->s_cap_auths_num; i++) {

5783 struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i];

5784

5785 err = ceph_mds_auth_match(mdsc, s, cred, tpath);

5786 if (err < 0) {

5787 put_cred(cred);

5788 return err;

5789 } else if (err > 0) {

5790 /* always follow the last auth caps' permission */

5791 root_squash_perms = true;

5792 rw_perms_s = NULL;

5793 if ((mask & MAY_WRITE) && s->writeable &&

5794 s->match.root_squash && (!caller_uid || !caller_gid))

5795 root_squash_perms = false;

5796

5797 if (((mask & MAY_WRITE) && !s->writeable) ||

5798 ((mask & MAY_READ) && !s->readable))

5799 rw_perms_s = s;

5800 }

5801 }

5802

5803 put_cred(cred);

5804

5805 doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms,

5806 rw_perms_s);

5807 if (root_squash_perms && rw_perms_s == NULL) {

5808 doutc(cl, "access allowed\n");

5809 return 0;

5810 }

5811

5812 if (!root_squash_perms) {

5813 doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write",

5814 caller_uid, caller_gid);

5815 }

5816 if (rw_perms_s) {

5817 doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d",

5818 rw_perms_s->readable, rw_perms_s->writeable,

5819 !!(mask & MAY_READ), !!(mask & MAY_WRITE));

5820 }

5821 doutc(cl, "access denied\n");

5822 return -EACCES;

5823 }

5824

5825 /*

5826 * called before mount is ro, and before dentries are torn down.

5827 * (hmm, does this still race with new lookups?)

5828 */

5829 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)

5830 {

5831 doutc(mdsc->fsc->client, "begin\n");

5832 mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;

5833

5834 ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);

5835 ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);

5836 ceph_flush_dirty_caps(mdsc);

5837 wait_requests(mdsc);

5838

5839 /*

5840 * wait for reply handlers to drop their request refs and

5841 * their inode/dcache refs

5842 */

5843 ceph_msgr_flush();

5844

5845 ceph_cleanup_quotarealms_inodes(mdsc);

5846 doutc(mdsc->fsc->client, "done\n");

5847 }

5848

5849 /*

5850 * flush the mdlog and wait for all write mds requests to flush.

5851 */

5852 static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,

5853 u64 want_tid)

5854 {

5855 struct ceph_client *cl = mdsc->fsc->client;

5856 struct ceph_mds_request *req = NULL, *nextreq;

5857 struct ceph_mds_session *last_session = NULL;

5858 struct rb_node *n;

5859

5860 mutex_lock(&mdsc->mutex);

5861 doutc(cl, "want %lld\n", want_tid);

5862 restart:

5863 req = __get_oldest_req(mdsc);

5864 while (req && req->r_tid <= want_tid) {

5865 /* find next request */

5866 n = rb_next(&req->r_node);

5867 if (n)

5868 nextreq = rb_entry(n, struct ceph_mds_request, r_node);

5869 else

5870 nextreq = NULL;

5871 if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&

5872 (req->r_op & CEPH_MDS_OP_WRITE)) {

5873 struct ceph_mds_session *s = req->r_session;

5874

5875 if (!s) {

5876 req = nextreq;

5877 continue;

5878 }

5879

5880 /* write op */

5881 ceph_mdsc_get_request(req);

5882 if (nextreq)

5883 ceph_mdsc_get_request(nextreq);

5884 s = ceph_get_mds_session(s);

5885 mutex_unlock(&mdsc->mutex);

5886

5887 /* send flush mdlog request to MDS */

5888 if (last_session != s) {

5889 send_flush_mdlog(s);

5890 ceph_put_mds_session(last_session);

5891 last_session = s;

5892 } else {

5893 ceph_put_mds_session(s);

5894 }

5895 doutc(cl, "wait on %llu (want %llu)\n",

5896 req->r_tid, want_tid);

5897 wait_for_completion(&req->r_safe_completion);

5898

5899 mutex_lock(&mdsc->mutex);

5900 ceph_mdsc_put_request(req);

5901 if (!nextreq)

5902 break; /* next dne before, so we're done! */

5903 if (RB_EMPTY_NODE(&nextreq->r_node)) {

5904 /* next request was removed from tree */

5905 ceph_mdsc_put_request(nextreq);

5906 goto restart;

5907 }

5908 ceph_mdsc_put_request(nextreq); /* won't go away */

5909 }

5910 req = nextreq;

5911 }

5912 mutex_unlock(&mdsc->mutex);

5913 ceph_put_mds_session(last_session);

5914 doutc(cl, "done\n");

5915 }

5916

5917 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)

5918 {

5919 struct ceph_client *cl = mdsc->fsc->client;

5920 u64 want_tid, want_flush;

5921

5922 if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)

5923 return;

5924

5925 doutc(cl, "sync\n");

5926 mutex_lock(&mdsc->mutex);

5927 want_tid = mdsc->last_tid;

5928 mutex_unlock(&mdsc->mutex);

5929

5930 ceph_flush_dirty_caps(mdsc);

5931 ceph_flush_cap_releases(mdsc);

5932 spin_lock(&mdsc->cap_dirty_lock);

5933 want_flush = mdsc->last_cap_flush_tid;

5934 if (!list_empty(&mdsc->cap_flush_list)) {

5935 struct ceph_cap_flush *cf =

5936 list_last_entry(&mdsc->cap_flush_list,

5937 struct ceph_cap_flush, g_list);

5938 cf->wake = true;

5939 }

5940 spin_unlock(&mdsc->cap_dirty_lock);

5941

5942 doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);

5943

5944 flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);

5945 wait_caps_flush(mdsc, want_flush);

5946 }

5947

5948 /*

5949 * true if all sessions are closed, or we force unmount

5950 */

5951 static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)

5952 {

5953 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)

5954 return true;

5955 return atomic_read(&mdsc->num_sessions) <= skipped;

5956 }

5957

5958 /*

5959 * called after sb is ro or when metadata corrupted.

5960 */

5961 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)

5962 {

5963 struct ceph_options *opts = mdsc->fsc->client->options;

5964 struct ceph_client *cl = mdsc->fsc->client;

5965 struct ceph_mds_session *session;

5966 int i;

5967 int skipped = 0;

5968

5969 doutc(cl, "begin\n");

5970

5971 /* close sessions */

5972 mutex_lock(&mdsc->mutex);

5973 for (i = 0; i < mdsc->max_sessions; i++) {

5974 session = __ceph_lookup_mds_session(mdsc, i);

5975 if (!session)

5976 continue;

5977 mutex_unlock(&mdsc->mutex);

5978 mutex_lock(&session->s_mutex);

5979 if (__close_session(mdsc, session) <= 0)

5980 skipped++;

5981 mutex_unlock(&session->s_mutex);

5982 ceph_put_mds_session(session);

5983 mutex_lock(&mdsc->mutex);

5984 }

5985 mutex_unlock(&mdsc->mutex);

5986

5987 doutc(cl, "waiting for sessions to close\n");

5988 wait_event_timeout(mdsc->session_close_wq,

5989 done_closing_sessions(mdsc, skipped),

5990 ceph_timeout_jiffies(opts->mount_timeout));

5991

5992 /* tear down remaining sessions */

5993 mutex_lock(&mdsc->mutex);

5994 for (i = 0; i < mdsc->max_sessions; i++) {

5995 if (mdsc->sessions[i]) {

5996 session = ceph_get_mds_session(mdsc->sessions[i]);

5997 __unregister_session(mdsc, session);

5998 mutex_unlock(&mdsc->mutex);

5999 mutex_lock(&session->s_mutex);

6000 remove_session_caps(session);

6001 mutex_unlock(&session->s_mutex);

6002 ceph_put_mds_session(session);

6003 mutex_lock(&mdsc->mutex);

6004 }

6005 }

6006 WARN_ON(!list_empty(&mdsc->cap_delay_list));

6007 mutex_unlock(&mdsc->mutex);

6008

6009 ceph_cleanup_snapid_map(mdsc);

6010 ceph_cleanup_global_and_empty_realms(mdsc);

6011

6012 cancel_work_sync(&mdsc->cap_reclaim_work);

6013 cancel_work_sync(&mdsc->cap_unlink_work);

6014 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */

6015

6016 doutc(cl, "done\n");

6017 }

6018

6019 void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)

6020 {

6021 struct ceph_mds_session *session;

6022 int mds;

6023

6024 doutc(mdsc->fsc->client, "force umount\n");

6025

6026 mutex_lock(&mdsc->mutex);

6027 for (mds = 0; mds < mdsc->max_sessions; mds++) {

6028 session = __ceph_lookup_mds_session(mdsc, mds);

6029 if (!session)

6030 continue;

6031

6032 if (session->s_state == CEPH_MDS_SESSION_REJECTED)

6033 __unregister_session(mdsc, session);

6034 __wake_requests(mdsc, &session->s_waiting);

6035 mutex_unlock(&mdsc->mutex);

6036

6037 mutex_lock(&session->s_mutex);

6038 __close_session(mdsc, session);

6039 if (session->s_state == CEPH_MDS_SESSION_CLOSING) {

6040 cleanup_session_requests(mdsc, session);

6041 remove_session_caps(session);

6042 }

6043 mutex_unlock(&session->s_mutex);

6044 ceph_put_mds_session(session);

6045

6046 mutex_lock(&mdsc->mutex);

6047 kick_requests(mdsc, mds);

6048 }

6049 __wake_requests(mdsc, &mdsc->waiting_for_map);

6050 mutex_unlock(&mdsc->mutex);

6051 }

6052

6053 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)

6054 {

6055 doutc(mdsc->fsc->client, "stop\n");

6056 /*

6057 * Make sure the delayed work stopped before releasing

6058 * the resources.

6059 *

6060 * Because the cancel_delayed_work_sync() will only

6061 * guarantee that the work finishes executing. But the

6062 * delayed work will re-arm itself again after that.

6063 */

6064 flush_delayed_work(&mdsc->delayed_work);

6065

6066 if (mdsc->mdsmap)

6067 ceph_mdsmap_destroy(mdsc->mdsmap);

6068 kfree(mdsc->sessions);

6069 ceph_caps_finalize(mdsc);

6070

6071 if (mdsc->s_cap_auths) {

6072 int i;

6073

6074 for (i = 0; i < mdsc->s_cap_auths_num; i++) {

6075 kfree(mdsc->s_cap_auths[i].match.gids);

6076 kfree(mdsc->s_cap_auths[i].match.path);

6077 kfree(mdsc->s_cap_auths[i].match.fs_name);

6078 }

6079 kfree(mdsc->s_cap_auths);

6080 }

6081

6082 ceph_pool_perm_destroy(mdsc);

6083 }

6084

6085 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)

6086 {

6087 struct ceph_mds_client *mdsc = fsc->mdsc;

6088 doutc(fsc->client, "%p\n", mdsc);

6089

6090 if (!mdsc)

6091 return;

6092

6093 /* flush out any connection work with references to us */

6094 ceph_msgr_flush();

6095

6096 ceph_mdsc_stop(mdsc);

6097

6098 ceph_metric_destroy(&mdsc->metric);

6099

6100 fsc->mdsc = NULL;

6101 kfree(mdsc);

6102 doutc(fsc->client, "%p done\n", mdsc);

6103 }

6104

6105 void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)

6106 {

6107 struct ceph_fs_client *fsc = mdsc->fsc;

6108 struct ceph_client *cl = fsc->client;

6109 const char *mds_namespace = fsc->mount_options->mds_namespace;

6110 void *p = msg->front.iov_base;

6111 void *end = p + msg->front.iov_len;

6112 u32 epoch;

6113 u32 num_fs;

6114 u32 mount_fscid = (u32)-1;

6115 int err = -EINVAL;

6116

6117 ceph_decode_need(&p, end, sizeof(u32), bad);

6118 epoch = ceph_decode_32(&p);

6119

6120 doutc(cl, "epoch %u\n", epoch);

6121

6122 /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */

6123 ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);

6124

6125 ceph_decode_32_safe(&p, end, num_fs, bad);

6126 while (num_fs-- > 0) {

6127 void *info_p, *info_end;

6128 u32 info_len;

6129 u32 fscid, namelen;

6130

6131 ceph_decode_need(&p, end, 2 + sizeof(u32), bad);

6132 p += 2; // info_v, info_cv

6133 info_len = ceph_decode_32(&p);

6134 ceph_decode_need(&p, end, info_len, bad);

6135 info_p = p;

6136 info_end = p + info_len;

6137 p = info_end;

6138

6139 ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);

6140 fscid = ceph_decode_32(&info_p);

6141 namelen = ceph_decode_32(&info_p);

6142 ceph_decode_need(&info_p, info_end, namelen, bad);

6143

6144 if (mds_namespace &&

6145 strlen(mds_namespace) == namelen &&

6146 !strncmp(mds_namespace, (char *)info_p, namelen)) {

6147 mount_fscid = fscid;

6148 break;

6149 }

6150 }

6151

6152 ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);

6153 if (mount_fscid != (u32)-1) {

6154 fsc->client->monc.fs_cluster_id = mount_fscid;

6155 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,

6156 0, true);

6157 ceph_monc_renew_subs(&fsc->client->monc);

6158 } else {

6159 err = -ENOENT;

6160 goto err_out;

6161 }

6162 return;

6163

6164 bad:

6165 pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n",

6166 err);

6167 ceph_umount_begin(mdsc->fsc->sb);

6168 ceph_msg_dump(msg);

6169 err_out:

6170 mutex_lock(&mdsc->mutex);

6171 mdsc->mdsmap_err = err;

6172 __wake_requests(mdsc, &mdsc->waiting_for_map);

6173 mutex_unlock(&mdsc->mutex);

6174 }

6175

6176 /*

6177 * handle mds map update.

6178 */

6179 void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)

6180 {

6181 struct ceph_client *cl = mdsc->fsc->client;

6182 u32 epoch;

6183 u32 maplen;

6184 void *p = msg->front.iov_base;

6185 void *end = p + msg->front.iov_len;

6186 struct ceph_mdsmap *newmap, *oldmap;

6187 struct ceph_fsid fsid;

6188 int err = -EINVAL;

6189

6190 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);

6191 ceph_decode_copy(&p, &fsid, sizeof(fsid));

6192 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)

6193 return;

6194 epoch = ceph_decode_32(&p);

6195 maplen = ceph_decode_32(&p);

6196 doutc(cl, "epoch %u len %d\n", epoch, (int)maplen);

6197

6198 /* do we need it? */

6199 mutex_lock(&mdsc->mutex);

6200 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {

6201 doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch);

6202 mutex_unlock(&mdsc->mutex);

6203 return;

6204 }

6205

6206 newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));

6207 if (IS_ERR(newmap)) {

6208 err = PTR_ERR(newmap);

6209 goto bad_unlock;

6210 }

6211

6212 /* swap into place */

6213 if (mdsc->mdsmap) {

6214 oldmap = mdsc->mdsmap;

6215 mdsc->mdsmap = newmap;

6216 check_new_map(mdsc, newmap, oldmap);

6217 ceph_mdsmap_destroy(oldmap);

6218 } else {

6219 mdsc->mdsmap = newmap; /* first mds map */

6220 }

6221 mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,

6222 MAX_LFS_FILESIZE);

6223

6224 __wake_requests(mdsc, &mdsc->waiting_for_map);

6225 ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,

6226 mdsc->mdsmap->m_epoch);

6227

6228 mutex_unlock(&mdsc->mutex);

6229 schedule_delayed(mdsc, 0);

6230 return;

6231

6232 bad_unlock:

6233 mutex_unlock(&mdsc->mutex);

6234 bad:

6235 pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n",

6236 err);

6237 ceph_umount_begin(mdsc->fsc->sb);

6238 ceph_msg_dump(msg);

6239 return;

6240 }

6241

6242 static struct ceph_connection *mds_get_con(struct ceph_connection *con)

6243 {

6244 struct ceph_mds_session *s = con->private;

6245

6246 if (ceph_get_mds_session(s))

6247 return con;

6248 return NULL;

6249 }

6250

6251 static void mds_put_con(struct ceph_connection *con)

6252 {

6253 struct ceph_mds_session *s = con->private;

6254

6255 ceph_put_mds_session(s);

6256 }

6257

6258 /*

6259 * if the client is unresponsive for long enough, the mds will kill

6260 * the session entirely.

6261 */

6262 static void mds_peer_reset(struct ceph_connection *con)

6263 {

6264 struct ceph_mds_session *s = con->private;

6265 struct ceph_mds_client *mdsc = s->s_mdsc;

6266

6267 pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",

6268 s->s_mds);

6269 if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&

6270 ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)

6271 send_mds_reconnect(mdsc, s);

6272 }

6273

6274 static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)

6275 {

6276 struct ceph_mds_session *s = con->private;

6277 struct ceph_mds_client *mdsc = s->s_mdsc;

6278 struct ceph_client *cl = mdsc->fsc->client;

6279 int type = le16_to_cpu(msg->hdr.type);

6280

6281 mutex_lock(&mdsc->mutex);

6282 if (__verify_registered_session(mdsc, s) < 0) {

6283 mutex_unlock(&mdsc->mutex);

6284 goto out;

6285 }

6286 mutex_unlock(&mdsc->mutex);

6287

6288 switch (type) {

6289 case CEPH_MSG_MDS_MAP:

6290 ceph_mdsc_handle_mdsmap(mdsc, msg);

6291 break;

6292 case CEPH_MSG_FS_MAP_USER:

6293 ceph_mdsc_handle_fsmap(mdsc, msg);

6294 break;

6295 case CEPH_MSG_CLIENT_SESSION:

6296 handle_session(s, msg);

6297 break;

6298 case CEPH_MSG_CLIENT_REPLY:

6299 handle_reply(s, msg);

6300 break;

6301 case CEPH_MSG_CLIENT_REQUEST_FORWARD:

6302 handle_forward(mdsc, s, msg);

6303 break;

6304 case CEPH_MSG_CLIENT_CAPS:

6305 ceph_handle_caps(s, msg);

6306 break;

6307 case CEPH_MSG_CLIENT_SNAP:

6308 ceph_handle_snap(mdsc, s, msg);

6309 break;

6310 case CEPH_MSG_CLIENT_LEASE:

6311 handle_lease(mdsc, s, msg);

6312 break;

6313 case CEPH_MSG_CLIENT_QUOTA:

6314 ceph_handle_quota(mdsc, s, msg);

6315 break;

6316

6317 default:

6318 pr_err_client(cl, "received unknown message type %d %s\n",

6319 type, ceph_msg_type_name(type));

6320 }

6321 out:

6322 ceph_msg_put(msg);

6323 }

6324

6325 /*

6326 * authentication

6327 */

6328

6329 /*

6330 * Note: returned pointer is the address of a structure that's

6331 * managed separately. Caller must *not* attempt to free it.

6332 */

6333 static struct ceph_auth_handshake *

6334 mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)

6335 {

6336 struct ceph_mds_session *s = con->private;

6337 struct ceph_mds_client *mdsc = s->s_mdsc;

6338 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;

6339 struct ceph_auth_handshake *auth = &s->s_auth;

6340 int ret;

6341

6342 ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,

6343 force_new, proto, NULL, NULL);

6344 if (ret)

6345 return ERR_PTR(ret);

6346

6347 return auth;

6348 }

6349

6350 static int mds_add_authorizer_challenge(struct ceph_connection *con,

6351 void *challenge_buf, int challenge_buf_len)

6352 {

6353 struct ceph_mds_session *s = con->private;

6354 struct ceph_mds_client *mdsc = s->s_mdsc;

6355 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;

6356

6357 return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,

6358 challenge_buf, challenge_buf_len);

6359 }

6360

6361 static int mds_verify_authorizer_reply(struct ceph_connection *con)

6362 {

6363 struct ceph_mds_session *s = con->private;

6364 struct ceph_mds_client *mdsc = s->s_mdsc;

6365 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;

6366 struct ceph_auth_handshake *auth = &s->s_auth;

6367

6368 return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,

6369 auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,

6370 NULL, NULL, NULL, NULL);

6371 }

6372

6373 static int mds_invalidate_authorizer(struct ceph_connection *con)

6374 {

6375 struct ceph_mds_session *s = con->private;

6376 struct ceph_mds_client *mdsc = s->s_mdsc;

6377 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;

6378

6379 ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);

6380

6381 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);

6382 }

6383

6384 static int mds_get_auth_request(struct ceph_connection *con,

6385 void *buf, int *buf_len,

6386 void **authorizer, int *authorizer_len)

6387 {

6388 struct ceph_mds_session *s = con->private;

6389 struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;

6390 struct ceph_auth_handshake *auth = &s->s_auth;

6391 int ret;

6392

6393 ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,

6394 buf, buf_len);

6395 if (ret)

6396 return ret;

6397

6398 *authorizer = auth->authorizer_buf;

6399 *authorizer_len = auth->authorizer_buf_len;

6400 return 0;

6401 }

6402

6403 static int mds_handle_auth_reply_more(struct ceph_connection *con,

6404 void *reply, int reply_len,

6405 void *buf, int *buf_len,

6406 void **authorizer, int *authorizer_len)

6407 {

6408 struct ceph_mds_session *s = con->private;

6409 struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;

6410 struct ceph_auth_handshake *auth = &s->s_auth;

6411 int ret;

6412

6413 ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,

6414 buf, buf_len);

6415 if (ret)

6416 return ret;

6417

6418 *authorizer = auth->authorizer_buf;

6419 *authorizer_len = auth->authorizer_buf_len;

6420 return 0;

6421 }

6422

6423 static int mds_handle_auth_done(struct ceph_connection *con,

6424 u64 global_id, void *reply, int reply_len,

6425 u8 *session_key, int *session_key_len,

6426 u8 *con_secret, int *con_secret_len)

6427 {

6428 struct ceph_mds_session *s = con->private;

6429 struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;

6430 struct ceph_auth_handshake *auth = &s->s_auth;

6431

6432 return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,

6433 session_key, session_key_len,

6434 con_secret, con_secret_len);

6435 }

6436

6437 static int mds_handle_auth_bad_method(struct ceph_connection *con,

6438 int used_proto, int result,

6439 const int *allowed_protos, int proto_cnt,

6440 const int *allowed_modes, int mode_cnt)

6441 {

6442 struct ceph_mds_session *s = con->private;

6443 struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;

6444 int ret;

6445

6446 if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,

6447 used_proto, result,

6448 allowed_protos, proto_cnt,

6449 allowed_modes, mode_cnt)) {

6450 ret = ceph_monc_validate_auth(monc);

6451 if (ret)

6452 return ret;

6453 }

6454

6455 return -EACCES;

6456 }

6457

6458 static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,

6459 struct ceph_msg_header *hdr, int *skip)

6460 {

6461 struct ceph_msg *msg;

6462 int type = (int) le16_to_cpu(hdr->type);

6463 int front_len = (int) le32_to_cpu(hdr->front_len);

6464

6465 if (con->in_msg)

6466 return con->in_msg;

6467

6468 *skip = 0;

6469 msg = ceph_msg_new(type, front_len, GFP_NOFS, false);

6470 if (!msg) {

6471 pr_err("unable to allocate msg type %d len %d\n",

6472 type, front_len);

6473 return NULL;

6474 }

6475

6476 return msg;

6477 }

6478

6479 static int mds_sign_message(struct ceph_msg *msg)

6480 {

6481 struct ceph_mds_session *s = msg->con->private;

6482 struct ceph_auth_handshake *auth = &s->s_auth;

6483

6484 return ceph_auth_sign_message(auth, msg);

6485 }

6486

6487 static int mds_check_message_signature(struct ceph_msg *msg)

6488 {

6489 struct ceph_mds_session *s = msg->con->private;

6490 struct ceph_auth_handshake *auth = &s->s_auth;

6491

6492 return ceph_auth_check_message_signature(auth, msg);

6493 }

6494

6495 static const struct ceph_connection_operations mds_con_ops = {

6496 .get = mds_get_con,

6497 .put = mds_put_con,

6498 .alloc_msg = mds_alloc_msg,

6499 .dispatch = mds_dispatch,

6500 .peer_reset = mds_peer_reset,

6501 .get_authorizer = mds_get_authorizer,

6502 .add_authorizer_challenge = mds_add_authorizer_challenge,

6503 .verify_authorizer_reply = mds_verify_authorizer_reply,

6504 .invalidate_authorizer = mds_invalidate_authorizer,

6505 .sign_message = mds_sign_message,

6506 .check_message_signature = mds_check_message_signature,

6507 .get_auth_request = mds_get_auth_request,

6508 .handle_auth_reply_more = mds_handle_auth_reply_more,

6509 .handle_auth_done = mds_handle_auth_done,

6510 .handle_auth_bad_method = mds_handle_auth_bad_method,

6511 };

6512

6513 /* eof */